├── .github └── workflows │ ├── release.yaml │ ├── validation-python.yaml │ └── validation-rust.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── Pipfile ├── README-py.md ├── README.md ├── benches ├── findings.md ├── logbench.py └── results │ ├── v0.2.2-27-g87c32f5_2022-12-25_2135.bench │ ├── v0.3.1-12-g0d89be1_2022-12-31_0726.bench │ ├── v0.3.3-3-g1e7d5fd_2023-01-02_0118.bench │ └── v0.4.1-16-g72dc016_2023-10-30_0539.bench ├── book.toml ├── book_src ├── SUMMARY.md ├── installation.md ├── introduction.md └── usage.md ├── clippy.toml ├── dictionaries └── README.md ├── en_US.license ├── release.toml ├── rustfmt.toml ├── test-suite ├── update_dictionaries.py ├── zspell-cli ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── src │ ├── cli │ │ └── mod.rs │ ├── download.rs │ ├── main.rs │ └── spelling │ │ └── mod.rs └── tests │ ├── cli_dict.rs │ ├── cli_lev.rs │ └── files │ ├── de_res.txt │ └── sample-index.json ├── zspell-py ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── docs │ ├── Makefile │ ├── conf.py │ ├── index.rst │ ├── make.bat │ └── requirements.txt ├── pyproject.toml ├── python │ └── zspell │ │ ├── __init__.py │ │ ├── py.types │ │ └── zspell.pyi ├── src │ └── lib.rs └── tests │ └── test_basic.py └── zspell ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── datastructure.rs ├── dict_integration.rs ├── slice_contains.rs ├── small_map.rs └── word_splitter.rs ├── build.rs ├── src ├── affix.rs ├── affix │ ├── node.rs │ ├── parse.rs │ ├── rule.rs │ ├── tests.rs │ ├── tests_parse.rs │ └── types.rs ├── dict.rs ├── dict │ ├── flags.rs │ ├── meta.rs │ ├── parse.rs │ ├── rule.rs │ ├── rules_apply.rs │ ├── rules_reverse.rs │ ├── tests.rs │ ├── tests_parse.rs │ └── tests_rule.rs ├── error.rs ├── helpers.rs ├── lib.rs ├── meta.rs ├── morph.rs ├── suggestions.rs ├── system.rs └── system │ └── tests.rs ├── test-suite ├── 0-example.test ├── 
b-affix-forward-gen-num-flags.test ├── b-affix-forward-gen.test ├── b-flag-long.test ├── b-nosuggest-forbid.test ├── b-stemming-morph.test ├── h-circumfix.test ├── h-ignore-sug.test ├── h-ignore-utf.test ├── h-keepcase.test ├── h-korean.test ├── h-limit-multiple-compounding.test ├── h-map-utf.test ├── h-map.test ├── h-morph.test ├── h-needaffix.test ├── h-needaffix2.test ├── h-needaffix3.test ├── h-needaffix4.test ├── h-needaffix5.test ├── h-nepali.test ├── h-nosuggest.test ├── h-oconv.test ├── h-slash.test ├── h-timelimit.test ├── h-utf8.test ├── h-utfcoumpound.test ├── h-zeroaffix.test ├── i071-number-affixes.test └── i093-separate-dict-afx-flags.test ├── test-util ├── Cargo.toml └── src │ └── lib.rs └── tests ├── files ├── odyssey.txt ├── tortoise_hare_misspelled.txt ├── w1_eng_short.aff └── w1_eng_short.dic └── suite.rs /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Release 3 | 4 | on: 5 | push: 6 | tags: 7 | - 'v*' 8 | 9 | jobs: 10 | # Allow our jobs to block on validation steps 11 | validation_rust: 12 | uses: ./.github/workflows/validation-rust.yaml 13 | 14 | validation_py: 15 | uses: ./.github/workflows/validation-python.yaml 16 | 17 | rust_release: 18 | name: "build & deploy ${{ matrix.build }} binaries" 19 | needs: [validation_rust, validation_py] 20 | runs-on: ${{ matrix.os }} 21 | strategy: 22 | matrix: 23 | include: 24 | - build: linux 25 | os: ubuntu-latest 26 | target: x86_64-unknown-linux-gnu 27 | # target: x86_64-unknown-linux-musl 28 | extension: '' 29 | # Unsuccessful compilation; try on local 30 | # - build: linux-arm 31 | # os: ubuntu-latest 32 | # target: arm-unknown-linux-gnueabihf 33 | # extension: '' 34 | - build: macos 35 | os: macos-latest 36 | target: x86_64-apple-darwin 37 | extension: '' 38 | - build: windows-msvc 39 | os: windows-latest 40 | target: x86_64-pc-windows-msvc 41 | extension: .exe 42 | env: 43 | CARGO: cargo 44 | TARGET_DIR: ./target 45 | 
TARGET_FLAGS: "" 46 | 47 | steps: 48 | # Retrieve git files 49 | - uses: actions/checkout@v4 50 | - uses: dtolnay/rust-toolchain@stable 51 | with: 52 | targets: ${{ matrix.target }} 53 | - uses: Swatinem/rust-cache@v2 54 | # Debugging aid 55 | - name: Show commands 56 | run: | 57 | echo pwd: 58 | pwd 59 | echo ls: 60 | ls 61 | echo "cargo command is: ${{ env.CARGO }}" 62 | echo "target flag is: ${{ env.TARGET_FLAGS }}" 63 | echo "target dir is: ${{ env.TARGET_DIR }}" 64 | # Perform build 65 | - name: Build binary 66 | uses: actions-rs/cargo@v1 67 | with: 68 | command: build 69 | # We only want to build zspell-cli, not plain zspell or zspell-py (does not support cdylib) 70 | args: --package zspell-cli --release --verbose --target ${{ matrix.target }} 71 | - name: Show output 72 | run: | 73 | ls target 74 | ls "target/${{ matrix.target }}" 75 | ls "target/${{ matrix.target }}/release" 76 | # Create .zip or .tar.gz file 77 | - name: Build archive 78 | shell: bash 79 | run: | 80 | echo '\nWorking directory:' && pwd 81 | echo '\nls:' && ls 82 | 83 | # outdir="$(ci/cargo-out-dir "${{ env.TARGET_DIR }}")" 84 | # outdir=target/release/${{ steps.get_repository_name.outputs.REPOSITORY_NAME }}${{ matrix.extension }} 85 | # Find the output directory with the latest timestamp 86 | cargo_outdir="$(find "${{ env.TARGET_DIR }}" -name zspell-stamp -print0 | xargs -0 ls -t | head -n1 | xargs dirname)" 87 | ref_name=${GITHUB_REF##*/} 88 | 89 | # Remove leading 'v' for use where needed 90 | ref_name_stripped=$(echo $ref_name | perl -0pe 's/^v//') 91 | echo "\nRef name: \"$ref_name\"" 92 | echo "\nRef name stripped: \"$ref_name_stripped\"" 93 | 94 | staging="zspell-$ref_name-${{ matrix.target }}" 95 | mkdir -p "$staging"/{completion,doc} 96 | 97 | # Remove the "unreleased" section from our changelog 98 | perl -0777 -i -pe "s/(<\!-- next-header -->.*## \[Unreleased\].*?\n)(?=## |<\!--)//gms" CHANGELOG.md 99 | 100 | cp {README.md,LICENSE} "$staging/" 101 | cp CHANGELOG.md "$staging/doc/"
102 | cp "$cargo_outdir"/zspell.1 "$staging/doc" 103 | cp "$cargo_outdir"/{_zspell,_zspell.ps1,zspell.bash,zspell.elv,zspell.fish} "$staging/completion" 104 | 105 | # Build RNOTES.md, which we will use for our Github release (not shipped in zip) 106 | # Select the release notes from our latest version only 107 | perl -0777 -ne "print /(## \[$ref_name_stripped\].*?\n)(?=\n*^(?:## |<\!--))/gms" CHANGELOG.md > RNOTES.md 108 | # Select the diff URL for this version only 109 | perl -0777 -ne "print /\n\[$ref_name_stripped\]:.*?\n/gms" CHANGELOG.md >> RNOTES.md 110 | 111 | echo "Release notes:" && cat RNOTES.md 112 | 113 | if [ "${{ matrix.os }}" = "windows-latest" ]; then 114 | cp "target/${{ matrix.target }}/release/zspell.exe" "$staging/" 115 | 7z a "$staging.zip" "$staging" 116 | echo "ASSET=$staging.zip" >> $GITHUB_ENV 117 | else 118 | cp "target/${{ matrix.target }}/release/zspell" "$staging/" 119 | tar czf "$staging.tar.gz" "$staging" 120 | echo "ASSET=$staging.tar.gz" >> $GITHUB_ENV 121 | fi 122 | # Upload to github 123 | - name: Release 124 | uses: softprops/action-gh-release@v1 125 | if: startsWith(github.ref, 'refs/tags/') 126 | env: 127 | GITHUB_REPOSITORY: pluots/zspell 128 | with: 129 | body_path: RNOTES.md 130 | # note you'll typically need to create a personal access token 131 | # with permissions to create releases in the other repo 132 | token: ${{ secrets.GITHUB_TOKEN }} 133 | files: | 134 | ${{ env.ASSET }} 135 | 136 | linux_wheels: 137 | runs-on: ubuntu-latest 138 | needs: [validation_rust, validation_py] 139 | steps: 140 | - uses: actions/checkout@v4 141 | - name: build libc wheels 142 | uses: messense/maturin-action@v1 143 | with: 144 | manylinux: auto 145 | command: build 146 | # container default is manylinux 147 | args: --release -o dist -i 3.7 3.8 3.9 3.10 3.11 3.12 --manifest-path zspell-py/Cargo.toml 148 | - name: build musl wheels 149 | uses: messense/maturin-action@v1 150 | with: 151 | target: x86_64-unknown-linux-musl 152 | manylinux: 
musllinux_1_1 153 | command: build 154 | args: --release -o dist -i 3.7 3.8 3.9 3.10 3.11 3.12 --manifest-path zspell-py/Cargo.toml 155 | - name: upload wheels 156 | uses: actions/upload-artifact@v2 157 | with: 158 | name: wheels 159 | path: dist 160 | 161 | windows_wheels: 162 | runs-on: windows-latest 163 | needs: [validation_rust, validation_py] 164 | steps: 165 | - uses: actions/checkout@v4 166 | - uses: messense/maturin-action@v1 167 | with: 168 | command: build 169 | # FIXME: python 3.12 not yet available on windows runners 170 | args: --release -o dist -i 3.7 3.8 3.9 3.10 3.11 --manifest-path zspell-py/Cargo.toml 171 | - name: upload wheels 172 | uses: actions/upload-artifact@v2 173 | with: 174 | name: wheels 175 | path: dist 176 | 177 | macos_wheels: 178 | runs-on: macos-latest 179 | needs: [validation_rust, validation_py] 180 | steps: 181 | - uses: actions/checkout@v4 182 | - uses: messense/maturin-action@v1 183 | with: 184 | command: build 185 | args: --release -o dist -i 3.7 3.8 3.9 3.10 3.11 3.12 --universal2 --manifest-path zspell-py/Cargo.toml 186 | - name: upload wheels 187 | uses: actions/upload-artifact@v2 188 | with: 189 | name: wheels 190 | path: dist 191 | 192 | release_all_wheels: 193 | name: Release wheels 194 | runs-on: ubuntu-latest 195 | needs: [linux_wheels, macos_wheels, windows_wheels] 196 | steps: 197 | - uses: actions/download-artifact@v2 198 | with: 199 | name: wheels 200 | - name: Publish to PyPI 201 | uses: messense/maturin-action@v1 202 | env: 203 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 204 | with: 205 | command: upload 206 | args: --skip-existing * 207 | 208 | deploy_book: 209 | runs-on: ubuntu-latest 210 | needs: [validation_rust, validation_py] 211 | steps: 212 | - uses: actions/checkout@v4 213 | with: 214 | fetch-depth: 0 215 | - name: Install mdbook 216 | run: | 217 | mkdir mdbook 218 | curl -sSL https://github.com/rust-lang/mdBook/releases/download/v0.4.14/mdbook-v0.4.14-x86_64-unknown-linux-gnu.tar.gz \ 219 | | 
tar -xz --directory=./mdbook 220 | echo `pwd`/mdbook >> $GITHUB_PATH 221 | - name: Deploy GitHub Pages 222 | run: | 223 | # This assumes your book is in the root of your repository. 224 | # Just add a `cd` here if you need to change to another directory. 225 | mdbook build 226 | git worktree add gh-pages 227 | git config user.name "Deploy from CI" 228 | git config user.email "" 229 | cd gh-pages 230 | # Delete the ref to avoid keeping history. 231 | git update-ref -d refs/heads/gh-pages 232 | rm -rf * 233 | mv ../book/* . 234 | git add . 235 | git commit -m "Deploy $GITHUB_SHA to gh-pages" 236 | git push --force --set-upstream origin gh-pages 237 | -------------------------------------------------------------------------------- /.github/workflows/validation-python.yaml: -------------------------------------------------------------------------------- 1 | name: Python Validation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | workflow_call: # allow this to be run from other workflows 9 | 10 | jobs: 11 | verify: 12 | name: pytest 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/cache@v3 17 | with: 18 | path: ~/.local/share/virtualenvs 19 | key: ${{ runner.os }}-pipenv-${{ hashFiles('**/Pipfile.lock') }} 20 | - uses: dtolnay/rust-toolchain@stable 21 | - uses: Swatinem/rust-cache@v2 22 | - uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.11' 25 | - name: Install pipenv 26 | run: python -m pip install --upgrade pipenv wheel 27 | - name: Install dependencies 28 | run: | 29 | pipenv install --dev 30 | pipenv run maturin develop --manifest-path zspell-py/Cargo.toml 31 | - name: Pytest 32 | run: pipenv run pytest 33 | - name: Validate docs 34 | run: pipenv run make -C zspell-py/docs html 35 | -------------------------------------------------------------------------------- /.github/workflows/validation-rust.yaml: -------------------------------------------------------------------------------- 1 
| --- 2 | name: Rust Validation 3 | 4 | on: 5 | push: 6 | branches: 7 | - main 8 | pull_request: 9 | workflow_call: # allow this to be run from other workflows 10 | 11 | env: 12 | RUSTDOCFLAGS: -D warnings 13 | RUSTFLAGS: -D warnings 14 | RUST_BACKTRACE: 1 15 | CARGO_UNSTABLE_SPARSE_REGISTRY: true 16 | 17 | jobs: 18 | clippy: 19 | name: clippy 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: dtolnay/rust-toolchain@beta 24 | with: 25 | components: clippy 26 | - uses: Swatinem/rust-cache@v2 27 | - run: cargo clippy --all-features --all-targets -- -D warnings 28 | - run: cargo clippy --no-default-features --all-targets --features unstable-bench -- -D warnings 29 | 30 | min_versions: 31 | name: check minimum rustc version 32 | runs-on: ubuntu-latest 33 | env: 34 | # we don't care about unused with these feature configurations 35 | RUSTFLAGS: -A unused 36 | steps: 37 | - uses: actions/checkout@v4 38 | - uses: dtolnay/rust-toolchain@1.65 39 | - uses: Swatinem/rust-cache@v2 40 | # check only zspell; we can use later for the CLI 41 | - run: cargo check -p zspell --all-features 42 | - run: cargo check -p zspell --no-default-features --features unstable-bench 43 | 44 | test: 45 | strategy: 46 | fail-fast: true 47 | matrix: 48 | os: [ubuntu-latest, windows-latest, macos-latest] 49 | include: 50 | - os: ubuntu-latest 51 | name: linux 52 | coverage: true 53 | - os: windows-latest 54 | name: windows 55 | - os: macos-latest 56 | name: mac 57 | name: "test on ${{ matrix.name }}" 58 | runs-on: ${{ matrix.os }} 59 | steps: 60 | - uses: actions/checkout@v4 61 | - name: List files 62 | run: | 63 | pwd 64 | ls 65 | - uses: dtolnay/rust-toolchain@nightly 66 | with: 67 | components: ${{ matrix.coverage && 'llvm-tools-preview' || '' }} 68 | - uses: taiki-e/install-action@cargo-llvm-cov 69 | - name: Install nextest 70 | uses: taiki-e/install-action@nextest 71 | - uses: Swatinem/rust-cache@v2 72 | with: 73 | key: ${{ matrix.os }} 74 | - name: run tests 
without coverage 75 | if: ${{ ! matrix.coverage }} 76 | run: | 77 | cargo nextest run 78 | cargo test --doc 79 | - name: run tests with coverage 80 | if: ${{ matrix.coverage }} 81 | # nextest can't run doctests so we run coverage on those separately 82 | # and do a combined report 83 | run: | 84 | cargo llvm-cov --no-report nextest 85 | cargo llvm-cov --no-report --doc 86 | cargo llvm-cov report --doctests --lcov --output-path lcov.info 87 | - name: Upload coverage data to codecov 88 | if: ${{ matrix.coverage }} 89 | uses: codecov/codecov-action@v3 90 | env: 91 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 92 | with: 93 | fail_ci_if_error: false 94 | files: lcov.info 95 | 96 | sanitizers: 97 | name: Test with leak sanitizer 98 | runs-on: ubuntu-latest 99 | env: 100 | RUSTFLAGS: -Zsanitizer=leak 101 | steps: 102 | - uses: actions/checkout@v4 103 | - uses: dtolnay/rust-toolchain@nightly 104 | with: 105 | components: rust-src 106 | - name: Install nextest 107 | uses: taiki-e/install-action@nextest 108 | - uses: Swatinem/rust-cache@v2 109 | - run: > 110 | cargo nextest run -p zspell 111 | --target=x86_64-unknown-linux-gnu 112 | -Zbuild-std 113 | - run: > 114 | cargo test -p zspell --doc 115 | --target=x86_64-unknown-linux-gnu 116 | -Zbuild-std 117 | 118 | miri: 119 | name: Miri 120 | runs-on: ubuntu-latest 121 | # basically only run this if we're sure we don't cancel since it eats so much CPU 122 | needs: ["clippy", "test", "fmt", "doc"] 123 | env: 124 | # Can't interact with files in isolation 125 | MIRIFLAGS: -Zmiri-disable-isolation 126 | steps: 127 | - uses: actions/checkout@v4 128 | - uses: dtolnay/rust-toolchain@nightly 129 | with: 130 | components: miri 131 | - name: Install nextest 132 | uses: taiki-e/install-action@nextest 133 | - uses: Swatinem/rust-cache@v2 134 | - name: Run Miri 135 | # Miri is _slow_ for our use case, only run a few comprehensive tests 136 | run: > 137 | cargo miri nextest run -p zspell -E ' 138 | test(=test_stemming_morph) + 139 | 
test(=test_pfxsfx) 140 | ' 141 | 142 | fmt: 143 | name: formatting 144 | runs-on: ubuntu-latest 145 | steps: 146 | - uses: actions/checkout@v4 147 | - uses: dtolnay/rust-toolchain@nightly 148 | with: 149 | components: rustfmt 150 | - uses: Swatinem/rust-cache@v2 151 | - run: cargo fmt --all -- --check 152 | - uses: actions/setup-python@v3 153 | - name: Validate pre-commit 154 | uses: pre-commit/action@v3.0.0 155 | 156 | doc: 157 | name: docs 158 | runs-on: ubuntu-latest 159 | steps: 160 | - uses: actions/checkout@v4 161 | - uses: dtolnay/rust-toolchain@nightly 162 | - uses: Swatinem/rust-cache@v2 163 | - run: cargo doc 164 | 165 | book: 166 | name: book 167 | runs-on: ubuntu-latest 168 | steps: 169 | - uses: actions/checkout@v4 170 | - uses: dtolnay/rust-toolchain@nightly 171 | - uses: Swatinem/rust-cache@v2 172 | - run: | 173 | mkdir -p ~/mdbook 174 | # Tar is weird with ~ as home 175 | curl -sSL https://github.com/rust-lang/mdBook/releases/download/v0.4.21/mdbook-v0.4.21-x86_64-unknown-linux-gnu.tar.gz \ 176 | | tar -xz --directory=$(echo ~)/mdbook 177 | - run: ~/mdbook/mdbook test 178 | 179 | # Make sure we turned the clippy lint off 180 | verify_fixme_critical: 181 | name: verify critical fixmes 182 | runs-on: ubuntu-latest 183 | steps: 184 | - uses: actions/checkout@v4 185 | - run: grep -r "FIXME:CRIT" --exclude-dir="target" --exclude-dir=".git" --exclude="validation-rust.yaml" && exit 1 || exit 0 186 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | dictionaries/*.dic 3 | dictionaries/*.aff 4 | dictionaries/*.license 5 | dictionaries/scowl* 6 | tests/files/odyssey.txt 7 | /book 8 | .docker-cargo 9 | 10 | Pipfile.lock 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | .pytest_cache/ 15 | *.py[cod] 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | .venv/ 23 | 
env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | include/ 34 | man/ 35 | venv/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | pip-selfcheck.json 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | 53 | # Translations 54 | *.mo 55 | 56 | # Mr Developer 57 | .mr.developer.cfg 58 | .project 59 | .pydevproject 60 | 61 | # Rope 62 | .ropeproject 63 | 64 | # Django stuff: 65 | *.log 66 | *.pot 67 | 68 | .DS_Store 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyCharm 74 | .idea/ 75 | 76 | # VSCode 77 | .vscode/ 78 | 79 | # Pyenv 80 | **/docs/_build 81 | .python-version 82 | 83 | # Profiling 84 | perf.data* 85 | callgrind.out* 86 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: check-toml 7 | - id: fix-byte-order-marker 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: mixed-line-ending 11 | - id: check-added-large-files 12 | args: ['--maxkb=600'] 13 | 14 | - repo: https://github.com/psf/black 15 | rev: 23.9.1 16 | hooks: 17 | - id: black 18 | 19 | - repo: local 20 | hooks: 21 | - id: cargo-fmt 22 | name: Cargo format 23 | language: system 24 | entry: cargo fmt 25 | args: ["--"] 26 | types_or: ["rust"] 27 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # 
Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | # Use the latest stable 12 | python: "3.11" 13 | rust: "1.61" 14 | 15 | python: 16 | install: 17 | - requirements: zspell-py/docs/requirements.txt 18 | - method: pip 19 | path: zspell-py/ 20 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | 5 | ## [Unreleased] - ReleaseDate 6 | 7 | ### Added 8 | 9 | ### Changed 10 | 11 | ### Removed 12 | 13 | 14 | 15 | ## [0.5.5] - 2024-06-13 16 | 17 | ### Changed 18 | 19 | - Fix the parsing of long (double ASCII) flags in 20 | [#109](https://github.com/pluots/zspell/pull/109) and 21 | [#111](https://github.com/pluots/zspell/pull/111). 22 | 23 | 24 | ## [0.5.3] - 2023-12-13 25 | 26 | ### Changed 27 | 28 | - Fix duplicate flag issue loading the German dictionary from 29 | [#93](https://github.com/pluots/zspell/issues/93) 30 | 31 | 32 | ## [0.5.2] - 2023-10-30 33 | 34 | ### Added 35 | 36 | Add a `zspell::builder()` shortcut for `DictBuilder::new()`. 37 | 38 | 39 | ## [0.5.1] - 2023-10-30 40 | 41 | ### Additions 42 | 43 | Publish the work in progress command line interface under crate name 44 | `zspell-cli`. 45 | 46 | ### Changed 47 | 48 | - Add support for nonstandard morphological information types. 49 | - Fix issue where all morph information for an affix rule were being reported, 50 | rather than just that for the relevant pattern. Fixes 51 | [#73](https://github.com/pluots/zspell/issues/73). 52 | - Rewrite the `.dic` file parser. 53 | - Improve handling of morphological information in dictionary files. 54 | 55 | ## [0.5.0] - 2023-10-30 56 | 57 | Immediately superseded release, see 0.5.1. 58 | 59 | ## [0.4.1] - 2023-10-18 60 | 61 | ### Additions 62 | 63 | - Add `Dictionary::{entry, entries}` which allow stemming and morphological 64 | analysis.
65 | 66 | ### Fixed 67 | 68 | - Corrected error message for parsing affix headers 69 | - Corrected pattern matching for groups that include hyphens 70 | - Default no longer has a nosuggest flag 71 | - Change output directory to use cargo directory rather than source 72 | 73 | ### Changed 74 | 75 | - Remove features `unstable-analysis` and `unstable-stem` since functionality is 76 | now public 77 | - Wordlist now correctly applies more than one affix rule if it is available 78 | - Moved `DictBuilder::config` behind `zspell-unstable` 79 | - [build] update CI workflows 80 | - [internal] make some changes from `TryFrom` to `FromStr` 81 | - [internal] refactor test system to support stemming and morphological analysis 82 | 83 | ## [0.4.0] - 2023-10-18 84 | 85 | Immediately superseded release, see 0.4.1. 86 | 87 | ## [0.3.3] - 2023-01-01 88 | 89 | ### Changed 90 | 91 | - [build] update python release workflow 92 | 93 | ## [0.3.2] - 2023-01-01 94 | 95 | ### Changed 96 | 97 | - `.dic` parser now ignores lines that start with a tab (sometimes used for 98 | comments) 99 | - Updated python documentation 100 | 101 | ## [0.3.1] - 2022-12-30 102 | 103 | Minor patch to build system workflow 104 | 105 | ## [0.3.0] - 2022-12-30 106 | 107 | This change is a huge rewrite of the library! Hopefully this will pave the way 108 | forward for more features and easier growth. 109 | 110 | ### Changes 111 | 112 | - Added `DictBuilder` to simplify dictionary creation 113 | - Removed `affix::Config` as the representation was limiting & clunky 114 | - The methods on `Dictionary` are now infallible since an uncompiled dictionary 115 | can no longer be created 116 | - Rewrote the `error` module 117 | - Simplified imports, everything needed is now top-level 118 | - Rewrote affix file parser so it is much more efficient and now handles all 119 | known keys. We do not yet act on all possible values.
120 | - Rewrote the dictionary & personal wordlist parsers 121 | 122 | ### Additions 123 | 124 | - `check_indices` is now available to return better information about the 125 | location of errors 126 | - Python modules now have correct bindings (hooray!) 127 | 128 | There are also a few new APIs that are feature gated. They should be considered 129 | very unstable until those feature gates are removed. 130 | 131 | - Suggestions 132 | - Stemming 133 | - Morphological analysis 134 | - System tools. These were previously public but have been moved behind the 135 | feature gate. 136 | 137 | ## [0.2.2] - 2022-11-04 138 | 139 | Minor bumps in the dependency list 140 | 141 | ## [0.2.1] - 2022-11-04 142 | 143 | ### Changes 144 | 145 | - Changed word breaking to use unicode segmentation, as suggested by 146 | @saona-raimundo 147 | 148 | ## [0.2.0] - 2022-11-04 149 | 150 | ### Additions 151 | 152 | - Ability to automatically locate dictionaries on the system, WIP and not yet 153 | documented 154 | - Command line option to download dictionaries 155 | 156 | ### Changes 157 | 158 | - Rename helper CLI and py crates (only relevant within this project) 159 | 160 | ## [0.1.4] - 2022-08-17 161 | 162 | ### Additions 163 | 164 | - Started generating manpages and autocomplete scripts on build 165 | - Started generating a documentation book for the CLI 166 | 167 | ### Changes 168 | 169 | - Better reserve & shrink vectors and hash sets to save a small amount of 170 | overhead 171 | 172 | ## [0.1.3] - 2022-08-16 173 | 174 | ### Changes 175 | 176 | - Correction to output generation 177 | 178 | ## [0.1.2] - 2022-08-16 179 | 180 | ### Additions 181 | 182 | - Framework for locating files on a user's local machine 183 | 184 | ### Changes 185 | 186 | - Updated binary output configuration 187 | 188 | ## [0.1.1] - 2022-07-25 189 | 190 | ### Changes 191 | 192 | - Updated wheel release configuration 193 | 194 | ## [0.1.0] - 2022-07-25 195 | 196 | ### Changes 197 | 198 | - Restructured project to
make all modules public that might be needed to 199 | interface with this library. 200 | - Restructuring to use `` for all functions that may error 201 | - Behind the scenes work to prepare for automatic dictionary location 202 | 203 | 204 | [Unreleased]: https://github.com/pluots/zspell/compare/v0.5.5...HEAD 205 | [0.5.5]: https://github.com/pluots/zspell/compare/v0.5.3...v0.5.5 206 | [0.5.3]: https://github.com/pluots/zspell/compare/v0.5.2...v0.5.3 207 | [0.5.2]: https://github.com/pluots/zspell/compare/v0.5.1...v0.5.2 208 | [0.5.1]: https://github.com/pluots/zspell/compare/v0.5.0...v0.5.1 209 | [0.5.0]: https://github.com/pluots/zspell/compare/v0.4.1...v0.5.0 210 | [0.4.1]: https://github.com/pluots/zspell/compare/v0.4.0...v0.4.1 211 | [0.4.0]: https://github.com/pluots/zspell/compare/v0.3.3...v0.4.0 212 | [0.3.3]: https://github.com/pluots/zspell/compare/v0.3.2...v0.3.3 213 | [0.3.2]: https://github.com/pluots/zspell/compare/v0.3.1...v0.3.2 214 | [0.3.1]: https://github.com/pluots/zspell/compare/v0.3.0...v0.3.1 215 | [0.3.0]: https://github.com/pluots/zspell/compare/v0.2.2...v0.3.0 216 | [0.2.2]: https://github.com/pluots/zspell/compare/v0.2.1...v0.2.2 217 | [0.2.1]: https://github.com/pluots/zspell/compare/v0.2.0...v0.2.1 218 | [0.2.0]: https://github.com/pluots/zspell/compare/v0.1.4...v0.2.0 219 | [0.1.4]: https://github.com/pluots/zspell/compare/v0.1.3...v0.1.4 220 | [0.1.3]: https://github.com/pluots/zspell/compare/v0.1.2...v0.1.3 221 | [0.1.2]: https://github.com/pluots/zspell/compare/v0.1.1...v0.1.2 222 | [0.1.1]: https://github.com/pluots/zspell/compare/v0.1.0...v0.1.1 223 | [0.1.0]: https://github.com/pluots/zspell/compare/v0.0.1...v0.1.0 224 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = [ 4 | "zspell", 5 | "zspell-py", 6 | "zspell-cli", 7 | "zspell/test-util", 8 | ] 9 | 10 | 
default-members = [ 11 | "zspell", 12 | "zspell-cli", 13 | ] 14 | 15 | # Build with `cargo build --profile=release-debug` 16 | # Alternatively `cargo bench --profile=release-debug` 17 | # Useful for profiling, not for official releases 18 | [profile.release-debug] 19 | # inherits = "release" 20 | inherits = "dev" 21 | opt-level = 3 22 | debug = true 23 | strip = "none" 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Trevor Gross 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | maturin = "1.3.0" 8 | sphinx = "7.2.6" 9 | furo = "2023.9.10" 10 | 11 | [dev-packages] 12 | black = "23.9.1" 13 | pytest = "7.4.2" 14 | m2r = "0.3.1" 15 | 16 | [requires] 17 | python_version = "3.11" 18 | -------------------------------------------------------------------------------- /README-py.md: -------------------------------------------------------------------------------- 1 | # ZSpell 2 | 3 | Python bindings for the Rust zspell library: a simple yet fast spellchecker. 
4 | 5 | To use this library, you will need a dictionary in the language of your choice. 6 | Many are available at the following repository: 7 | 8 | 9 | The full Python API documentation is available at 10 | 11 | ```py 12 | from zspell import Dictionary 13 | 14 | with open ("dictionaries/en_US.aff", "r") as f: 15 | config_str = f.read() 16 | with open ("dictionaries/en_US.dic", "r") as f: 17 | dict_str = f.read() 18 | d = Dictionary(config_str, dict_str) 19 | 20 | assert(d.check("Apples are good! Don't you think?")) 21 | assert(not d.check("Apples are baaaad")) 22 | ``` 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZSpell 2 | 3 | This project is a spellchecker written completely in Rust, that maintains 4 | compatibility with the venerable Hunspell dictionary format. It is entirely 5 | native and does not rely on any other backends (Enchant, Hunspell, Aspell, 6 | etc.). This library also has the goal of being usable via WASM. Full Unicode 7 | support is baked in. 8 | 9 | The library side has a stabilized checker, but the suggestion API is not yet 10 | finalized. The CLI is usable but not yet considered stabilized. See 11 | [Feature Status](#feature-status) for more information on what is available. 12 | 13 | Here are some useful quick links: 14 | 15 | - Crate info: 16 | - Crate CLI docs (incomplete): 17 | - Crate library docs: 18 | - Python library page: 19 | - Crate source: 20 | 21 | ## Interfaces 22 | 23 | This project exposes multiple interfaces to its spellchecker, listed in this 24 | section. 25 | 26 | ### Command Line Interface 27 | 28 | Just want to use this spellchecker from the command line? Check out the book, 29 | located here , for a more in-depth explanation 30 | of installation and usage. 31 | 32 | If you don't want to read further, the easiest way to get started is to download 33 | a prebuilt binary from here: .
34 | 35 | ### Rust Library Interface 36 | 37 | This project also aims to create a fully functional spellchecking library, for 38 | easy programmatic use. See the documentation for the library side here 39 | . This also includes a lot of design methodology 40 | discussions, for those who are interested. 41 | 42 | ### Python Interface 43 | 44 | There is a python wrapper for this library with prebuilt wheels, available here: 45 | . Its source is located in the 46 | [zspell-py crate](zspell-py). 47 | 48 | ### Usage via WASM 49 | 50 | The library API should work out of the box. Official WASM bindings will be added 51 | at some point. 52 | 53 | ## Feature Status 54 | 55 | | Feature | Available via Library | Available via CLI | Tracking Issue | 56 | | ------------------------------ | --------------------- | ----------------- | ------------------------------------------------- | 57 | | Basic spellcheck functionality | ✓ | ✓ | | 58 | | Forbidden word handling | ✓ | ✓ | [#17](https://github.com/pluots/zspell/issues/17) | 59 | | Stemming | ✓ | ✓ | | 60 | | Morph analysis | ✓ | ✓ | | 61 | | Suggestions | WIP | ✕ | [#16](https://github.com/pluots/zspell/issues/16) | 62 | | Compound word handling | ✕ | ✕ | | 63 | | Full Morph/Phone Handling | WIP | ✕ | | 64 | | Python Interface | Beta | N/A | [#18](https://github.com/pluots/zspell/issues/18) | 65 | | Prebuilt WASM bindings | ✕ | N/A | [#19](https://github.com/pluots/zspell/issues/19) | 66 | 67 | ## Performance 68 | 69 | This repository has the goal of highly prioritizing the most expected usage, 70 | i.e., that most words to be checked are correct. With optimizations based around 71 | this concept and with the modern computers now able to store entire compiled 72 | word lists in memory (~20 MiB), `zspell` tends to outperform other 73 | spellcheckers. 74 | 75 | ## MSRV 76 | 77 | This library relies on features from Rust 1.65, so that is our current minimum 78 | supported version. 
Our CI validates this for the library and examples. 79 | 80 | The CLI and test runner require newer features and do not keep a specific MSRV. 81 | 82 | ## Test suite 83 | 84 | This project keeps a test suite located in `zspell/test-suite` (symlinked to 85 | `test-suite`). Each file has a simple format that combines a simple affix and 86 | dictionary file. To add a test, just duplicate and edit `0-example.test`. 87 | 88 | File names are as follows: 89 | 90 | - `0-*`: meta tests that do not get run 91 | - `b-*`: basic functionality tests 92 | - `h-*`: tests that come from the Hunspell test suite 93 | - `i000-*`: tests that address specific issues 94 | 95 | ## License 96 | 97 | See the LICENSE file for license information. The provided license does allow 98 | for proprietary use and adaptation; that being said, I kindly suggest that if 99 | you come up with an improvement, you submit a pull request and help us all out 100 | :) 101 | 102 | ### Test suite license 103 | 104 | Some tests are taken from Hunspell's test suite. Hunspell has various licenses, 105 | we select MPL and include a SPDX notice on relevant files. 106 | 107 | ### Dictionary data license 108 | 109 | The dictionaries provided in this repository for testing purposed have been 110 | obtained under license. These files have been sourced from here: 111 | [https://github.com/wooorm/dictionaries](https://github.com/wooorm/dictionaries) 112 | 113 | These dictionaries are licensed under various licenses, different from that of 114 | this project. Please see the applicable `.license` file withing the 115 | `dictionaries/` directory. 
116 | -------------------------------------------------------------------------------- /benches/findings.md: -------------------------------------------------------------------------------- 1 | # Benchmark Findings 2 | 3 | Simple notes from benchmarks that have been run 4 | 5 | ## Collection types 6 | 7 | Four collections were compared; `Vec` (as a baseline), 8 | `std::collections::BTreeSet`, `std::collections::HashSet`, and 9 | `hashbrown::HashSet`. These were each tested on `.contains()` with values that 10 | did and did not exist, as well as 11 | 12 | ``` 13 | Vec BTreeSet std HashSet hashbrown HashSet 14 | contains 594 us 2.17 us 530 ns 279 ns 15 | not contains 1.91 us 2.40 us 436 ns 160 ns 16 | collect 18.3 us 301 us 204 us 120 us 17 | ``` 18 | 19 | The `HashSet` implementations significantly beat out other alternatives, and the 20 | `hashbrown` implementation outperformed `std::HashSet`. This is expected because 21 | `hashbrown` uses a faster hash that is not cryptographically secure (not a 22 | problem for our applications). 23 | 24 | For some reason, the improvements going from `std` to `hashbrown` don't really 25 | seem to show up for the dictionary integration tests. This will take some 26 | looking into. 27 | 28 | ## Slice `contains` vs. `binary_search` 29 | 30 | Overall, the price of sorting doesn't seem to have any payoff, especially for 31 | our use cases of short arrays. If it is already sorted then we can save time, 32 | about 20% on average. 33 | -------------------------------------------------------------------------------- /benches/logbench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Run `cargo bench`, print the output with CPU information to a timestamped 3 | file. 4 | 5 | Does not work on Windows (WSL works). 
6 | """ 7 | 8 | 9 | import platform 10 | import subprocess as sp 11 | import sys 12 | import time 13 | from datetime import datetime 14 | from inspect import cleandoc 15 | from pathlib import Path 16 | 17 | 18 | def decode_sp_out(b: bytes) -> str: 19 | return b.decode(errors="ignore").strip() 20 | 21 | 22 | def get_dtime() -> str: 23 | return datetime.utcnow().strftime(r"%Y-%m-%d_%H%M") 24 | 25 | 26 | def git_describe() -> str: 27 | args = ["git", "describe", "--always", "--tags"] 28 | return decode_sp_out(sp.check_output(args)) 29 | 30 | 31 | def get_fpath(dtime: str, describe: str) -> tuple[str, Path]: 32 | fname = f"{describe}_{dtime}.bench" 33 | fpath = Path(__file__).resolve().parents[0] / "results" / fname 34 | return (fname, fpath) 35 | 36 | 37 | def rustc_version() -> str: 38 | return decode_sp_out(sp.check_output(["rustc", "--version"])) 39 | 40 | 41 | def get_cpu_info() -> str: 42 | s = "" 43 | if platform.system() == "Darwin": 44 | cmd = ["sysctl", "-n", "machdep.cpu.brand_string"] 45 | s += decode_sp_out(sp.check_output(cmd)) 46 | else: 47 | tmp = decode_sp_out(sp.check_output("lscpu")) 48 | for line in tmp.splitlines(): 49 | if ( 50 | "Architecture" in line 51 | or "Model name" in line 52 | or "Socket" in line 53 | or "Thread" in line 54 | or "CPU(s)" in line 55 | or "MHz" in line 56 | ): 57 | s += line 58 | return s 59 | 60 | 61 | def main(): 62 | start_time = time.time() 63 | dtime = get_dtime() 64 | describe = git_describe() 65 | fname, fpath = get_fpath(dtime, describe) 66 | version = rustc_version() 67 | cpu_info = get_cpu_info() 68 | cmd = ["cargo", "bench", "--features", "unstable-bench"] 69 | cmd += sys.argv[1:] 70 | 71 | header_str = ( 72 | cleandoc( 73 | f""" 74 | {fname} 75 | 76 | Benchmark from {dtime} on commit {describe} 77 | {version} 78 | 79 | CPU Information: 80 | {cpu_info} 81 | 82 | Running: '{" ".join(cmd)}' 83 | """ 84 | ) 85 | + "\n\n\n" 86 | ) 87 | 88 | print(header_str) 89 | output = header_str 90 | 91 | with sp.Popen(cmd, 
stdout=sp.PIPE, bufsize=1, universal_newlines=True) as p: 92 | for line in p.stdout: 93 | print(line, end="") # process line here 94 | output += line 95 | 96 | if p.returncode != 0: 97 | print("\nCommand did not complete successfully") 98 | exit(p.returncode) 99 | 100 | end_time = time.time() 101 | elapsed_time = end_time - start_time 102 | time_str = f"\nTotal execution time: {time.strftime('%H:%M:%S', time.gmtime(elapsed_time))}" 103 | output += time_str 104 | print(time_str) 105 | print("\nWriting file '{fpath}'...", end="") 106 | 107 | with open(fpath, "w") as f: 108 | f.write(output) 109 | 110 | print(" Done!") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /benches/results/v0.2.2-27-g87c32f5_2022-12-25_2135.bench: -------------------------------------------------------------------------------- 1 | v0.2.2-27-g87c32f5_2022-12-25_2135.bench 2 | 3 | Benchmark from 2022-12-25_2135 on commit v0.2.2-27-g87c32f5 4 | rustc 1.68.0-nightly (b569c9dc5 2022-12-21) 5 | 6 | CPU Information: 7 | Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz 8 | 9 | Running: 'cargo bench --bench datastructure' 10 | 11 | 12 | Vec contains true time: [569.22 µs 583.36 µs 600.88 µs] 13 | change: [+13.517% +17.843% +22.890%] (p = 0.00 < 0.05) 14 | Performance has regressed. 15 | Found 2 outliers among 100 measurements (2.00%) 16 | 2 (2.00%) high severe 17 | 18 | Vec contains false time: [1.7272 ms 1.7367 ms 1.7486 ms] 19 | change: [-2.3433% -0.7517% +0.5590%] (p = 0.35 > 0.05) 20 | No change in performance detected. 21 | Found 6 outliers among 100 measurements (6.00%) 22 | 2 (2.00%) high mild 23 | 4 (4.00%) high severe 24 | 25 | BTree contains true time: [2.0813 µs 2.0988 µs 2.1284 µs] 26 | change: [-33.216% -18.618% -5.3123%] (p = 0.02 < 0.05) 27 | Performance has improved. 
28 | Found 5 outliers among 100 measurements (5.00%) 29 | 5 (5.00%) high severe 30 | 31 | BTree contains false time: [2.4196 µs 2.4301 µs 2.4494 µs] 32 | change: [-2.6558% -2.0096% -1.1657%] (p = 0.00 < 0.05) 33 | Performance has improved. 34 | Found 12 outliers among 100 measurements (12.00%) 35 | 2 (2.00%) high mild 36 | 10 (10.00%) high severe 37 | 38 | HashSet contains true time: [478.58 ns 479.76 ns 481.22 ns] 39 | change: [-1.7564% -1.4741% -1.2080%] (p = 0.00 < 0.05) 40 | Performance has improved. 41 | Found 12 outliers among 100 measurements (12.00%) 42 | 3 (3.00%) high mild 43 | 9 (9.00%) high severe 44 | 45 | HashSet contains false time: [393.25 ns 434.25 ns 490.96 ns] 46 | change: [-0.3539% +3.7872% +10.037%] (p = 0.19 > 0.05) 47 | No change in performance detected. 48 | Found 18 outliers among 100 measurements (18.00%) 49 | 2 (2.00%) high mild 50 | 16 (16.00%) high severe 51 | 52 | HashBrownSet contains true 53 | time: [329.93 ns 358.98 ns 394.22 ns] 54 | change: [+11.832% +15.944% +20.455%] (p = 0.00 < 0.05) 55 | Performance has regressed. 56 | Found 4 outliers among 100 measurements (4.00%) 57 | 1 (1.00%) high mild 58 | 3 (3.00%) high severe 59 | 60 | HashBrownSet contains false 61 | time: [209.08 ns 233.62 ns 266.77 ns] 62 | change: [+59.011% +105.51% +168.79%] (p = 0.00 < 0.05) 63 | Performance has regressed. 64 | Found 16 outliers among 100 measurements (16.00%) 65 | 8 (8.00%) high mild 66 | 8 (8.00%) high severe 67 | 68 | VecMap contains true time: [1.3709 ms 1.5246 ms 1.6924 ms] 69 | change: [+63.260% +80.742% +100.35%] (p = 0.00 < 0.05) 70 | Performance has regressed. 71 | Found 7 outliers among 100 measurements (7.00%) 72 | 6 (6.00%) high mild 73 | 1 (1.00%) high severe 74 | 75 | VecMap contains false time: [3.7792 ms 4.1292 ms 4.5332 ms] 76 | change: [+46.003% +60.872% +75.969%] (p = 0.00 < 0.05) 77 | Performance has regressed. 
78 | Found 4 outliers among 100 measurements (4.00%) 79 | 2 (2.00%) high mild 80 | 2 (2.00%) high severe 81 | 82 | VecMap get true time: [834.35 µs 884.64 µs 937.83 µs] 83 | change: [+15.380% +22.165% +29.051%] (p = 0.00 < 0.05) 84 | Performance has regressed. 85 | Found 7 outliers among 100 measurements (7.00%) 86 | 5 (5.00%) high mild 87 | 2 (2.00%) high severe 88 | 89 | VecMap get false time: [4.9732 ms 5.5763 ms 6.2481 ms] 90 | change: [+91.748% +117.22% +146.27%] (p = 0.00 < 0.05) 91 | Performance has regressed. 92 | Found 4 outliers among 100 measurements (4.00%) 93 | 3 (3.00%) high mild 94 | 1 (1.00%) high severe 95 | 96 | BTreeMap contains true time: [2.1304 µs 2.1587 µs 2.1911 µs] 97 | change: [+3.8901% +6.2983% +9.0957%] (p = 0.00 < 0.05) 98 | Performance has regressed. 99 | Found 7 outliers among 100 measurements (7.00%) 100 | 5 (5.00%) high mild 101 | 2 (2.00%) high severe 102 | 103 | BTreeMap contains false time: [3.3337 µs 4.1256 µs 4.9965 µs] 104 | change: [+18.633% +34.493% +54.500%] (p = 0.00 < 0.05) 105 | Performance has regressed. 106 | Found 17 outliers among 100 measurements (17.00%) 107 | 2 (2.00%) high mild 108 | 15 (15.00%) high severe 109 | 110 | BTreeMap get true time: [2.2010 µs 2.2381 µs 2.2866 µs] 111 | change: [-13.097% -7.2918% -1.8966%] (p = 0.01 < 0.05) 112 | Performance has improved. 113 | Found 4 outliers among 100 measurements (4.00%) 114 | 3 (3.00%) high mild 115 | 1 (1.00%) high severe 116 | 117 | BTreeMap get false time: [2.6190 µs 2.6996 µs 2.7880 µs] 118 | change: [-31.970% -14.791% +1.5950%] (p = 0.19 > 0.05) 119 | No change in performance detected. 120 | Found 5 outliers among 100 measurements (5.00%) 121 | 5 (5.00%) high mild 122 | 123 | HashMap contains true time: [521.98 ns 532.74 ns 544.49 ns] 124 | change: [-4.4057% +1.0033% +6.5691%] (p = 0.73 > 0.05) 125 | No change in performance detected. 
126 | Found 6 outliers among 100 measurements (6.00%) 127 | 3 (3.00%) high mild 128 | 3 (3.00%) high severe 129 | 130 | HashMap contains false time: [439.96 ns 506.63 ns 592.46 ns] 131 | change: [-14.677% -3.6708% +7.9234%] (p = 0.56 > 0.05) 132 | No change in performance detected. 133 | Found 13 outliers among 100 measurements (13.00%) 134 | 5 (5.00%) high mild 135 | 8 (8.00%) high severe 136 | 137 | HashMap get true time: [515.70 ns 522.12 ns 529.77 ns] 138 | change: [+8.3012% +13.486% +19.008%] (p = 0.00 < 0.05) 139 | Performance has regressed. 140 | Found 14 outliers among 100 measurements (14.00%) 141 | 6 (6.00%) high mild 142 | 8 (8.00%) high severe 143 | 144 | HashMap get false time: [418.13 ns 441.50 ns 478.80 ns] 145 | change: [-33.490% -21.735% -9.8900%] (p = 0.00 < 0.05) 146 | Performance has improved. 147 | Found 12 outliers among 100 measurements (12.00%) 148 | 7 (7.00%) high mild 149 | 5 (5.00%) high severe 150 | 151 | HashBrownMap contains true 152 | time: [287.48 ns 288.11 ns 288.81 ns] 153 | change: [-27.185% -21.547% -15.816%] (p = 0.00 < 0.05) 154 | Performance has improved. 155 | Found 17 outliers among 100 measurements (17.00%) 156 | 4 (4.00%) high mild 157 | 13 (13.00%) high severe 158 | 159 | HashBrownMap contains false 160 | time: [190.02 ns 233.14 ns 291.77 ns] 161 | change: [+15.667% +28.449% +44.790%] (p = 0.00 < 0.05) 162 | Performance has regressed. 
163 | Found 8 outliers among 100 measurements (8.00%) 164 | 3 (3.00%) high mild 165 | 5 (5.00%) high severe 166 | 167 | HashBrownMap get true time: [302.78 ns 336.35 ns 388.80 ns] 168 | Found 8 outliers among 100 measurements (8.00%) 169 | 5 (5.00%) high mild 170 | 3 (3.00%) high severe 171 | 172 | HashBrownMap get false time: [172.41 ns 175.16 ns 178.23 ns] 173 | Found 1 outliers among 100 measurements (1.00%) 174 | 1 (1.00%) high mild 175 | -------------------------------------------------------------------------------- /benches/results/v0.3.3-3-g1e7d5fd_2023-01-02_0118.bench: -------------------------------------------------------------------------------- 1 | v0.3.3-3-g1e7d5fd_2023-01-02_0118.bench 2 | 3 | Benchmark from 2023-01-02_0118 on commit v0.3.3-3-g1e7d5fd 4 | rustc 1.68.0-nightly (77429957a 2023-01-01) 5 | 6 | CPU Information: 7 | Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz 8 | 9 | Running: 'cargo bench --features benchmarking --bench dict_integration' 10 | 11 | 12 | Parse affix file time: [1.7233 ms 1.7281 ms 1.7355 ms] 13 | change: [-0.7695% +0.2918% +1.6412%] (p = 0.67 > 0.05) 14 | No change in performance detected. 15 | Found 15 outliers among 100 measurements (15.00%) 16 | 3 (3.00%) high mild 17 | 12 (12.00%) high severe 18 | 19 | Parse dict file time: [68.950 ms 69.097 ms 69.264 ms] 20 | change: [-1.8725% -1.4052% -0.9341%] (p = 0.00 < 0.05) 21 | Change within noise threshold. 22 | Found 4 outliers among 100 measurements (4.00%) 23 | 2 (2.00%) high mild 24 | 2 (2.00%) high severe 25 | 26 | Spellcheck: compile dictionary 27 | time: [289.23 ms 293.48 ms 300.70 ms] 28 | change: [+1.3808% +2.8849% +5.6673%] (p = 0.00 < 0.05) 29 | Performance has regressed. 30 | Found 5 outliers among 100 measurements (5.00%) 31 | 2 (2.00%) high mild 32 | 3 (3.00%) high severe 33 | 34 | Spellcheck: 1 correct word 35 | time: [177.69 ns 177.83 ns 178.01 ns] 36 | change: [-5.0761% -2.1543% -0.1512%] (p = 0.09 > 0.05) 37 | No change in performance detected. 
38 | Found 17 outliers among 100 measurements (17.00%) 39 | 6 (6.00%) high mild 40 | 11 (11.00%) high severe 41 | 42 | Spellcheck: 1 incorrect word 43 | time: [207.22 ns 207.87 ns 208.60 ns] 44 | change: [+2.3077% +2.8911% +3.6198%] (p = 0.00 < 0.05) 45 | Performance has regressed. 46 | Found 7 outliers among 100 measurements (7.00%) 47 | 4 (4.00%) high mild 48 | 3 (3.00%) high severe 49 | 50 | Spellcheck: 15 correct words 51 | time: [5.7807 µs 5.7842 µs 5.7888 µs] 52 | change: [+0.1442% +0.9362% +1.8589%] (p = 0.02 < 0.05) 53 | Change within noise threshold. 54 | Found 14 outliers among 100 measurements (14.00%) 55 | 4 (4.00%) high mild 56 | 10 (10.00%) high severe 57 | 58 | Spellcheck: 15 incorrect words 59 | time: [6.6361 µs 6.6398 µs 6.6443 µs] 60 | change: [-1.1998% -0.6706% -0.0092%] (p = 0.02 < 0.05) 61 | Change within noise threshold. 62 | Found 15 outliers among 100 measurements (15.00%) 63 | 4 (4.00%) high mild 64 | 11 (11.00%) high severe 65 | 66 | Spellcheck: 188 word paragraph 67 | time: [1.0548 µs 1.0564 µs 1.0585 µs] 68 | change: [+0.5199% +1.3356% +2.0162%] (p = 0.00 < 0.05) 69 | Change within noise threshold. 
70 | Found 13 outliers among 100 measurements (13.00%) 71 | 5 (5.00%) high mild 72 | 8 (8.00%) high severe 73 | 74 | 75 | Total execution time: 00:04:21 76 | -------------------------------------------------------------------------------- /book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Trevor Gross"] 3 | language = "en" 4 | multilingual = false 5 | src = "book_src" 6 | title = "The ZSpell Book" 7 | -------------------------------------------------------------------------------- /book_src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | - [Introduction](./introduction.md) 4 | - [Installation](./installation.md) 5 | - [Usage](./usage.md) 6 | -------------------------------------------------------------------------------- /book_src/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Installing a prebuilt binary 4 | 5 | The easiest way to get started is to download a prebuilt binary for your system. 6 | Binaries are avilable for for Windows, Linux, and Mac on the x86_64 platform. 7 | These do not require anything else to be installed. 8 | 9 | Head to and download the latest 10 | binary for your system. Simply extract the download and run the executable. 11 | 12 | If you would like the tool to be accessible from anywhere on your system, you 13 | will need to copy or link this executable to a location that is in your system 14 | path. 15 | 16 | ## Installing via Cargo 17 | 18 | If you already have rust installed and would like to install zspell via Cargo, 19 | this is fairly straightforward: 20 | 21 | ```sh 22 | cargo install zspell 23 | ``` 24 | 25 | ## Building from source 26 | 27 | If you would like to build the latest version (potentially unreleased) from 28 | source without installing (e.g. 
for development purposes), that can be done as 29 | follows: 30 | 31 | ```sh 32 | git clone https://github.com/pluots/zspell 33 | cd zspell 34 | cargo build --release 35 | ``` 36 | -------------------------------------------------------------------------------- /book_src/introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | For the time being, this book is still very much under construction. However, 4 | there is still some good information to be found in the available sections, so 5 | please feel free to take a look. 6 | -------------------------------------------------------------------------------- /book_src/usage.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | 3 | This program can be run with the following: 4 | 5 | ```sh 6 | zspell -d dictionaries/en_US 7 | ``` 8 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | # for performance, we always want to use HashBrown 2 | disallowed-types = ["std::collections::HashMap", "std::collections::HashSet"] 3 | doc-valid-idents = ["ZSpell"] 4 | -------------------------------------------------------------------------------- /dictionaries/README.md: -------------------------------------------------------------------------------- 1 | These dictionary files come from Titus Wormer's repository here: 2 | [https://github.com/wooorm/dictionaries](https://github.com/wooorm/dictionaries) 3 | 4 | They have been automatically collected. 5 | 6 | These dictionaries are licensed under various licenses, different from that of 7 | this project. Please see the applicable `.license` file withing this directory. 
8 | -------------------------------------------------------------------------------- /release.toml: -------------------------------------------------------------------------------- 1 | allow-branch = ["main"] 2 | shared-version = true 3 | # Single commit for all crates since we are in one repo 4 | consolidate-commits = true 5 | tag-name = "v{{version}}" 6 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | imports_granularity = "Module" 2 | newline_style = "Unix" 3 | group_imports = "StdExternalCrate" 4 | format_code_in_doc_comments = true 5 | format_macro_bodies = true 6 | format_macro_matchers = true 7 | -------------------------------------------------------------------------------- /test-suite: -------------------------------------------------------------------------------- 1 | zspell/test-suite/ -------------------------------------------------------------------------------- /update_dictionaries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """This script downloads files from the github `wooorm/dictionaries` repository. 4 | 5 | Be sure to obey licensing. 
6 | """ 7 | 8 | import argparse 9 | import base64 10 | import json 11 | import os 12 | import urllib.request 13 | from dataclasses import dataclass 14 | from typing import Any 15 | 16 | # Path to directory with all dictionaries 17 | ROOT_GH_URL = "https://api.github.com/repos/wooorm/dictionaries/contents/dictionaries" 18 | 19 | 20 | @dataclass 21 | class AuthInfo: 22 | """Login information""" 23 | 24 | username: str 25 | token: str 26 | 27 | 28 | @dataclass 29 | class LangDict: 30 | """Represent the URLs for a specific language""" 31 | 32 | name: str 33 | dir_url: str 34 | dict_url: str = None 35 | affix_url: str = None 36 | license_url: str = None 37 | 38 | def set_urls(self, auth: AuthInfo | None): 39 | """Set dict, affix, and license URLs from the name and dir URL""" 40 | listing: list[dict[str, Any]] = get_url_data_json(self.dir_url, auth) 41 | self.dict_url = next( 42 | d["download_url"] for d in listing if d["name"].endswith(".dic") 43 | ) 44 | self.affix_url = next( 45 | d["download_url"] for d in listing if d["name"].endswith(".aff") 46 | ) 47 | self.license_url = next( 48 | d["download_url"] for d in listing if d["name"].lower() == "license" 49 | ) 50 | 51 | def download(self, path: str, auth: AuthInfo | None) -> None: 52 | """Download the files to a designated path""" 53 | print(f"Downloading files for language '{self.name}'") 54 | 55 | dict_path = f"{path}/{self.dict_fname}" 56 | affix_path = f"{path}/{self.affix_fname}" 57 | license_path = f"{path}/{self.license_fname}" 58 | all_paths = (dict_path, affix_path, license_path) 59 | 60 | for fname in all_paths: 61 | if os.path.exists(fname): 62 | print(f"Language '{self.name}' already exists, found '{fname}'") 63 | print("Skipping") 64 | return 65 | 66 | download_file(self.dict_url, f"{path}/{self.dict_fname}.tmp", auth) 67 | download_file(self.affix_url, f"{path}/{self.affix_fname}.tmp", auth) 68 | download_file(self.license_url, f"{path}/{self.license_fname}.tmp", auth) 69 | 70 | # If all goes well, there 
will be no problems. If one failed, program would abort 71 | # Now remove the old ones, if present 72 | for fname in all_paths: 73 | if os.path.exists(fname): 74 | os.remove(fname) 75 | 76 | # And replace with the new 77 | os.rename(f"{fname}.tmp", f"{fname}") 78 | 79 | print(f"Finished downloading files for '{self.name}'") 80 | 81 | @property 82 | def dict_fname(self): 83 | return f"{self.name}.dic" 84 | 85 | @property 86 | def affix_fname(self): 87 | return f"{self.name}.aff" 88 | 89 | @property 90 | def license_fname(self): 91 | return f"{self.name}.license" 92 | 93 | 94 | def make_req(url: str, auth: AuthInfo | None) -> str | urllib.request.Request: 95 | """Make a request with auth information""" 96 | if auth is None: 97 | return url 98 | 99 | auth_str = base64.b64encode(bytes(f"{auth.username}:{auth.token}", "utf8")) 100 | req = urllib.request.Request(url) 101 | req.add_header("Authorization", f"Basic {auth_str}") 102 | return req 103 | 104 | 105 | def get_url_data_json(url: str, auth: AuthInfo | None): 106 | return json.loads(urllib.request.urlopen(make_req(url, auth)).read()) 107 | 108 | 109 | def download_file(url: str, path: str, auth: AuthInfo | None): 110 | return urllib.request.urlretrieve(make_req(url, auth), path) 111 | 112 | 113 | def parse_args(): 114 | parser = argparse.ArgumentParser( 115 | prog="Dictionary downloader", 116 | description="Download dictionaries for development", 117 | ) 118 | parser.add_argument( 119 | "languages", nargs="+", help="Specify language codes to download" 120 | ) 121 | parser.add_argument("--username", help="specify a github username") 122 | parser.add_argument("--token", help="specify a github token") 123 | parser.add_argument( 124 | "--output-dir", help="specify the output directory", default="dictionaries" 125 | ) 126 | args = parser.parse_args() 127 | return args 128 | 129 | 130 | def make_lang_dicts(languages: list[str], auth: AuthInfo) -> list[LangDict]: 131 | print("Gathering listing") 132 | 133 | listing_data: 
list[dict] = get_url_data_json(ROOT_GH_URL, auth) 134 | 135 | lang_dicts: list[LangDict] = [] 136 | 137 | for lang in languages: 138 | lang_name = lang.replace("_", "-") 139 | listing = next( 140 | (listing for listing in listing_data if listing.get("name") == lang_name), 141 | None, 142 | ) 143 | if listing is None: 144 | print(f"Unable to find language {lang}") 145 | exit(1) 146 | lang_dicts.append(LangDict(listing["name"], listing["url"])) 147 | 148 | return lang_dicts 149 | 150 | 151 | def main(): 152 | print(__doc__) 153 | args = parse_args() 154 | username = args.username or os.environ.get("GH_USERNAME") 155 | token = args.token or os.environ.get("GH_TOKEN") 156 | 157 | if username is None or token is None: 158 | print("Not using authentication, large requests may fail") 159 | auth = None 160 | else: 161 | print("Using token authentication") 162 | auth = AuthInfo(username, token) 163 | 164 | print(username, token) 165 | lang_dicts = make_lang_dicts(args.languages, auth) 166 | 167 | for ldict in lang_dicts: 168 | ldict.set_urls(auth) 169 | ldict.download(args.output_dir, auth) 170 | 171 | 172 | if __name__ == "__main__": 173 | main() 174 | -------------------------------------------------------------------------------- /zspell-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "zspell-cli" 3 | version = "0.5.5" 4 | edition = "2021" 5 | publish = true 6 | description = "Command line interface for the ZSpell spellchecking library" 7 | rust-version = "1.70" 8 | 9 | 10 | # Note: this is symlinked from top level 11 | readme = "README.md" 12 | license-file = "LICENSE" 13 | 14 | documentation = "https://docs.rs/zspell" 15 | repository = "https://github.com/pluots/zspell" 16 | 17 | keywords = ["spellcheck", "spelling", "cli"] 18 | categories = ["algorithms", "text-processing", "command-line-utilities"] 19 | 20 | # Required to make name "zspell" instead of "zspell-cli" 21 | [[bin]] 22 | name = 
"zspell" 23 | path = "src/main.rs" 24 | 25 | [dependencies] 26 | cfg-if = "1.0" 27 | clap = { version = "4.4.18", features = ["derive", "wrap_help"] } 28 | hex = "0.4" 29 | indicatif = "0.17" 30 | serde = { version = "1.0.203", features = ["derive"] } 31 | serde_json = "1.0.117" 32 | sha1 = "0.10.6" 33 | stringmetrics = "2.2" 34 | termcolor = "1.4.1" 35 | anyhow = "1.0.86" 36 | zspell = { path = "../zspell", version = "0.5.5", features = ["zspell-unstable"] } 37 | ureq = { version = "2.9.7", features = ["json"] } 38 | zspell-index = "0.5.0" 39 | 40 | [dev-dependencies] 41 | criterion = "0.5" 42 | assert_cmd = "2.0" 43 | predicates = "3.1" 44 | tempfile = "3.10" 45 | httpmock = "0.7" 46 | # util = { path = "util" } 47 | 48 | [build-dependencies] 49 | clap = { version = "4.4", features = ["derive", "wrap_help"] } 50 | clap_mangen = "0.2" 51 | clap_complete = "4.4" 52 | 53 | [package.metadata.release] 54 | shared-version = true 55 | allow-branch = ["main", "release"] 56 | 57 | [[package.metadata.release.pre-release-replacements]] 58 | file = "Cargo.toml" 59 | # Need \d match so we don't accidentally match our pattern here 60 | search = 'zspell = \{ path = "../zspell", version = "[\d\.]*", features = \["zspell-unstable"\] \}' 61 | replace = 'zspell = { path = "../zspell", version = "{{version}}", features = ["zspell-unstable"] }' 62 | -------------------------------------------------------------------------------- /zspell-cli/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /zspell-cli/README.md: -------------------------------------------------------------------------------- 1 | ../README.md -------------------------------------------------------------------------------- /zspell-cli/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs::File; 3 | use 
std::io::Error; 4 | // Need to rename PathBuf because of the `include!` macro 5 | use std::path::{self, Path}; 6 | 7 | use clap::{Command, CommandFactory}; 8 | use clap_complete::generate_to; 9 | use clap_complete::shells::Shell; 10 | 11 | include!("src/cli/mod.rs"); 12 | 13 | fn build_shell_completion(cmd: &mut Command, outdir: &path::PathBuf) -> Result<(), Error> { 14 | // Generate shell completion scripts for our 15 | for shell in [ 16 | Shell::Bash, 17 | Shell::Elvish, 18 | Shell::Fish, 19 | Shell::PowerShell, 20 | Shell::Zsh, 21 | ] { 22 | let path = generate_to( 23 | shell, cmd, // We need to specify what generator to use 24 | "zspell", // We need to specify the bin name manually 25 | outdir, // We need to specify where to write 26 | )?; 27 | 28 | println!("cargo:warning=completion file written to {path:?}"); 29 | } 30 | 31 | Ok(()) 32 | } 33 | 34 | fn build_man_pages(cmd: Command, outdir: &Path) -> Result<(), Error> { 35 | // Generate man pages 36 | let man = clap_mangen::Man::new(cmd); 37 | let mut buffer: Vec = Default::default(); 38 | 39 | man.render(&mut buffer)?; 40 | 41 | let manpage_out = outdir.join("zspell.1"); 42 | 43 | println!("cargo:warning=manpage written to {manpage_out:?}"); 44 | 45 | std::fs::write(manpage_out, buffer)?; 46 | 47 | Ok(()) 48 | } 49 | 50 | fn main() -> Result<(), Error> { 51 | // Output directory will be a cargo-generated random directory 52 | let outdir = match env::var_os("OUT_DIR") { 53 | Some(outdir) => std::path::PathBuf::from(outdir), 54 | None => return Ok(()), 55 | }; 56 | 57 | let profile = std::env::var("PROFILE").unwrap(); 58 | 59 | // Don't generate outputs if we're in debug mode 60 | match profile.as_str() { 61 | "debug" => (), 62 | _ => { 63 | // Create a dummy file to help find the latest output 64 | let stamp_path = Path::new(&outdir).join("zspell-stamp"); 65 | if let Err(err) = File::create(&stamp_path) { 66 | panic!("failed to write {}: {}", stamp_path.display(), err); 67 | } 68 | 69 | let mut cmd = 
Cli::command(); 70 | 71 | build_shell_completion(&mut cmd, &outdir)?; 72 | build_man_pages(cmd, &outdir)?; 73 | } 74 | } 75 | 76 | Ok(()) 77 | } 78 | -------------------------------------------------------------------------------- /zspell-cli/src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::{Parser, Subcommand}; 4 | 5 | #[derive(Parser, Debug)] 6 | #[command(version, about, long_about = None)] 7 | pub struct Cli { 8 | /// If specified, run spellchecking on a file 9 | pub file: Option, 10 | 11 | /// Path to a dictionary file. Specify e.g. dictionaries/de_DE if 12 | /// dictionaries/de_DE.aff and dictionaries/de_DE.dic exist 13 | #[arg(short = 'd', long)] 14 | pub dict_path: Option, 15 | 16 | /// Whether to print misspelled words 17 | #[arg(short = 'l', long, default_value_t = false)] 18 | pub misspelled_words: bool, 19 | 20 | /// Whether to print lines with misspelled words 21 | #[arg(short = 'L', long, default_value_t = false)] 22 | pub misspelled_lines: bool, 23 | 24 | /// Print the a compiled dictionary's word list to stdout and exit 25 | #[arg(long, default_value_t = false)] 26 | pub generate_wordlist: bool, 27 | 28 | /// Enable morpological analysis mode 29 | #[arg(short = 'm', long, default_value_t = false)] 30 | pub analyze: bool, 31 | 32 | /// Enable word stemming mode 33 | #[arg(short = 's', long, default_value_t = false)] 34 | pub stem: bool, 35 | 36 | /// Print the search path and found dictionaries 37 | #[arg(short = 'D', long, default_value_t = false)] 38 | pub show_dictionaries: bool, 39 | 40 | /// Add a text or personal dictionary 41 | #[arg(short = 't', long, default_value_t = false)] 42 | pub text_dictionary: bool, 43 | 44 | #[command(subcommand)] 45 | pub command: Option, 46 | } 47 | 48 | impl Cli { 49 | pub fn validate(&self) -> Result<(), String> { 50 | if self.analyze && self.stem { 51 | Err("cannot use analysis and stemming together".into()) 52 | } 
else { 53 | Ok(()) 54 | } 55 | } 56 | } 57 | 58 | #[derive(Subcommand, Debug)] 59 | pub enum Commands { 60 | /// Calculate levenshtein distance 61 | Lev { 62 | /// The start string to calculate distance from 63 | string_a: String, 64 | 65 | /// The end string to calculate distance to 66 | string_b: String, 67 | 68 | /// Specify a maximum difference limit for the levenshthein distance 69 | #[arg(short, long, default_value_t = 1000)] 70 | limit: u32, 71 | }, 72 | } 73 | -------------------------------------------------------------------------------- /zspell-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | #![forbid(unsafe_code)] 2 | 3 | use std::process::ExitCode; 4 | 5 | use clap::Parser; 6 | 7 | mod cli; 8 | mod download; 9 | mod spelling; 10 | 11 | use spelling::spellcheck_cli; 12 | use stringmetrics::levenshtein_limit; 13 | 14 | fn main() -> ExitCode { 15 | let cli_parse = cli::Cli::parse(); 16 | if let Err(e) = cli_parse.validate() { 17 | eprintln!("{e}"); 18 | return ExitCode::FAILURE; 19 | } 20 | 21 | if let Some(cli::Commands::Lev { 22 | string_a, 23 | string_b, 24 | limit, 25 | }) = &cli_parse.command 26 | { 27 | println!("{}", levenshtein_limit(string_a, string_b, *limit)); 28 | return ExitCode::SUCCESS; 29 | } 30 | 31 | spellcheck_cli(&cli_parse); 32 | 33 | ExitCode::SUCCESS 34 | } 35 | -------------------------------------------------------------------------------- /zspell-cli/src/spelling/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Helpers for CLI spelling features 2 | 3 | use std::io::{self, BufRead, Write}; 4 | use std::process::ExitCode; 5 | use std::time::{Instant, SystemTime, UNIX_EPOCH}; 6 | 7 | use zspell::error::Error; 8 | use zspell::system::{create_dict_from_path, PKG_NAME, PKG_VERSION}; 9 | use zspell::Dictionary; 10 | 11 | use crate::cli::Cli; 12 | 13 | // A reminder that code is written by humans 14 | const SALUTATIONS: [&str; 9] = [ 15 | "goodbye", 16 | "auf Wiedersehen", 17 | "adios", 18 | "au revoir", 19 | "arrivederci", 20 | "annyeong", 21 | "sayōnara", 22 | "see you later calculator", 23 | "abyssinia", 24 | ]; 25 | 26 | pub fn spellcheck_cli(cli: &Cli) -> ExitCode { 27 | eprint!("{PKG_NAME} {PKG_VERSION} loading dictionaries... "); 28 | 29 | io::stdout().flush().unwrap(); 30 | 31 | let dict_path = if let Some(v) = cli.dict_path.as_ref() { 32 | v.as_str() 33 | } else { 34 | eprintln!("Dictionary path not specified. Please specify with `-d /path/to/dic`."); 35 | return ExitCode::FAILURE; 36 | }; 37 | 38 | let load_start = Instant::now(); 39 | let dict = match create_dict_from_path(dict_path) { 40 | Ok(v) => v, 41 | Err(e) => { 42 | match e { 43 | Error::Io(e) => eprintln!("IO error: {e}"), 44 | Error::Parse(e) => eprintln!("Error parsing: {e}"), 45 | Error::Build(e) => eprintln!("Error building: {e}"), 46 | Error::Regex(e) => eprintln!("Regex error: {e}"), 47 | _ => unreachable!(), 48 | }; 49 | return ExitCode::FAILURE; 50 | } 51 | }; 52 | let load_time = load_start.elapsed().as_secs_f32(); 53 | let wc = dict.wordlist().inner().len() + dict.wordlist_nosuggest().inner().len(); 54 | eprintln!("loaded {wc} words in {load_time:.2}s. 
started session"); 55 | 56 | if cli.generate_wordlist { 57 | todo!(); 58 | // for item in dic.iter_wordlist_items().unwrap() { 59 | // println!("{item}"); 60 | // } 61 | } else if cli.analyze { 62 | runner_morph_analysis(&dict); 63 | } else if cli.stem { 64 | runner_stemming(&dict); 65 | } else { 66 | runner_spellcheck(&dict); 67 | } 68 | 69 | // Quick RNG without external crates 70 | let bye = SALUTATIONS[SystemTime::now() 71 | .duration_since(UNIX_EPOCH) 72 | .unwrap() 73 | .as_micros() as usize 74 | % SALUTATIONS.len()]; 75 | 76 | eprintln!("\n\nsession ended, {bye}"); 77 | 78 | ExitCode::SUCCESS 79 | } 80 | 81 | fn runner_spellcheck(dict: &Dictionary) { 82 | stdin_interactive_runner(|line| { 83 | for (_, misspelled) in dict.check_indices(&line) { 84 | println!("{misspelled}"); 85 | } 86 | }) 87 | } 88 | 89 | /// From hunspell: 90 | /// 91 | /// ```text 92 | /// banana foo drinkable dofjjos 93 | /// banana banana 94 | /// 95 | /// foo foo 96 | /// 97 | /// drinkable drinkable 98 | /// drinkable drink 99 | /// 100 | /// dofjjos 101 | /// ``` 102 | fn runner_stemming(dict: &Dictionary) { 103 | stdin_interactive_runner(|line| { 104 | for entry in dict.entries(&line) { 105 | if let Some(stems) = entry.stems() { 106 | for stem in stems { 107 | println!("{} {stem}", entry.word()); 108 | } 109 | } else { 110 | println!("{}", entry.word()) 111 | } 112 | } 113 | println!(); 114 | }) 115 | } 116 | 117 | fn runner_morph_analysis(dict: &Dictionary) { 118 | stdin_interactive_runner(|line| { 119 | for entry in dict.entries(&line) { 120 | print!("{}", entry.word()); 121 | 122 | if let Some(morphs) = entry.analyze() { 123 | for morph in morphs { 124 | print!(" {morph}"); 125 | } 126 | } else { 127 | println!("{}", entry.word()) 128 | } 129 | } 130 | println!("\n"); 131 | }) 132 | } 133 | 134 | /// Common runner interface that calls a function once per stdin line 135 | // FIXME: if not a tty, lock output once before writing 136 | fn stdin_interactive_runner(f: F) 137 | where 138 
| F: Fn(String), 139 | { 140 | let stdin = io::stdin(); 141 | // This is a false positive, see clippy #9135 142 | // #[allow(clippy::significant_drop_in_scrutinee)] 143 | for line in stdin.lock().lines() { 144 | let line_val = line.expect("received invalid input from IO!"); 145 | f(line_val); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /zspell-cli/tests/cli_dict.rs: -------------------------------------------------------------------------------- 1 | //! Tests for the dictionary command line interface 2 | -------------------------------------------------------------------------------- /zspell-cli/tests/cli_lev.rs: -------------------------------------------------------------------------------- 1 | //! Tests for the levenshtein command line interface 2 | 3 | use std::process::Command; // Run programs 4 | 5 | use assert_cmd::prelude::*; // Add methods on commands 6 | use predicates::prelude::*; // Used for writing assertions 7 | 8 | #[test] 9 | fn lev_basic() -> Result<(), Box> { 10 | let mut cmd = Command::cargo_bin("zspell")?; 11 | 12 | cmd.arg("lev") 13 | .arg("the quick brown fox") 14 | .arg("the slow brown flocks"); 15 | cmd.assert().success().stdout(predicate::str::contains("9")); 16 | 17 | Ok(()) 18 | } 19 | -------------------------------------------------------------------------------- /zspell-cli/tests/files/de_res.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluots/zspell/497fb8fa7c6a98d879e7541be942efa54242f595/zspell-cli/tests/files/de_res.txt -------------------------------------------------------------------------------- /zspell-cli/tests/files/sample-index.json: -------------------------------------------------------------------------------- 1 | { 2 | "schema_version": 1, 3 | "updated": "2024-01-18T09:49:04Z", 4 | "items": [ 5 | { 6 | "lang": "de-AT", 7 | "tags": [ 8 | "source-wooorm" 9 | ], 10 | "is_ext": false, 11 | 
"id": "018d1bf7-22c1-7618-b42e-80592e77bc8a", 12 | "fmt": "hunspell", 13 | "aff": { 14 | "urls": [ 15 | "{{ROOT_URL}}/dictionaries/de-AT/index.aff" 16 | ], 17 | "hash": "sha1:a464def0d8bb136f20012d431b60faae2cc915b5", 18 | "size": 19199 19 | }, 20 | "dic": { 21 | "urls": [ 22 | "{{ROOT_URL}}/dictionaries/de-AT/index.dic" 23 | ], 24 | "hash": "sha1:eee2f5c4eddac4175d67c00bc808032b02058b5d", 25 | "size": 1121822 26 | }, 27 | "lic": { 28 | "urls": [ 29 | "{{ROOT_URL}}/dictionaries/de-AT/license" 30 | ], 31 | "hash": "sha1:c4d083267263c478591c4856981f32f31690456d", 32 | "size": 760 33 | } 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /zspell-py/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "zspell-py" 3 | version = "0.5.5" 4 | edition = "2021" 5 | publish = false 6 | 7 | readme = "README.md" 8 | # Note: this is symlinked from top level 9 | license-file = "LICENSE" 10 | 11 | [lib] 12 | name = "zspell" 13 | crate-type = ["cdylib"] 14 | doc = false 15 | 16 | [dependencies] 17 | regex = "1.10" 18 | pyo3 = { version = "0.21.2", features = ["extension-module"] } 19 | zspell = { path = "../zspell" } 20 | 21 | [build-dependencies] 22 | pyo3-build-config = "0.21.2" 23 | 24 | [package.metadata.release] 25 | shared-version = true 26 | -------------------------------------------------------------------------------- /zspell-py/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /zspell-py/README.md: -------------------------------------------------------------------------------- 1 | ../README-py.md -------------------------------------------------------------------------------- /zspell-py/build.rs: -------------------------------------------------------------------------------- 1 | // Special build script is needed to 
link to python C source on mac 2 | 3 | fn main() { 4 | pyo3_build_config::add_extension_module_link_args(); 5 | } 6 | -------------------------------------------------------------------------------- /zspell-py/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /zspell-py/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # sys.path.insert(0, os.path.abspath('.')) 14 | 15 | import re 16 | from pathlib import Path 17 | 18 | import m2r 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "zspell" 23 | copyright = "2023, Trevor Gross" 24 | author = "Trevor Gross" 25 | 26 | # The full version, including alpha/beta/rc tags 27 | path = Path(__file__).parent.parent.joinpath("Cargo.toml") 28 | with path.open() as fs: 29 | fstr = fs.read() 30 | 31 | # Single source of truth for the version 32 | release = re.search(r"^version\s*=\s*\"(.*)\"$", fstr, re.MULTILINE).groups()[0] 33 | 34 | 35 | # -- General configuration --------------------------------------------------- 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | "sphinx.ext.duration", 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.autodoc", 44 | "sphinx.ext.autosummary", 45 | "sphinx.ext.intersphinx", 46 | ] 47 | 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ["_templates"] 51 | 52 | # List of patterns, relative to source directory, that match files and 53 | # directories to ignore when looking for source files. 54 | # This pattern also affects html_static_path and html_extra_path. 55 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 56 | 57 | github_url = "https://github.com/pluots/zspell/" 58 | 59 | # Autodoc options 60 | autodoc_member_order = "bysource" 61 | autoclass_content = "both" 62 | 63 | 64 | # -- Options for HTML output ------------------------------------------------- 65 | 66 | # The theme to use for HTML and HTML Help pages. See the documentation for 67 | # a list of builtin themes. 68 | # 69 | html_theme = "furo" 70 | 71 | # Add any paths that contain custom static files (such as style sheets) here, 72 | # relative to this directory. 
They are copied after the builtin static files, 73 | # so a file named "default.css" will overwrite the builtin "default.css". 74 | html_static_path = ["_static"] 75 | 76 | 77 | def convert_docstrings(app, what, name, obj, options, lines): 78 | """Convert docstrings from markdown to RST""" 79 | md = "\n".join(lines) 80 | rst = m2r.convert(md) 81 | lines.clear() 82 | lines += rst.splitlines() 83 | 84 | 85 | def setup(app): 86 | app.connect("autodoc-process-docstring", convert_docstrings) 87 | -------------------------------------------------------------------------------- /zspell-py/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. zspell documentation master file, created by 2 | sphinx-quickstart on Thu Jun 30 00:40:42 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ZSpell Documentation: Python Interface 7 | ========================================= 8 | 9 | Welcome to the documentation for ZSpell's Python interface. This contains the 10 | basics, please visit the repository at https://github.com/pluots/zspell for 11 | more information. 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | :caption: Contents: 16 | 17 | .. module:: zspell 18 | 19 | .. autoclass:: Dictionary 20 | :members: 21 | :undoc-members: 22 | .. :special-members: 23 | 24 | .. autoclass:: BuildError 25 | :members: 26 | 27 | .. autoclass:: ParseError 28 | :members: 29 | 30 | .. autoclass:: RegexError 31 | :members: 32 | 33 | .. autoclass:: IoError 34 | :members: 35 | 36 | 37 | .. Indices and tables 38 | .. ================== 39 | 40 | .. * :ref:`genindex` 41 | .. * :ref:`modindex` 42 | .. 
* :ref:`search` 43 | -------------------------------------------------------------------------------- /zspell-py/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /zspell-py/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | furo>=2022.12 2 | sphinx>=6.0 3 | pygments>=2.13 4 | m2r>=0.3 5 | maturin>=0.14 6 | -------------------------------------------------------------------------------- /zspell-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.14,<0.15"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "zspell" 7 | requires-python = ">=3.7" 8 | description = "A simple yet fast spellchecker that works with Hunspell dictionaries" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: 
Python :: Implementation :: PyPy", 13 | ] 14 | 15 | [tool.pytest.ini_options] 16 | minversion = "6.0" 17 | testpaths = ["tests"] 18 | -------------------------------------------------------------------------------- /zspell-py/python/zspell/__init__.py: -------------------------------------------------------------------------------- 1 | from .zspell import * 2 | 3 | __doc__ = zspell.__doc__ 4 | if hasattr(zspell, "__all__"): 5 | __all__ = zspell.__all__ 6 | -------------------------------------------------------------------------------- /zspell-py/python/zspell/py.types: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pluots/zspell/497fb8fa7c6a98d879e7541be942efa54242f595/zspell-py/python/zspell/py.types -------------------------------------------------------------------------------- /zspell-py/python/zspell/zspell.pyi: -------------------------------------------------------------------------------- 1 | class Dictionary: 2 | def __new__( 3 | config_str: str, dict_str: str, personal_str: str | None 4 | ) -> Dictionary: ... 5 | def check(self, input: str) -> bool: ... 6 | def check_word(self, word: str) -> bool: ... 7 | 8 | class BuildError: ... 9 | class IoError: ... 10 | class ParseError: ... 11 | class RegexError: ... 12 | -------------------------------------------------------------------------------- /zspell-py/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Wrappers around the `zspell` module to expose it to Python 2 | #![forbid(unsafe_code)] 3 | 4 | use ::zspell as z; 5 | use pyo3::create_exception; 6 | use pyo3::exceptions::PyException; 7 | use pyo3::prelude::*; 8 | 9 | #[pyclass] 10 | #[derive(Debug)] 11 | /// This is the main dictionary interface. 12 | /// 13 | /// To use it, you need to load in both an affix configuration file and a 14 | /// dictionary file. 
Sometimes these are installed on your system but if not, 15 | /// this repository has them available: 16 | /// . 17 | /// 18 | /// ```pycon 19 | /// >>> from zspell import Dictionary 20 | /// >>> with open ("dictionaries/en_US.aff", "r") as f: 21 | /// ... config_str = f.read() 22 | /// ... 23 | /// >>> with open ("dictionaries/en_US.dic", "r") as f: 24 | /// ... dict_str = f.read() 25 | /// ... 26 | /// >>> d = Dictionary(config_str, dict_str) 27 | /// >>> d.check("Apples are good! Don't you think?") 28 | /// True 29 | /// >>> d.check("Apples are baaaad") 30 | /// False 31 | /// ``` 32 | struct Dictionary(z::Dictionary); 33 | 34 | #[pymethods] 35 | impl Dictionary { 36 | /// Create a new dictionary 37 | #[new] 38 | #[pyo3(text_signature = "(config_str, dict_str)")] 39 | fn new(config_str: &str, dict_str: &str, personal_str: Option<&str>) -> PyResult { 40 | let mut builder = z::DictBuilder::new() 41 | .dict_str(dict_str) 42 | .config_str(config_str); 43 | 44 | if let Some(personal) = personal_str { 45 | builder = builder.personal_str(personal); 46 | } 47 | 48 | match builder.build() { 49 | Ok(dict) => Ok(Self(dict)), 50 | Err(err) => Err(convert_error(err)), 51 | } 52 | } 53 | 54 | /// Check if a string is valid. 55 | #[pyo3(text_signature = "($self, input)")] 56 | fn check(&self, input: &str) -> bool { 57 | self.0.check(input) 58 | } 59 | 60 | /// Check if a single word is valid. 
61 | #[pyo3(text_signature = "($self, word)")] 62 | fn check_word(&self, word: &str) -> bool { 63 | self.0.check_word(word) 64 | } 65 | // TODO: figure out how to convert to a python iterator 66 | // fn check_indices<'a: 'd, 'd>(&'d self, word: &'a str) -> impl Iterator + 'd{ 67 | // self.0.check_indices(word) 68 | // } 69 | } 70 | 71 | fn convert_error(err: z::Error) -> PyErr { 72 | match err { 73 | z::Error::Parse(e) => ParseError::new_err(format!("{e}")), 74 | z::Error::Build(e) => BuildError::new_err(format!("{e}")), 75 | z::Error::Regex(e) => RegexError::new_err(format!("{e}")), 76 | z::Error::Io(e) => IoError::new_err(format!("{e}")), 77 | _ => unreachable!(), 78 | } 79 | } 80 | 81 | create_exception!( 82 | my_module, 83 | BuildError, 84 | PyException, 85 | "Raised when there is an error building the dictionary." 86 | ); 87 | create_exception!( 88 | my_module, 89 | ParseError, 90 | PyException, 91 | "Raised when there is an error parsing dictionary input." 92 | ); 93 | create_exception!( 94 | my_module, 95 | RegexError, 96 | PyException, 97 | "Raised when there is an error with parsed regex." 98 | ); 99 | create_exception!( 100 | my_module, 101 | IoError, 102 | PyException, 103 | "Raised when there is an I/O error." 104 | ); 105 | 106 | #[pymodule] 107 | fn zspell(py: Python<'_>, m: &PyModule) -> PyResult<()> { 108 | m.add_class::()?; 109 | m.add("BuildError", py.get_type::())?; 110 | m.add("ParseError", py.get_type::())?; 111 | m.add("IoError", py.get_type::())?; 112 | m.add("RegexError", py.get_type::())?; 113 | Ok(()) 114 | } 115 | -------------------------------------------------------------------------------- /zspell-py/tests/test_basic.py: -------------------------------------------------------------------------------- 1 | from zspell import Dictionary 2 | 3 | CFG_STR = """SET UTF-8 4 | 5 | PFX A Y 1 6 | PFX A 0 aa . 
7 | 8 | SFX B Y 2 9 | SFX B y bb y 10 | SFX B 0 cc [^y] 11 | """ 12 | 13 | DICT_STR = """3 14 | xxx/A 15 | yyy/B 16 | zzz/AB 17 | """ 18 | 19 | 20 | def test_simple(): 21 | d = Dictionary(CFG_STR, DICT_STR) 22 | assert d.check("xxx") 23 | assert d.check("aaxxx") 24 | assert d.check("aazzzcc") 25 | -------------------------------------------------------------------------------- /zspell/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "zspell" 3 | version = "0.5.5" 4 | edition = "2021" 5 | authors = ["Trevor Gross "] 6 | description = "Native Rust library for spellchecking" 7 | rust-version = "1.65" 8 | 9 | readme = "README.md" 10 | license-file = "LICENSE" 11 | 12 | documentation = "https://docs.rs/zspell" 13 | repository = "https://github.com/pluots/zspell" 14 | 15 | keywords = ["spellcheck", "spelling", "cli"] 16 | categories = ["algorithms", "text-processing", "command-line-utilities"] 17 | 18 | [badges] 19 | maintenance = { status = "actively-developed" } 20 | 21 | # Config for the rust-usable library and the executable binary 22 | [lib] 23 | name = "zspell" 24 | path = "src/lib.rs" 25 | 26 | 27 | [dependencies] 28 | # Base dependencies 29 | cfg-if = "1.0" 30 | dirs = "5.0.1" 31 | hashbrown = "0.14.5" 32 | itertools = "0.13.0" 33 | lazy_static = "1.4" 34 | regex = "1.10" 35 | stringmetrics = "2.2.2" 36 | sys-locale = "0.3.1" 37 | unicode-segmentation = "1.11.0" 38 | visibility = "0.1.0" 39 | xxhash-rust = { version = "0.8.10", features = ["xxh32"] } 40 | 41 | [dev-dependencies] 42 | criterion = "0.5" 43 | pretty_assertions = "1.4" 44 | tempfile = "3.10" 45 | rand = "0.8.5" 46 | test-util = { path = "test-util" } 47 | indoc = "2.0.5" 48 | 49 | [build-dependencies] 50 | autocfg = "1.3.0" 51 | indoc = "2.0.5" 52 | 53 | [features] 54 | unstable-suggestions = [] 55 | unstable-system = [] 56 | unstable-bench = [] 57 | zspell-unstable = ["unstable-suggestions", "unstable-system"] 58 | 59 | [[bench]] 
60 | name = "datastructure" 61 | harness = false 62 | 63 | [[bench]] 64 | name = "dict_integration" 65 | harness = false 66 | 67 | [[bench]] 68 | name = "slice_contains" 69 | harness = false 70 | 71 | [[bench]] 72 | name = "small_map" 73 | harness = false 74 | 75 | [[bench]] 76 | name = "word_splitter" 77 | harness = false 78 | 79 | [package.metadata.release] 80 | shared-version = true 81 | allow-branch = ["main", "release"] 82 | 83 | # Can't run replacements at workspace root. Need to use this "hacky" sort of way. 84 | [[package.metadata.release.pre-release-replacements]] 85 | file = "../CHANGELOG.md" 86 | search = "Unreleased" 87 | replace = "{{version}}" 88 | 89 | [[package.metadata.release.pre-release-replacements]] 90 | file = "../CHANGELOG.md" 91 | search = "\\.\\.\\.HEAD" 92 | replace = "...{{tag_name}}" 93 | exactly = 1 94 | 95 | [[package.metadata.release.pre-release-replacements]] 96 | file = "../CHANGELOG.md" 97 | search = "ReleaseDate" 98 | replace = "{{date}}" 99 | 100 | [[package.metadata.release.pre-release-replacements]] 101 | file = "../CHANGELOG.md" 102 | search = "" 103 | replace = """\ 104 | \n\n\ 105 | ## [Unreleased] - ReleaseDate\n\n\ 106 | ### Added\n\n\ 107 | ### Changed\n\n\ 108 | ### Removed\n\n\ 109 | """ 110 | exactly = 1 111 | 112 | [[package.metadata.release.pre-release-replacements]] 113 | file = "../CHANGELOG.md" 114 | search = "" 115 | replace = """\ 116 | \n\ 117 | [Unreleased]: https://github.com/pluots/zspell/compare/{{tag_name}}...HEAD\ 118 | """ 119 | exactly = 1 120 | -------------------------------------------------------------------------------- /zspell/LICENSE: -------------------------------------------------------------------------------- 1 | ../LICENSE -------------------------------------------------------------------------------- /zspell/README.md: -------------------------------------------------------------------------------- 1 | ../README.md 
-------------------------------------------------------------------------------- /zspell/benches/datastructure.rs: -------------------------------------------------------------------------------- 1 | //! Benchmarks for operations on datastructures that resemble operations we 2 | //! might use in our spellchecker 3 | 4 | #![allow(clippy::disallowed_types)] 5 | #![allow(clippy::incompatible_msrv)] 6 | 7 | use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; 8 | use std::fs::File; 9 | use std::hint::black_box; 10 | use std::io::{self, BufRead}; 11 | use std::iter::FromIterator; 12 | 13 | use criterion::{criterion_group, criterion_main, Criterion}; 14 | use hashbrown::{HashMap as HashBrownMap, HashSet as HashBrownSet}; 15 | 16 | // We will check all variables in these contains and contains false lists - we 17 | // want a variety of names from throughout the set 18 | const CONTAINS_LIST: [&str; 15] = [ 19 | "Accenture", 20 | "Curie", 21 | "Gujranwala", 22 | "Hesperus", 23 | "Juneau", 24 | "Lakeland", 25 | "Mephistopheles", 26 | "O'Connell", 27 | "Sweden", 28 | "Sarajevo", 29 | "sweptback", 30 | "tigerish", 31 | "Vespucci", 32 | "zymurgy", 33 | "0", 34 | ]; 35 | 36 | const NOT_CONTAINS_LIST: [&str; 15] = [ 37 | "aaaaaa", 38 | "Curied", 39 | "gujranwalda", 40 | "Hesperuds", 41 | "Junaeau", 42 | "Lakaeland", 43 | "Mepsifstopheles", 44 | "OFonnell", 45 | "Swayden", 46 | "Sarajayovo", 47 | "sweptabback", 48 | "tigerstripeish", 49 | "Vespucki", 50 | "zzzzzzz", 51 | "000000", 52 | ]; 53 | 54 | static STR_REF: &str = "SOMETHING"; 55 | 56 | /// Load lines from a file 57 | /// Strip the affix "/" directive 58 | fn lines_loader() -> Vec { 59 | let file = File::open("../dictionaries/en_US.dic").unwrap(); 60 | let lines = io::BufReader::new(file).lines(); 61 | 62 | let mut v: Vec = Vec::new(); 63 | 64 | for line in lines { 65 | v.push(line.unwrap().split('/').next().unwrap().to_owned()); 66 | } 67 | 68 | // Validate items 69 | for item in CONTAINS_LIST { 70 | 
assert!(v.contains(&item.to_string())) 71 | } 72 | for item in NOT_CONTAINS_LIST { 73 | assert!(!v.contains(&item.to_string())) 74 | } 75 | 76 | v 77 | } 78 | 79 | type NestedVecMap = Vec<(T1, Vec)>; 80 | 81 | /// Take the results of `lines_loader` and create a map datatype 82 | /// This replicates the data structure we store with some meta 83 | fn map_loader() -> NestedVecMap { 84 | let lines = lines_loader(); 85 | lines 86 | .iter() 87 | .map(|line| (line.clone(), vec![STR_REF])) 88 | .collect() 89 | } 90 | 91 | // Actual benchmark calling functions 92 | 93 | pub fn bench_vec(c: &mut Criterion) { 94 | let vec: Vec = lines_loader(); 95 | 96 | c.bench_function("Vec contains true", |b| { 97 | b.iter(|| { 98 | for item in CONTAINS_LIST { 99 | black_box(vec.iter().any(|x| x == black_box(item))); 100 | } 101 | }) 102 | }); 103 | 104 | c.bench_function("Vec contains false", |b| { 105 | b.iter(|| { 106 | for item in NOT_CONTAINS_LIST { 107 | black_box(vec.iter().any(|x| x == black_box(item))); 108 | } 109 | }) 110 | }); 111 | } 112 | 113 | pub fn bench_btree(c: &mut Criterion) { 114 | let bt = BTreeSet::from_iter(lines_loader()); 115 | 116 | c.bench_function("BTree contains true", |b| { 117 | b.iter(|| { 118 | for item in CONTAINS_LIST { 119 | black_box(bt.contains(black_box(item))); 120 | } 121 | }) 122 | }); 123 | 124 | c.bench_function("BTree contains false", |b| { 125 | b.iter(|| { 126 | for item in NOT_CONTAINS_LIST { 127 | black_box(bt.contains(black_box(item))); 128 | } 129 | }) 130 | }); 131 | } 132 | 133 | pub fn bench_hashset(c: &mut Criterion) { 134 | let hs: HashSet = HashSet::from_iter(lines_loader()); 135 | 136 | c.bench_function("HashSet contains true", |b| { 137 | b.iter(|| { 138 | for item in CONTAINS_LIST { 139 | black_box(hs.contains(black_box(item))); 140 | } 141 | }) 142 | }); 143 | 144 | c.bench_function("HashSet contains false", |b| { 145 | b.iter(|| { 146 | for item in NOT_CONTAINS_LIST { 147 | black_box(hs.contains(black_box(item))); 148 | } 149 
| }) 150 | }); 151 | } 152 | 153 | pub fn bench_hashbrownset(c: &mut Criterion) { 154 | let hs: HashBrownSet = HashBrownSet::from_iter(lines_loader()); 155 | 156 | c.bench_function("HashBrownSet contains true", |b| { 157 | b.iter(|| { 158 | for item in CONTAINS_LIST { 159 | black_box(hs.contains(black_box(item))); 160 | } 161 | }) 162 | }); 163 | 164 | c.bench_function("HashBrownSet contains false", |b| { 165 | b.iter(|| { 166 | for item in NOT_CONTAINS_LIST { 167 | black_box(hs.contains(black_box(item))); 168 | } 169 | }) 170 | }); 171 | } 172 | 173 | // Map type benchmarks 174 | 175 | pub fn bench_vecmap(c: &mut Criterion) { 176 | let vm: NestedVecMap<_, _> = map_loader(); 177 | 178 | c.bench_function("VecMap contains true", |b| { 179 | b.iter(|| { 180 | for item in CONTAINS_LIST { 181 | black_box(vm.iter().any(|x| x.0 == black_box(item))); 182 | } 183 | }) 184 | }); 185 | 186 | c.bench_function("VecMap contains false", |b| { 187 | b.iter(|| { 188 | for item in NOT_CONTAINS_LIST { 189 | black_box(vm.iter().any(|x| x.0 == black_box(item))); 190 | } 191 | }) 192 | }); 193 | 194 | c.bench_function("VecMap get true", |b| { 195 | b.iter(|| { 196 | for item in CONTAINS_LIST { 197 | black_box(vm.iter().find(|x| x.0 == black_box(item)).map(|x| &x.1)); 198 | } 199 | }) 200 | }); 201 | 202 | c.bench_function("VecMap get false", |b| { 203 | b.iter(|| { 204 | for item in NOT_CONTAINS_LIST { 205 | black_box(vm.iter().find(|x| x.0 == black_box(item)).map(|x| &x.1)); 206 | } 207 | }) 208 | }); 209 | } 210 | 211 | pub fn bench_btreemap(c: &mut Criterion) { 212 | let bt: BTreeMap = BTreeMap::from_iter(map_loader()); 213 | 214 | c.bench_function("BTreeMap contains true", |b| { 215 | b.iter(|| { 216 | for item in CONTAINS_LIST { 217 | black_box(bt.contains_key(black_box(item))); 218 | } 219 | }) 220 | }); 221 | 222 | c.bench_function("BTreeMap contains false", |b| { 223 | b.iter(|| { 224 | for item in NOT_CONTAINS_LIST { 225 | black_box(bt.contains_key(black_box(item))); 226 | } 
227 | }) 228 | }); 229 | 230 | c.bench_function("BTreeMap get true", |b| { 231 | b.iter(|| { 232 | for item in CONTAINS_LIST { 233 | black_box(bt.get(black_box(item))); 234 | } 235 | }) 236 | }); 237 | 238 | c.bench_function("BTreeMap get false", |b| { 239 | b.iter(|| { 240 | for item in NOT_CONTAINS_LIST { 241 | black_box(bt.get(black_box(item))); 242 | } 243 | }) 244 | }); 245 | } 246 | 247 | pub fn bench_hashmap(c: &mut Criterion) { 248 | let hm: HashMap = HashMap::from_iter(map_loader()); 249 | 250 | c.bench_function("HashMap contains true", |b| { 251 | b.iter(|| { 252 | for item in CONTAINS_LIST { 253 | black_box(hm.contains_key(black_box(item))); 254 | } 255 | }) 256 | }); 257 | 258 | c.bench_function("HashMap contains false", |b| { 259 | b.iter(|| { 260 | for item in NOT_CONTAINS_LIST { 261 | black_box(hm.contains_key(black_box(item))); 262 | } 263 | }) 264 | }); 265 | 266 | c.bench_function("HashMap get true", |b| { 267 | b.iter(|| { 268 | for item in CONTAINS_LIST { 269 | black_box(hm.get(black_box(item))); 270 | } 271 | }) 272 | }); 273 | 274 | c.bench_function("HashMap get false", |b| { 275 | b.iter(|| { 276 | for item in NOT_CONTAINS_LIST { 277 | black_box(hm.get(black_box(item))); 278 | } 279 | }) 280 | }); 281 | } 282 | 283 | pub fn bench_hashbrownmap(c: &mut Criterion) { 284 | let hm: HashBrownMap = HashBrownMap::from_iter(map_loader()); 285 | 286 | c.bench_function("HashBrownMap contains true", |b| { 287 | b.iter(|| { 288 | for item in CONTAINS_LIST { 289 | black_box(hm.contains_key(black_box(item))); 290 | } 291 | }) 292 | }); 293 | 294 | c.bench_function("HashBrownMap contains false", |b| { 295 | b.iter(|| { 296 | for item in NOT_CONTAINS_LIST { 297 | black_box(hm.contains_key(black_box(item))); 298 | } 299 | }) 300 | }); 301 | 302 | c.bench_function("HashBrownMap get true", |b| { 303 | b.iter(|| { 304 | for item in CONTAINS_LIST { 305 | black_box(hm.get(black_box(item))); 306 | } 307 | }) 308 | }); 309 | 310 | c.bench_function("HashBrownMap get 
false", |b| { 311 | b.iter(|| { 312 | for item in NOT_CONTAINS_LIST { 313 | black_box(hm.get(black_box(item))); 314 | } 315 | }) 316 | }); 317 | } 318 | 319 | criterion_group!( 320 | datastructure, 321 | bench_vec, 322 | bench_btree, 323 | bench_hashset, 324 | bench_hashbrownset, 325 | bench_vecmap, 326 | bench_btreemap, 327 | bench_hashmap, 328 | bench_hashbrownmap 329 | ); 330 | criterion_main!(datastructure); 331 | -------------------------------------------------------------------------------- /zspell/benches/dict_integration.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::incompatible_msrv)] 2 | 3 | use std::fs; 4 | use std::hint::black_box; 5 | 6 | use criterion::{criterion_group, criterion_main, Criterion}; 7 | use zspell::bench::{affix_from_str, DictEntry, FlagType}; 8 | use zspell::{DictBuilder, Dictionary}; 9 | 10 | const TEXT: &str = "A Hare was mking fun of the Tortoise one day for being so slow. 11 | 12 | Do you ever get anywhere? he asked with a mocking laugh. 13 | 14 | Yes, replied the Tortoise, and I get there sooner than you think. I'll 15 | run you a race and prove it. 16 | 17 | The Hare was much amused at the iea of running a race with the Tortise, 18 | but for the fun of the thing he agreed. So the Fox, who had consented to 19 | act as judge, maarked the distance and started the runners off. 20 | 21 | The Hare was soon far out of sight, and to make the Tortoise feel very 22 | deeply how ridiculous it was for him to try a race with a Hare, he lay 23 | down beside the course to take a nap until the Tortoise should catch up. 24 | 25 | The Tortoise meanwhile kept going sloly but steadily, and, after a time, 26 | passed the place where the Hare was sleeping. But the Hare slept on very 27 | peacefully; and when at last he did wake up, the Tortoise was near the goal. 
28 | The Hare now ran his swiftest, but he could not overtaake the Tortoise 29 | in time."; 30 | 31 | const CONTAINS_LIST: [&str; 15] = [ 32 | "Accenture", 33 | "Curie", 34 | "Gujranwala", 35 | "Hesperus", 36 | "Juneau", 37 | "Lakeland", 38 | "Mephistopheles", 39 | "O'Connell", 40 | "Sweden", 41 | "Sarajevo", 42 | "sweptback", 43 | "tigerish", 44 | "Vespucci", 45 | "zymurgy", 46 | "0", 47 | ]; 48 | 49 | const NOT_CONTAINS_LIST: [&str; 15] = [ 50 | "aaaaaa", 51 | "Curied", 52 | "gujranwalda", 53 | "Hesperuds", 54 | "Junaeau", 55 | "Lakaeland", 56 | "Mepsifstopheles", 57 | "OFonnell", 58 | "Swayden", 59 | "Sarajayovo", 60 | "sweptabback", 61 | "tigerstripeish", 62 | "Vespucki", 63 | "zzzzzzz", 64 | "000000", 65 | ]; 66 | 67 | fn fixture_create_en_dict() -> Dictionary { 68 | // Test that we correctly compile the short wordlist 69 | 70 | let aff_content = fs::read_to_string("../dictionaries/en_US.aff").unwrap(); 71 | let dic_content = fs::read_to_string("../dictionaries/en_US.dic").unwrap(); 72 | 73 | DictBuilder::new() 74 | .dict_str(black_box(&dic_content)) 75 | .config_str(black_box(&aff_content)) 76 | .build() 77 | .unwrap() 78 | } 79 | 80 | pub fn bench_parsers(c: &mut Criterion) { 81 | let aff_content = fs::read_to_string("../dictionaries/en_US.aff").unwrap(); 82 | let dic_content = fs::read_to_string("../dictionaries/en_US.dic").unwrap(); 83 | 84 | c.bench_function("Parse affix file", |b| { 85 | b.iter(|| black_box(affix_from_str(black_box(&aff_content)).unwrap())) 86 | }); 87 | 88 | c.bench_function("Parse dict file", |b| { 89 | b.iter(|| { 90 | black_box( 91 | DictEntry::parse_all(black_box(&dic_content), black_box(FlagType::Utf8)).unwrap(), 92 | ) 93 | }) 94 | }); 95 | } 96 | 97 | /// This test just creates a dictionary. The compiling is the slow step. 
98 | pub fn bench_dict_compile(c: &mut Criterion) { 99 | let aff_content = fs::read_to_string("../dictionaries/en_US.aff").unwrap(); 100 | let dic_content = fs::read_to_string("../dictionaries/en_US.dic").unwrap(); 101 | 102 | c.bench_function("Spellcheck: compile dictionary", |b| { 103 | b.iter(|| { 104 | black_box( 105 | DictBuilder::new() 106 | .dict_str(black_box(&dic_content)) 107 | .config_str(black_box(&aff_content)) 108 | .build() 109 | .unwrap(), 110 | ) 111 | }) 112 | }); 113 | } 114 | 115 | pub fn bench_dict_simple(c: &mut Criterion) { 116 | let dict = fixture_create_en_dict(); 117 | c.bench_function("Spellcheck: 1 correct word", |b| { 118 | b.iter(|| black_box(dict.check_word(black_box("turbidity's")))) 119 | }); 120 | 121 | c.bench_function("Spellcheck: 1 incorrect word", |b| { 122 | b.iter(|| black_box(dict.check_word(black_box("turbiditated")))) 123 | }); 124 | 125 | c.bench_function("Spellcheck: 15 correct words", |b| { 126 | b.iter(|| { 127 | for item in CONTAINS_LIST { 128 | black_box(dict.check(black_box(item))); 129 | } 130 | }) 131 | }); 132 | 133 | c.bench_function("Spellcheck: 15 incorrect words", |b| { 134 | b.iter(|| { 135 | for item in NOT_CONTAINS_LIST { 136 | black_box(dict.check(black_box(item))); 137 | } 138 | }) 139 | }); 140 | } 141 | 142 | pub fn bench_dict_paragraph(c: &mut Criterion) { 143 | let dict = fixture_create_en_dict(); 144 | 145 | c.bench_function("Spellcheck: 188 word paragraph", |b| { 146 | b.iter(|| black_box(dict.check(black_box(TEXT)))) 147 | }); 148 | } 149 | 150 | criterion_group!( 151 | dict_integration, 152 | bench_parsers, 153 | bench_dict_compile, 154 | bench_dict_simple, 155 | bench_dict_paragraph, 156 | // bench_parallel, 157 | ); 158 | criterion_main!(dict_integration); 159 | -------------------------------------------------------------------------------- /zspell/benches/slice_contains.rs: -------------------------------------------------------------------------------- 1 | //! 
Benchmark the difference between contains & `binary_search`es, intended 2 | 3 | #![allow(clippy::incompatible_msrv)] 4 | 5 | use std::hint::black_box; 6 | 7 | use criterion::{criterion_group, criterion_main, Criterion}; 8 | 9 | pub fn benches(c: &mut Criterion) { 10 | const EMPTY: [&str; 0] = []; 11 | const SORT1: [&str; 1] = ["A"]; 12 | const SORT3: [&str; 3] = ["A", "B", "C"]; 13 | const SORT10: [&str; 10] = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]; 14 | const UNSORT3: [&str; 3] = ["C", "A", "B"]; 15 | const UNSORT10: [&str; 10] = ["B", "F", "C", "A", "I", "J", "E", "D", "H", "G"]; 16 | 17 | c.bench_function("Empty: `contains`", |b| { 18 | b.iter(|| black_box(&EMPTY).contains(black_box(&"A"))) 19 | }); 20 | 21 | c.bench_function("Empty: `binary_search`", |b| { 22 | b.iter(|| black_box(&EMPTY).binary_search(black_box(&"A")).is_ok()) 23 | }); 24 | 25 | c.bench_function("1x: `contains`", |b| { 26 | b.iter(|| black_box(&SORT1).contains(black_box(&"A"))) 27 | }); 28 | 29 | c.bench_function("1x: `binary_search`", |b| { 30 | b.iter(|| black_box(&SORT1).binary_search(black_box(&"A")).is_ok()) 31 | }); 32 | 33 | c.bench_function("3 sorted: `contains`", |b| { 34 | b.iter(|| black_box(SORT3).contains(black_box(&"B"))) 35 | }); 36 | 37 | c.bench_function("3 sorted: `binary_search`", |b| { 38 | b.iter(|| black_box(SORT3).binary_search(black_box(&"B")).is_ok()) 39 | }); 40 | 41 | c.bench_function("10 sorted: `contains` early", |b| { 42 | b.iter(|| black_box(SORT10).contains(black_box(&"B"))) 43 | }); 44 | 45 | c.bench_function("10 sorted: `binary_search` early", |b| { 46 | b.iter(|| black_box(SORT10).binary_search(black_box(&"B")).is_ok()) 47 | }); 48 | 49 | c.bench_function("10 sorted: `contains` mid", |b| { 50 | b.iter(|| black_box(SORT10).contains(black_box(&"G"))) 51 | }); 52 | 53 | c.bench_function("10 sorted: `binary_search` mid", |b| { 54 | b.iter(|| black_box(SORT10).binary_search(black_box(&"G")).is_ok()) 55 | }); 56 | 57 | c.bench_function("10 sorted: 
`contains` late", |b| { 58 | b.iter(|| black_box(SORT10).contains(black_box(&"J"))) 59 | }); 60 | 61 | c.bench_function("10 sorted: `binary_search` late", |b| { 62 | b.iter(|| black_box(SORT10).binary_search(black_box(&"J")).is_ok()) 63 | }); 64 | 65 | c.bench_function("3 unsorted: `contains`", |b| { 66 | b.iter(|| black_box(SORT3).contains(black_box(&"B"))) 67 | }); 68 | 69 | c.bench_function("3 unsorted: `binary_search`", |b| { 70 | b.iter(|| { 71 | let mut arr = black_box(UNSORT3); 72 | arr.sort_unstable(); 73 | black_box(arr).binary_search(black_box(&"B")).is_ok() 74 | }) 75 | }); 76 | 77 | c.bench_function("10 unsorted: `contains`", |b| { 78 | b.iter(|| black_box(SORT10).contains(black_box(&"G"))) 79 | }); 80 | 81 | c.bench_function("10 unsorted: `binary_search`", |b| { 82 | b.iter(|| { 83 | let mut arr = black_box(UNSORT10); 84 | arr.sort_unstable(); 85 | black_box(arr).binary_search(black_box(&"G")).is_ok() 86 | }) 87 | }); 88 | } 89 | 90 | criterion_group!(slice_contains, benches,); 91 | criterion_main!(slice_contains); 92 | -------------------------------------------------------------------------------- /zspell/benches/word_splitter.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::incompatible_msrv)] 2 | 3 | use std::hint::black_box; 4 | 5 | use criterion::{criterion_group, criterion_main, Criterion}; 6 | use unicode_segmentation::UnicodeSegmentation; 7 | 8 | const TESTSTR: &str = "the quick brown. Fox Jum-ped -- where? 
'over' (the) very-lazy dog"; 9 | 10 | // These aren't fair comparisons because they don't return indices 11 | pub fn basic_splits(c: &mut Criterion) { 12 | c.bench_function("Split whitespace", |b| { 13 | b.iter(|| black_box(black_box(TESTSTR).split_whitespace().last().unwrap())) 14 | }); 15 | c.bench_function("Split ascii whitespace", |b| { 16 | b.iter(|| black_box(black_box(TESTSTR).split_ascii_whitespace().last().unwrap())) 17 | }); 18 | } 19 | 20 | pub fn segmentation(c: &mut Criterion) { 21 | c.bench_function("Simple segmentation", |b| { 22 | b.iter(|| { 23 | black_box( 24 | black_box(TESTSTR) 25 | .split_word_bound_indices() 26 | .last() 27 | .unwrap(), 28 | ) 29 | }) 30 | }); 31 | c.bench_function("Skip whitespace using all", |b| { 32 | b.iter(|| { 33 | black_box( 34 | black_box(TESTSTR) 35 | .split_word_bound_indices() 36 | .filter(|split| split.1.chars().all(|c| c.is_alphanumeric() || c == '-')) 37 | .last() 38 | .unwrap(), 39 | ) 40 | }) 41 | }); 42 | c.bench_function("Skip whitespace using first", |b| { 43 | b.iter(|| { 44 | black_box( 45 | black_box(TESTSTR) 46 | .split_word_bound_indices() 47 | .filter(|split| { 48 | let first = split.1.chars().next().unwrap(); 49 | first.is_alphanumeric() || first == '-' 50 | }) 51 | .last() 52 | .unwrap(), 53 | ) 54 | }) 55 | }); 56 | c.bench_function("Skip whitespace using first nohyphen", |b| { 57 | b.iter(|| { 58 | black_box( 59 | black_box(TESTSTR) 60 | .split_word_bound_indices() 61 | .filter(|split| split.1.chars().next().unwrap().is_alphanumeric()) 62 | .last() 63 | .unwrap(), 64 | ) 65 | }) 66 | }); 67 | } 68 | 69 | // pub fn segmentation_peek(c: &mut Criterion) { 70 | // c.bench_function("Skip whitespace using first", |b| { 71 | // b.iter(|| { 72 | // black_box( 73 | // black_box(TESTSTR) 74 | // .split_word_bound_indices() 75 | // .filter(|split| { 76 | // let first = split.1.chars().next().unwrap(); 77 | // first.is_alphanumeric() || first == '-' 78 | // }) 79 | // .last() 80 | // .unwrap(), 81 | // ) 82 | 
// }) 83 | // }); 84 | // } 85 | 86 | criterion_group!(word_splitter, basic_splits, segmentation); 87 | criterion_main!(word_splitter); 88 | -------------------------------------------------------------------------------- /zspell/build.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Write; 2 | use std::path::Path; 3 | use std::{env, fs}; 4 | 5 | use indoc::indoc; 6 | 7 | fn main() { 8 | update_tests(); 9 | emit_autocfg(); 10 | } 11 | 12 | const TEST_PREFIX: &str = "// autogenerated file, do not edit manually 13 | // one test is generated for each `.test` file 14 | 15 | "; 16 | 17 | /// Autogenerate an integration test for every `.test` file 18 | fn update_tests() { 19 | let root = Path::new(env!("CARGO_MANIFEST_DIR")); 20 | let out_path = Path::new(&env::var("OUT_DIR").unwrap()).join("auto_suite.rs"); 21 | let suite_dir = root.join("test-suite"); 22 | let test_paths = fs::read_dir(suite_dir).unwrap(); 23 | 24 | let mut to_write = TEST_PREFIX.to_owned(); 25 | let mut all_test_names = Vec::new(); 26 | 27 | for path in test_paths { 28 | let path = path.unwrap().path(); 29 | // let path_str = path.display(); 30 | let fname = path.file_name().unwrap().to_string_lossy(); 31 | let test_name = fname 32 | .strip_suffix(".test") 33 | .unwrap() 34 | .trim_start_matches(char::is_numeric) 35 | .trim_start_matches(['_', '-']) 36 | .replace('-', "_"); 37 | 38 | if all_test_names.contains(&test_name) { 39 | panic!("duplicate generated test name {test_name}"); 40 | } 41 | 42 | if test_name == "example" { 43 | continue; 44 | } 45 | 46 | write!( 47 | to_write, 48 | indoc! 
{" 49 | 50 | #[test] 51 | fn test_{test_name}() {{ 52 | let path = std::path::Path::new(env!(\"CARGO_MANIFEST_DIR\")); 53 | let path = path.join(\"test-suite/{fname}\"); 54 | let mgr = test_util::TestManager::new_from_file(path); 55 | let dict = mgr.build_dict(); 56 | mgr.check_all(&dict); 57 | }} 58 | "}, 59 | test_name = test_name, 60 | fname = fname, 61 | ) 62 | .unwrap(); 63 | 64 | all_test_names.push(test_name); 65 | } 66 | 67 | fs::write(out_path, to_write).unwrap(); 68 | } 69 | 70 | /// Add configuration that depends on rust version 71 | fn emit_autocfg() { 72 | const PROBE_BOX: &str = r#" || { 73 | let s = "foo".to_owned(); 74 | let _b: Box<[String]> = [s].as_slice().into(); 75 | } 76 | "#; 77 | 78 | let ac = autocfg::new(); 79 | 80 | // check if we have `Box<[T]>: From<&[T: Clone]>` loosened from `T: Copy` (1.71) 81 | ac.emit_expression_cfg(PROBE_BOX, "box_from_slice_has_clone_bound"); 82 | println!("cargo:rustc-check-cfg=cfg(box_from_slice_has_clone_bound)"); 83 | } 84 | -------------------------------------------------------------------------------- /zspell/src/affix/node.rs: -------------------------------------------------------------------------------- 1 | //! 
Parser representations of an affix file 2 | 3 | use super::ParsedRuleGroup; 4 | use crate::affix::{CompoundPattern, CompoundSyllable, Conversion, Encoding, FlagType, Phonetic}; 5 | 6 | /// A single line entry in an affix file 7 | /// 8 | /// We represent all flags as strings, and parse them later 9 | #[non_exhaustive] 10 | #[derive(Debug, PartialEq, Eq)] 11 | pub enum AffixNode { 12 | /* 13 | General ptions 14 | */ 15 | /// `SET` 16 | Encoding(Encoding), 17 | /// `FLAG` 18 | FlagType(FlagType), 19 | /// `COMPLEXPREFIXES` twofold prefix stripping 20 | ComplexPrefixes, 21 | /// `LANG` 22 | Language(String), 23 | /// `IGNORE` 24 | IgnoreChars(Vec), 25 | /// `AF` 26 | AffixAlias(Vec), 27 | /// `AM` 28 | MorphAlias(Vec), 29 | 30 | /* 31 | Suggestion Options 32 | */ 33 | /// `KEY` 34 | NeighborKeys(Vec), 35 | /// `TRY` 36 | TryCharacters(String), 37 | /// `NOSUGGEST` 38 | NoSuggestFlag(String), 39 | /// `MAXCPDSUGS` 40 | CompoundSugMax(u16), 41 | /// `MAXNGRAMSUGS` 42 | NGramSugMax(u16), 43 | /// `MAXDIFF` 44 | NGramDiffMax(u8), 45 | /// `ONLYMAXDIFF` 46 | NGramLimitToDiffMax, 47 | /// `NOSPLITSUGS` 48 | NoSplitSuggestions, 49 | /// `SUGSWITHDOTS` 50 | KeepTermDots, 51 | /// `REP` 52 | Replacement(Vec), 53 | /// `MAP` 54 | Mapping(Vec<(char, char)>), 55 | /// `PHONE` 56 | Phonetic(Vec), 57 | /// `WARN` 58 | WarnRareFlag(String), 59 | /* 60 | Compounding Options 61 | */ 62 | /// `FORBIDWARN` 63 | ForbidWarnWords, 64 | /// `BREAK` 65 | BreakSeparator(Vec), 66 | /// `COMPOUNDRULE` 67 | #[allow(dead_code)] 68 | CompoundRule(Vec), 69 | /// `COMPOUNDMIN` 70 | CompoundMinLen(u16), 71 | /// `COMPOUNDFLAG` 72 | CompoundFlag(String), 73 | /// `COMPOUNDBEGIN` 74 | CompoundBeginFlag(String), 75 | /// `COMPOUNDLAST` 76 | CompoundEndFlag(String), 77 | /// `COMPOUNDMIDDLE` 78 | CompoundMiddleFlag(String), 79 | /// `ONLYINCOMPOUND` 80 | CompoundOnlyFlag(String), 81 | /// `COMPOUNDPERMITFLAG` 82 | CompoundPermitFlag(String), 83 | /// `COMPOUNDFORBIDFLAG` 84 | CompoundForbidFlag(String), 
85 | /// `COMPOUNDMORESUFFIXES` 86 | CompoundMoreSuffixes, 87 | /// `COMPOUNDROOT` 88 | CompoundRootFlag(String), 89 | /// `COMPOUNDWORDMAX` 90 | CompoundWordMax(u16), 91 | /// `CHECKCOMPOUNDDUP` 92 | CompoundForbidDup, 93 | /// `CHECKCOMPOUNDREP` 94 | CompoundForbidRepeat, 95 | /// `CHECKCOMPOUNDCASE` 96 | CompoundCheckCase, 97 | /// `CHECKCOMPOUNDTRIPLE` 98 | CompoundCheckTriple, 99 | /// `SIMPLIFIEDTRIPLE` 100 | CompoundSimplifyTriple, 101 | /// `CHECKCOMPOUNDPATTERN` 102 | CompoundForbidPats(Vec), 103 | /// `FORCEUCASE` 104 | CompoundForceUpFlag(String), 105 | /// `COMPOUNDSYLLABLE` 106 | CompoundSyllable(CompoundSyllable), 107 | /// `SYLLABLENUM` 108 | SyllableNum(String), 109 | 110 | /* 111 | Affix Options 112 | */ 113 | /// `PFX` 114 | Prefix(ParsedRuleGroup), 115 | /// `SFX` 116 | Suffix(ParsedRuleGroup), 117 | 118 | /* 119 | Other options 120 | */ 121 | /// `CIRCUMFIX` 122 | AfxCircumfixFlag(String), 123 | /// `FORBIDDENWORD` 124 | ForbiddenWordFlag(String), 125 | /// `FULLSTRIP` 126 | AfxFullStrip, 127 | /// `KEEPCASE` 128 | AfxKeepCaseFlag(String), 129 | /// `ICONV` 130 | AfxInputConversion(Vec), 131 | /// `OCONV` 132 | AfxOutputConversion(Vec), 133 | /// `LEMMA_PRESENT` this flag is deprecated 134 | AfxLemmaPresentFlag(String), 135 | /// `NEEDAFFIX` 136 | AfxNeededFlag(String), 137 | /// `PSEUDOROOT` this flag is deprecated 138 | AfxPseudoRootFlag(String), 139 | /// `SUBSTANDARD` 140 | AfxSubstandardFlag(String), 141 | /// `WORDCHARS` 142 | AfxWordChars(String), 143 | /// `CHECKSHARPS` 144 | AfxCheckSharps, 145 | /// `#` line 146 | Comment, 147 | /// `NAME` 148 | Name(String), 149 | /// `HOME` 150 | HomePage(String), 151 | /// `VERSION` 152 | Version(String), 153 | } 154 | 155 | impl AffixNode { 156 | pub const fn name_str(&self) -> &'static str { 157 | match self { 158 | AffixNode::Encoding(_) => "SET", 159 | AffixNode::FlagType(_) => "FLAG", 160 | AffixNode::ComplexPrefixes => "COMPLEXPREFIXES", 161 | AffixNode::Language(_) => "LANG", 162 | 
AffixNode::IgnoreChars(_) => "IGNORE", 163 | AffixNode::AffixAlias(_) => "AF", 164 | AffixNode::MorphAlias(_) => "AM", 165 | AffixNode::NeighborKeys(_) => "KEY", 166 | AffixNode::TryCharacters(_) => "TRY", 167 | AffixNode::NoSuggestFlag(_) => "NOSUGGEST", 168 | AffixNode::CompoundSugMax(_) => "MAXCPDSUGS", 169 | AffixNode::NGramSugMax(_) => "MAXNGRAMSUGS", 170 | AffixNode::NGramDiffMax(_) => "MAXDIFF", 171 | AffixNode::NGramLimitToDiffMax => "ONLYMAXDIFF", 172 | AffixNode::NoSplitSuggestions => "NOSPLITSUGS", 173 | AffixNode::KeepTermDots => "SUGSWITHDOTS", 174 | AffixNode::Replacement(_) => "REP", 175 | AffixNode::Mapping(_) => "MAP", 176 | AffixNode::Phonetic(_) => "PHONE", 177 | AffixNode::WarnRareFlag(_) => "WARN", 178 | AffixNode::ForbidWarnWords => "FORBIDWARN", 179 | AffixNode::BreakSeparator(_) => "BREAK", 180 | AffixNode::CompoundRule(_) => "COMPOUNDRULE", 181 | AffixNode::CompoundMinLen(_) => "COMPOUNDMIN", 182 | AffixNode::CompoundFlag(_) => "COMPOUNDFLAG", 183 | AffixNode::CompoundBeginFlag(_) => "COMPOUNDBEGIN", 184 | AffixNode::CompoundEndFlag(_) => "COMPOUNDLAST", 185 | AffixNode::CompoundMiddleFlag(_) => "COMPOUNDMIDDLE", 186 | AffixNode::CompoundOnlyFlag(_) => "ONLYINCOMPOUND", 187 | AffixNode::CompoundPermitFlag(_) => "COMPOUNDPERMITFLAG", 188 | AffixNode::CompoundForbidFlag(_) => "COMPOUNDFORBIDFLAG", 189 | AffixNode::CompoundMoreSuffixes => "COMPOUNDMORESUFFIXES", 190 | AffixNode::CompoundRootFlag(_) => "COMPOUNDROOT", 191 | AffixNode::CompoundWordMax(_) => "COMPOUNDWORDMAX", 192 | AffixNode::CompoundForbidDup => "CHECKCOMPOUNDDUP", 193 | AffixNode::CompoundForbidRepeat => "CHECKCOMPOUNDREP", 194 | AffixNode::CompoundCheckCase => "CHECKCOMPOUNDCASE", 195 | AffixNode::CompoundCheckTriple => "CHECKCOMPOUNDTRIPLE", 196 | AffixNode::CompoundSimplifyTriple => "SIMPLIFIEDTRIPLE", 197 | AffixNode::CompoundForbidPats(_) => "CHECKCOMPOUNDPATTERN", 198 | AffixNode::CompoundForceUpFlag(_) => "FORCEUCASE", 199 | AffixNode::CompoundSyllable(_) => 
"COMPOUNDSYLLABLE", 200 | AffixNode::SyllableNum(_) => "SYLLABLENUM", 201 | AffixNode::Prefix(_) => "PFX", 202 | AffixNode::Suffix(_) => "SFX", 203 | AffixNode::AfxCircumfixFlag(_) => "CIRCUMFIX", 204 | AffixNode::ForbiddenWordFlag(_) => "FORBIDDENWORD", 205 | AffixNode::AfxFullStrip => "FULLSTRIP", 206 | AffixNode::AfxKeepCaseFlag(_) => "KEEPCASE", 207 | AffixNode::AfxInputConversion(_) => "ICONV", 208 | AffixNode::AfxOutputConversion(_) => "OCONV", 209 | AffixNode::AfxLemmaPresentFlag(_) => "LEMMA_PRESENT", 210 | AffixNode::AfxNeededFlag(_) => "NEEDAFFIX", 211 | AffixNode::AfxPseudoRootFlag(_) => "PSEUDOROOT", 212 | AffixNode::AfxSubstandardFlag(_) => "SUBSTANDARD", 213 | AffixNode::AfxWordChars(_) => "WORDCHARS", 214 | AffixNode::AfxCheckSharps => "CHECKSHARPS", 215 | AffixNode::Comment => "#", 216 | AffixNode::Name(_) => "NAME", 217 | AffixNode::HomePage(_) => "HOME", 218 | AffixNode::Version(_) => "VERSION", 219 | } 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /zspell/src/affix/rule.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | 3 | use super::RuleType; 4 | use crate::error::ParseErrorKind; 5 | use crate::helpers::{compile_re_pattern, ReWrapper}; 6 | use crate::morph::MorphInfo; 7 | use crate::Error; 8 | 9 | /// A simple prefix or suffix rule 10 | /// 11 | /// This struct represents a prefix or suffix option that may be applied to any 12 | /// base word. It contains multiple possible rule definitions that describe how 13 | /// to apply the rule. 
14 | #[derive(Clone, Debug, PartialEq, Eq)] 15 | pub struct ParsedRuleGroup { 16 | /// Character identifier for this specific affix, usually any uppercase 17 | /// letter 18 | pub(crate) flag: String, 19 | /// Prefix or suffix 20 | pub(crate) kind: RuleType, 21 | /// Whether or not this can be combined with the opposite affix 22 | pub(crate) can_combine: bool, 23 | /// Actual rules for replacing 24 | pub(crate) rules: Vec, 25 | } 26 | 27 | #[derive(Clone, Debug, PartialEq, Eq)] 28 | pub struct ParsedRule { 29 | /// Affix to be added 30 | pub(crate) affix: String, 31 | /// Characters to remove from the beginning or end 32 | pub(crate) strip: Option>, 33 | /// Regex-based rule for when this rule is true. `None` indicates `.`, i.e., 34 | /// always true 35 | pub(crate) condition: Option, 36 | /// Morphological information 37 | pub(crate) morph_info: Vec>, 38 | } 39 | 40 | impl ParsedRule { 41 | #[allow(unused)] 42 | pub(crate) fn new( 43 | kind: RuleType, 44 | affix: &str, 45 | strip: Option<&str>, 46 | condition: Option<&str>, 47 | morph_info: Vec>, 48 | ) -> Result { 49 | let cond_re = match condition { 50 | Some(c) => compile_re_pattern(c, kind)?, 51 | None => None, 52 | }; 53 | 54 | Ok(Self { 55 | strip: strip.map(Into::into), 56 | affix: affix.to_owned(), 57 | condition: cond_re, 58 | morph_info, 59 | }) 60 | } 61 | 62 | /// Same as `new` but don't modify the regex string 63 | #[allow(unused)] 64 | pub(crate) fn new_raw_re( 65 | kind: RuleType, 66 | affix: &str, 67 | strip: Option<&str>, 68 | condition: Option<&str>, 69 | morph_info: Vec>, 70 | ) -> Result { 71 | let cond_re = match condition { 72 | Some(c) => Some(ReWrapper::new(c)?), 73 | None => None, 74 | }; 75 | 76 | Ok(Self { 77 | strip: strip.map(Into::into), 78 | affix: affix.to_owned(), 79 | condition: cond_re, 80 | morph_info, 81 | }) 82 | } 83 | 84 | /// Create from the information we have available during parse 85 | pub(crate) fn new_parse( 86 | kind: RuleType, 87 | affix: &str, 88 | strip: &str, 89 | 
condition: &str, 90 | morph_info: Vec>, 91 | ) -> Result { 92 | let cond_re = compile_re_pattern(condition, kind)?; 93 | let strip_chars = if strip == "0" { 94 | None 95 | } else { 96 | Some(strip.into()) 97 | }; 98 | 99 | Ok(Self { 100 | strip: strip_chars, 101 | affix: affix.to_owned(), 102 | condition: cond_re, 103 | morph_info, 104 | }) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /zspell/src/affix/tests.rs: -------------------------------------------------------------------------------- 1 | //! Affix tests 2 | 3 | use super::*; 4 | 5 | #[test] 6 | fn test_flagtype_convert_ok() { 7 | assert_eq!(FlagType::Ascii.str_to_flag("T"), Ok(Flag(84))); 8 | } 9 | -------------------------------------------------------------------------------- /zspell/src/affix/tests_parse.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | use pretty_assertions::assert_eq; 4 | use test_util::workspace_root; 5 | 6 | use super::*; 7 | use crate::affix::PartOfSpeech; 8 | use crate::error::Span; 9 | 10 | #[test] 11 | fn test_line_splitter_none() { 12 | let s = "no key here # abcd"; 13 | assert_eq!(line_splitter(s, "KEY"), None); 14 | } 15 | 16 | #[test] 17 | fn test_line_splitter_some() { 18 | let s1 = "KEY key here\nnext line"; 19 | let s2 = "KEY key here# comment"; 20 | let s3 = "KEY key here\rnext line"; 21 | let s4 = "# comment here\n#next line"; 22 | assert_eq!(line_splitter(s1, "KEY"), Some(("key here", "\nnext line"))); 23 | assert_eq!(line_splitter(s2, "KEY"), Some(("key here", "# comment"))); 24 | assert_eq!(line_splitter(s3, "KEY"), Some(("key here", "\rnext line"))); 25 | assert_eq!( 26 | line_splitter(s4, "#"), 27 | Some(("comment here", "\n#next line")) 28 | ); 29 | } 30 | 31 | #[test] 32 | fn test_line_key_parser_none() { 33 | let s = "no key here # abcd"; 34 | assert_eq!( 35 | line_key_parser(s, "KEY", |_| Ok(AffixNode::Comment)), 36 | Ok(None) 37 | ); 38 | } 39 | 
40 | #[test] 41 | fn test_line_key_parser_some() { 42 | let s = "KEY key here\nnext line"; 43 | assert_eq!( 44 | line_key_parser(s, "KEY", |_| Ok(AffixNode::Comment)), 45 | Ok(Some((AffixNode::Comment, "\nnext line", 0))) 46 | ); 47 | } 48 | 49 | #[test] 50 | fn test_line_key_parser_err() { 51 | let s = "KEY key here\nnext line"; 52 | let e = ParseError::new_nospan(ParseErrorKind::Boolean, ""); 53 | assert_eq!(line_key_parser(s, "KEY", |_| Err(e.clone())), Err(e)); 54 | } 55 | 56 | #[test] 57 | fn test_line_key_parser() { 58 | let err = ParseError::new_nospan(ParseErrorKind::Boolean, ""); 59 | let get_lang = |s: &str| { 60 | if s == "apple" { 61 | Ok(AffixNode::Language("apple".to_owned())) 62 | } else { 63 | Err(err.clone()) 64 | } 65 | }; 66 | 67 | let txt1 = "LANG apple"; 68 | let txt2 = "LANG apple\nLANG banana"; 69 | let txt3 = "LANG failure"; 70 | 71 | assert_eq!( 72 | line_key_parser(txt1, "LANG", get_lang), 73 | Ok(Some((AffixNode::Language("apple".to_owned()), "", 0))) 74 | ); 75 | assert_eq!( 76 | line_key_parser(txt2, "LANG", get_lang), 77 | Ok(Some(( 78 | AffixNode::Language("apple".to_owned()), 79 | "\nLANG banana", 80 | 0 81 | ))) 82 | ); 83 | assert_eq!(line_key_parser(txt3, "LANG", get_lang), Err(err)); 84 | } 85 | 86 | #[test] 87 | fn test_parse_neighbor_keys() { 88 | let s = "KEY abc|def|ghi # end"; 89 | let res = parse_neighbor_keys(s); 90 | assert_eq!( 91 | res, 92 | Ok(Some(( 93 | AffixNode::NeighborKeys(vec!["abc".to_owned(), "def".to_owned(), "ghi".to_owned()]), 94 | "# end", 95 | 0 96 | ))) 97 | ); 98 | } 99 | 100 | #[test] 101 | fn test_bool_parser_ok() { 102 | let s = "COMPLEXPREFIXES\nmore stuff"; 103 | let res = parse_complex_prefixes(s); 104 | assert_eq!( 105 | res, 106 | Ok(Some((AffixNode::ComplexPrefixes, "\nmore stuff", 0))) 107 | ); 108 | } 109 | 110 | #[test] 111 | fn test_bool_parser_err() { 112 | let s = "COMPLEXPREFIXES unneeded things\nmore stuff"; 113 | let res = parse_complex_prefixes(s); 114 | assert!(res.is_err()); 115 | } 
116 | 117 | #[test] 118 | fn test_munch_newline_some() { 119 | let s1 = " \nabc"; 120 | let s2 = "\n"; 121 | assert_eq!(munch_newline(s1), Ok(Some("abc"))); 122 | assert_eq!(munch_newline(s2), Ok(Some(""))); 123 | } 124 | 125 | #[test] 126 | fn test_munch_newline_none() { 127 | let s = " "; 128 | assert_eq!(munch_newline(s), Ok(None)); 129 | } 130 | 131 | #[test] 132 | fn test_munch_newline_cmt() { 133 | let s = " # abcd \nresid"; 134 | assert_eq!(munch_newline(s), Ok(Some("resid"))); 135 | } 136 | 137 | #[test] 138 | fn test_munch_newline_err() { 139 | let s = " abcd \nresid"; 140 | assert!(munch_newline(s).is_err()); 141 | } 142 | 143 | #[test] 144 | fn test_table_parser_ok() { 145 | let s = "REP 3\nREP a b\nREP c d\nREP longer val"; 146 | let expected = AffixNode::Replacement(vec![ 147 | Conversion::new("a", "b", false), 148 | Conversion::new("c", "d", false), 149 | Conversion::new("longer", "val", false), 150 | ]); 151 | assert_eq!(parse_replacement(s), Ok(Some((expected, "", 3)))); 152 | } 153 | 154 | #[test] 155 | fn test_afx_table_parser_err() { 156 | // check line offset count 157 | let s = "PFX A N 2\nPFX 10 a b x .\nPFX A 0 c a"; 158 | let res = parse_prefix(s); 159 | assert_eq!(res.unwrap_err().span().unwrap(), &Span::new(1, 0)); 160 | } 161 | 162 | const SAMPLE_AFX_OK: &str = r#" 163 | SET UTF-8 164 | TRY abcd' 165 | # comment 166 | ICONV 2 # comment 167 | ICONV a b # comment 168 | ICONV ' " 169 | NOSUGGEST X 170 | ONLYINCOMPOUND C 171 | WORDCHARS 01234 172 | # comment 173 | PFX A N 2 174 | PFX A 0 ar . po:verb st:foot is:ay other:foo otherfoo po:xyz 175 | PFX A 0 br a 176 | 177 | SFX B Y 2 178 | SFX B 0 ar . 
179 | SFX B 0 br [^a] 180 | 181 | REP 2 182 | REP a b 183 | REP abcd 123 184 | 185 | PHONE 1 186 | PHONE abcd 1234 187 | "#; 188 | 189 | #[test] 190 | fn test_full_parse() { 191 | let expected = vec![ 192 | AffixNode::Encoding(Encoding::Utf8), 193 | AffixNode::TryCharacters("abcd'".into()), 194 | AffixNode::Comment, 195 | AffixNode::AfxInputConversion(vec![ 196 | Conversion::new("a", "b", false), 197 | Conversion::new("'", "\"", false), 198 | ]), 199 | AffixNode::NoSuggestFlag("X".into()), 200 | AffixNode::CompoundOnlyFlag("C".into()), 201 | AffixNode::AfxWordChars("01234".into()), 202 | AffixNode::Comment, 203 | AffixNode::Prefix(ParsedRuleGroup { 204 | flag: "A".to_owned(), 205 | kind: RuleType::Prefix, 206 | can_combine: false, 207 | rules: vec![ 208 | ParsedRule::new_raw_re( 209 | RuleType::Prefix, 210 | "ar", 211 | None, 212 | None, 213 | vec![ 214 | MorphInfo::Part(PartOfSpeech::Verb).into(), 215 | MorphInfo::Stem("foot".into()).into(), 216 | MorphInfo::InflecSfx("ay".into()).into(), 217 | MorphInfo::Other("other:foo".into()).into(), 218 | MorphInfo::Other("otherfoo".into()).into(), 219 | MorphInfo::Part(PartOfSpeech::Other("xyz".into())).into(), 220 | ], 221 | ) 222 | .unwrap(), 223 | ParsedRule::new_raw_re(RuleType::Prefix, "br", None, Some("^a.*$"), Vec::new()) 224 | .unwrap(), 225 | ], 226 | }), 227 | AffixNode::Suffix(ParsedRuleGroup { 228 | flag: "B".to_owned(), 229 | kind: RuleType::Suffix, 230 | can_combine: true, 231 | rules: vec![ 232 | ParsedRule::new_raw_re(RuleType::Suffix, "ar", None, None, Vec::new()).unwrap(), 233 | ParsedRule::new_raw_re(RuleType::Suffix, "br", None, Some("^.*[^a]$"), Vec::new()) 234 | .unwrap(), 235 | ], 236 | }), 237 | AffixNode::Replacement(vec![ 238 | Conversion::new("a", "b", false), 239 | Conversion::new("abcd", "123", false), 240 | ]), 241 | AffixNode::Phonetic(vec![Phonetic::new("abcd", "1234")]), 242 | ]; 243 | 244 | assert_eq!(affix_from_str(SAMPLE_AFX_OK), Ok(expected)); 245 | } 246 | 247 | #[test] 248 | fn 
test_large_file_parse() { 249 | let mut aff_path = workspace_root(); 250 | aff_path.push("dictionaries"); 251 | aff_path.push("en_US.aff"); 252 | 253 | let Ok(aff_content) = fs::read_to_string(aff_path) else { 254 | eprintln!("skipping large test flies; not found"); 255 | return; 256 | }; 257 | 258 | assert!(affix_from_str(&aff_content).is_ok()); 259 | } 260 | -------------------------------------------------------------------------------- /zspell/src/dict/flags.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::{self, Display}; 2 | use std::sync::Arc; 3 | 4 | use super::rule::AfxRule; 5 | 6 | /// A flag representation is either an ASCII char, unicode char, or number. We can fit 7 | /// any of those in a u32. 8 | #[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] 9 | pub struct Flag(pub u32); 10 | 11 | impl Flag { 12 | pub fn new_ascii(ch: u8) -> Self { 13 | debug_assert!(ch.is_ascii()); 14 | Self(ch.into()) 15 | } 16 | 17 | pub fn new_utf8(ch: char) -> Self { 18 | Self(ch.into()) 19 | } 20 | 21 | /// Must be a 2-character string 22 | pub fn new_long(s: &str) -> Self { 23 | debug_assert!(s.len() == 2, "invalid string length: {s}"); 24 | debug_assert!( 25 | s.chars().all(|ch| ch.is_ascii()), 26 | "invalid string characters: {s}" 27 | ); 28 | 29 | let num = u16::from_le_bytes(s[..=1].as_bytes().try_into().unwrap()); 30 | 31 | Self(num.into()) 32 | } 33 | 34 | pub fn new_number(num: u32) -> Self { 35 | Self(num) 36 | } 37 | } 38 | 39 | impl fmt::Debug for Flag { 40 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 41 | if let Ok(single_flag) = u8::try_from(self.0) { 42 | write!(f, "{}", char::from(single_flag)) 43 | } else if let Ok(long_flag) = u16::try_from(self.0) { 44 | let [a, b] = long_flag.to_le_bytes(); 45 | write!(f, "{}{}", char::from(a), char::from(b)) 46 | } else { 47 | write!(f, "{:#06x}", self.0) 48 | } 49 | } 50 | } 51 | 52 | /// A representation of a flag value 53 | 
#[non_exhaustive] 54 | #[derive(Debug, Clone, Hash, PartialEq, Eq)] 55 | pub enum FlagValue { 56 | // LemmaPresent and PseudoRoot are missing as they are deprecated 57 | AfxCircumfix, 58 | AfxKeepCase, 59 | AfxNeeded, 60 | AfxPseudoRoot, 61 | AfxSubstandard, 62 | Compound, 63 | CompoundBegin, 64 | CompoundEnd, 65 | CompoundForbid, 66 | CompoundForceUp, 67 | CompoundMiddle, 68 | CompoundOnly, 69 | CompoundPermit, 70 | CompoundRoot, 71 | ForbiddenWord, 72 | NoSuggest, 73 | WarnRare, 74 | /// Special case 75 | Rule(Arc), 76 | } 77 | 78 | impl Display for FlagValue { 79 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 80 | match self { 81 | FlagValue::AfxCircumfix => write!(f, "AfxCircumfix"), 82 | FlagValue::AfxKeepCase => write!(f, "AfxKeepCase"), 83 | FlagValue::AfxNeeded => write!(f, "AfxNeeded"), 84 | FlagValue::AfxPseudoRoot => write!(f, "AfxPseudoRoot"), 85 | FlagValue::AfxSubstandard => write!(f, "AfxSubstandard"), 86 | FlagValue::Compound => write!(f, "Compound"), 87 | FlagValue::CompoundBegin => write!(f, "CompoundBegin"), 88 | FlagValue::CompoundEnd => write!(f, "CompoundEnd"), 89 | FlagValue::CompoundForbid => write!(f, "CompoundForbid"), 90 | FlagValue::CompoundForceUp => write!(f, "CompoundForceUp"), 91 | FlagValue::CompoundMiddle => write!(f, "CompoundMiddle"), 92 | FlagValue::CompoundOnly => write!(f, "CompoundOnly"), 93 | FlagValue::CompoundPermit => write!(f, "CompoundPermit"), 94 | FlagValue::CompoundRoot => write!(f, "CompoundRoot"), 95 | FlagValue::ForbiddenWord => write!(f, "ForbiddenWord"), 96 | FlagValue::NoSuggest => write!(f, "NoSuggest"), 97 | FlagValue::WarnRare => write!(f, "WarnRare"), 98 | FlagValue::Rule(_) => write!(f, "Rule"), 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /zspell/src/dict/meta.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Borrow; 2 | use std::sync::Arc; 3 | 4 | use 
super::rule::AfxRule; 5 | use crate::morph::MorphInfo; 6 | 7 | /// Additional information attached to an entry in a dictionary 8 | /// 9 | /// Cheaply cloneable 10 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 11 | pub struct Meta { 12 | stem: Arc, 13 | source: Source, 14 | } 15 | 16 | impl Meta { 17 | pub(crate) fn new(stem_rc: Arc, source: Source) -> Self { 18 | Self { 19 | stem: stem_rc, 20 | source, 21 | } 22 | } 23 | 24 | /// Return the stem of a word. Prefers the stem from the morph info if it is available 25 | pub fn stem(&self) -> &str { 26 | // If we have a dictionary source, check if we have a stem-type `MorphInfo` 27 | // and return it 28 | if let Source::Dict(morphvec) = &self.source { 29 | if let Some(stem) = morphvec.iter().find_map(|morph| { 30 | if let MorphInfo::Stem(st) = morph.borrow() { 31 | Some(st) 32 | } else { 33 | None 34 | } 35 | }) { 36 | return stem.as_ref(); 37 | } 38 | } 39 | 40 | &self.stem 41 | } 42 | 43 | pub fn source(&self) -> &Source { 44 | &self.source 45 | } 46 | } 47 | 48 | /// Source information 49 | #[allow(clippy::box_collection)] 50 | #[non_exhaustive] 51 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 52 | pub enum Source { 53 | /// This meta came from an affix and has a full affix rule 54 | Affix { 55 | /// The full rule that created this 56 | rule: Arc, 57 | /// Index of the relevant pattern within the rule. 
This could potentially be a reference 58 | /// but that might require a `RefCell`, and I don't want to risk reference 59 | pat_idx: usize, 60 | }, 61 | /// This meta came from a .dic file, only contains morphinfo 62 | Dict(Arc<[Arc]>), 63 | /// This meta came from the personal dictionary 64 | Personal(Arc), 65 | /// The source is a raw text file with no additional metadata 66 | Raw, 67 | } 68 | 69 | impl Source { 70 | /// Iterate through all morph info available 71 | pub fn morphs(&self) -> impl Iterator { 72 | match self { 73 | Source::Affix { rule, pat_idx } => rule.patterns()[*pat_idx].morph_info(), 74 | Source::Dict(v) => v.as_ref(), 75 | Source::Personal(v) => v.morph.as_ref(), 76 | Source::Raw => &[], 77 | } 78 | .iter() 79 | .map(AsRef::as_ref) 80 | } 81 | 82 | /// Helper to create an `Affix` source when the `Arc` already exists 83 | pub(crate) fn new_affix(rule: &Arc, pat_idx: usize) -> Self { 84 | Self::Affix { 85 | rule: Arc::clone(rule), 86 | pat_idx, 87 | } 88 | } 89 | } 90 | 91 | /// Representation of meta info for a personal dictionary 92 | #[derive(Debug, PartialEq, Eq, Hash)] 93 | pub struct PersonalMeta { 94 | friend: Option>, 95 | morph: Vec>, 96 | } 97 | 98 | impl PersonalMeta { 99 | pub fn new(friend: Option>, morph: Vec>) -> Self { 100 | Self { friend, morph } 101 | } 102 | } 103 | 104 | #[cfg(test)] 105 | #[allow(unused)] 106 | mod tests { 107 | use std::collections::hash_map::DefaultHasher; 108 | use std::hash::{Hash, Hasher}; 109 | 110 | use super::*; 111 | 112 | fn calculate_hash(t: &T) -> u64 { 113 | let mut s = DefaultHasher::new(); 114 | t.hash(&mut s); 115 | s.finish() 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /zspell/src/dict/parse.rs: -------------------------------------------------------------------------------- 1 | //! 
Parse a dict file 2 | 3 | use std::sync::Arc; 4 | 5 | use super::Flag; 6 | use crate::affix::FlagType; 7 | use crate::error::ParseError; 8 | use crate::helpers::convertu32; 9 | use crate::morph::MorphInfo; 10 | 11 | /// Represent a single line in a dictionary file 12 | /// 13 | /// Format is as follows: 14 | /// 15 | /// ```text 16 | /// word[/flags...] [morphinfo ...] 17 | /// band/ESGD po:noun 18 | /// laser/M 19 | /// fruit 20 | /// ``` 21 | /// Flags and morph info are optional 22 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 23 | pub struct DictEntry { 24 | pub(super) stem: Arc, 25 | pub(super) flags: Vec, 26 | pub(super) morph: Vec>, 27 | } 28 | 29 | impl DictEntry { 30 | /// Test config: create a new `DictEntry` 31 | #[cfg(test)] 32 | pub(crate) fn new(stem: &str, flags: &[Flag], morph: &[MorphInfo]) -> Self { 33 | Self { 34 | stem: stem.into(), 35 | flags: flags.to_owned(), 36 | morph: morph.iter().map(|v| Arc::new(v.clone())).collect(), 37 | } 38 | } 39 | 40 | /// Create a `DictEntry` from a single line in a `.dic` file. Does not strip comments. 41 | fn parse_single(value: &str, flag_type: FlagType, line_num: u32) -> Result { 42 | let (stem, flagstr, morphstr) = separate_into_parts(value); 43 | 44 | let flags: Vec = match flagstr { 45 | Some(s) => flag_type 46 | .parse_str(s.trim()) 47 | .map_err(|e| ParseError::new_nocol(e, s, line_num))?, 48 | None => Vec::new(), 49 | }; 50 | let morph = MorphInfo::many_from_str(morphstr.trim()) 51 | .map(Arc::new) 52 | .collect(); 53 | let ret = Self { 54 | stem: stem.trim().into(), 55 | flags, 56 | morph, 57 | }; 58 | Ok(ret) 59 | } 60 | 61 | /// Parse a complete dictionary file (usually `.dic`) 62 | /// 63 | /// # Errors 64 | /// 65 | /// Returns an error if any entry is incorrect. 
66 | #[inline] 67 | #[allow(clippy::option_if_let_else)] 68 | pub fn parse_all(input: &str, flag_type: FlagType) -> Result, ParseError> { 69 | // Ignore empty lines and 70 | let mut lines_iter = extract_content(input); 71 | let lines_backup = lines_iter.clone(); 72 | 73 | let Some(first) = lines_iter.next() else { 74 | return Ok(Vec::new()); 75 | }; 76 | 77 | // Try to parse the first line as an integer; if not, ignore it 78 | let (mut ret, start) = if let Ok(cap) = first.parse::() { 79 | (Vec::with_capacity(cap), 2) 80 | } else { 81 | lines_iter = lines_backup; 82 | (Vec::new(), 1) 83 | }; 84 | 85 | for (i, line) in lines_iter.enumerate() { 86 | ret.push( 87 | DictEntry::parse_single(line, flag_type, convertu32(i + start)) 88 | .map_err(|e| e.add_offset_ret(i + start, 0))?, 89 | ); 90 | } 91 | Ok(ret) 92 | } 93 | } 94 | 95 | /// Represent an entry from a personal dictionary 96 | /// 97 | /// Format is as follows: 98 | /// 99 | /// ```text 100 | /// [*]word[/friend] [morphinfo ...] 101 | /// enum/apple po:noun 102 | /// someword 103 | /// *ignoreword 104 | /// ``` 105 | /// 106 | /// The hunspell spec doesn't say anything about morph info, but why not allow 107 | /// it 108 | #[derive(Clone, Debug, PartialEq, Eq, Hash)] 109 | pub struct PersonalEntry { 110 | pub stem: Arc, 111 | /// Reference to a main word in the dictionary that this word should inherit 112 | /// its metadata (stemming, affixes, etc) from 113 | pub friend: Option>, 114 | pub morph: Vec, 115 | pub forbid: bool, 116 | } 117 | 118 | impl PersonalEntry { 119 | #[cfg(test)] 120 | pub(crate) fn new( 121 | stem: &str, 122 | friend: Option<&str>, 123 | morph: Vec, 124 | forbid: bool, 125 | ) -> Self { 126 | Self { 127 | stem: stem.into(), 128 | friend: friend.map(Into::into), 129 | morph, 130 | forbid, 131 | } 132 | } 133 | 134 | pub fn parse_single(value: &str) -> Self { 135 | let (stem, friend, morphstr) = separate_into_parts(value); 136 | let forbid = stem.starts_with('*'); 137 | let stem = 
/// Separate `(stem, flagstr, morphstr)` into parts
fn separate_into_parts(value: &str) -> (&str, Option<&str>, &str) {
    // Discard anything after a `#` comment marker.
    let content = match value.split_once('#') {
        Some((before, _)) => before,
        None => value,
    };

    if let Some((word, tail)) = content.split_once('/') {
        // Easy case: a `/` marks the flags. The first whitespace after it
        // separates the flags from the morph info.
        let (flags, morphs) = match tail.split_once(|ch: char| ch.is_ascii_whitespace()) {
            Some(pair) => pair,
            None => (tail, ""),
        };
        (word, Some(flags), morphs)
    } else {
        // Trickier case: a `:` hints at morph info; the last whitespace before
        // it is assumed to end the stem. No colon (or no whitespace before it)
        // means the whole line is the stem.
        let ws_split = content
            .find(':')
            .and_then(|colon| content[..colon].rfind(|ch: char| ch.is_ascii_whitespace()));
        match ws_split {
            Some(ws_idx) => (&content[..ws_idx], None, &content[ws_idx..]),
            None => (content, None, ""),
        }
    }
}

/// Extract nonempty lines that do not contain a comment
fn extract_content(input: &str) -> impl Iterator<Item = &str> + Clone {
    input
        .lines()
        // Dictionary files sometimes use tabs for comments, need to check before trim
        .filter(|line| !line.starts_with('\t'))
        // Trim hash comments
        .map(|line| match line.split_once('#') {
            Some((before, _)) => before,
            None => line,
        })
        .map(str::trim)
        .filter(|line| !line.is_empty())
}
46 | /// Needs construction wherever the Arc target is 47 | // PERF: bench with & without vec reference instead of output 48 | pub fn from_parsed_group(_cfg: &ParsedCfg, group: &ParsedRuleGroup) -> Self { 49 | let mut ret = Self { 50 | kind: group.kind, 51 | can_combine: group.can_combine, 52 | patterns: Vec::with_capacity(group.rules.len()), 53 | }; 54 | 55 | for rule in &group.rules { 56 | let morph_info: Vec> = rule.morph_info.clone(); 57 | 58 | ret.patterns.push(AfxRulePattern { 59 | affix: rule.affix.as_str().into(), 60 | condition: rule.condition.clone(), 61 | // FIXME: `rule.strip.as_ref().map(Arc::clone)` is more accurate, but flagged by 62 | // clippy 63 | strip: rule.strip.clone(), 64 | morph_info, 65 | }); 66 | } 67 | 68 | ret 69 | } 70 | 71 | pub fn is_pfx(&self) -> bool { 72 | self.kind == RuleType::Prefix 73 | } 74 | 75 | pub fn is_sfx(&self) -> bool { 76 | self.kind == RuleType::Prefix 77 | } 78 | 79 | pub fn can_combine(&self) -> bool { 80 | self.can_combine 81 | } 82 | 83 | /// Apply this rules patterns. Returns an iterator over the index of the 84 | /// pattern and the resulting string 85 | pub fn apply_patterns<'a>( 86 | &'a self, 87 | stem: &'a str, 88 | ) -> impl Iterator + 'a { 89 | self.patterns 90 | .iter() 91 | .enumerate() 92 | .filter_map(|(idx, pat)| pat.apply_pattern(stem, self.kind).map(|s| (idx, s))) 93 | } 94 | 95 | pub(crate) fn patterns(&self) -> &[AfxRulePattern] { 96 | &self.patterns 97 | } 98 | } 99 | 100 | /// A single affix rule application 101 | #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)] 102 | pub struct AfxRulePattern { 103 | affix: Box, 104 | /// Condition to be met to apply this rule. 
105 | condition: Option, 106 | /// Characters to strip 107 | strip: Option>, 108 | /// Associated morph info 109 | morph_info: Vec>, 110 | } 111 | 112 | impl AfxRulePattern { 113 | /// New with a specified affix, otherwise default values 114 | #[cfg(test)] 115 | pub fn new(afx: &str, strip: Option<&str>) -> Self { 116 | Self { 117 | affix: afx.into(), 118 | condition: None, 119 | strip: strip.map(Into::into), 120 | morph_info: Vec::new(), 121 | } 122 | } 123 | 124 | /// Helper for testing, sets the condition based on a kind 125 | #[cfg(test)] 126 | pub fn set_pattern(&mut self, condition: &str, kind: RuleType) -> Result<(), regex::Error> { 127 | self.condition = crate::helpers::compile_re_pattern(condition, kind)?; 128 | Ok(()) 129 | } 130 | 131 | /// Check whether a condition is applicable 132 | #[allow(clippy::option_if_let_else)] 133 | pub fn check_condition(&self, s: &str) -> bool { 134 | match &self.condition { 135 | Some(re) => re.is_match(s), 136 | None => true, 137 | } 138 | } 139 | 140 | pub(crate) fn morph_info(&self) -> &[Arc] { 141 | &self.morph_info 142 | } 143 | 144 | // Verify the match condition and apply this rule 145 | #[allow(clippy::option_if_let_else)] 146 | fn apply_pattern(&self, s: &str, kind: RuleType) -> Option { 147 | // No return if condition doesn't match 148 | if !self.check_condition(s) { 149 | return None; 150 | } 151 | 152 | match kind { 153 | RuleType::Prefix => { 154 | // If stripping chars exist, strip them from the prefix 155 | let mut working: String = self.affix.as_ref().into(); 156 | 157 | if let Some(sc) = &self.strip { 158 | working.push_str(s.strip_prefix(sc.as_ref()).unwrap_or(s)); 159 | } else { 160 | working.push_str(s); 161 | } 162 | working.shrink_to_fit(); 163 | Some(working) 164 | } 165 | RuleType::Suffix => { 166 | // Same logic as above 167 | let mut working = if let Some(sc) = &self.strip { 168 | s.strip_suffix(sc.as_ref()).unwrap_or(s).to_owned() 169 | } else { 170 | s.to_owned() 171 | }; 172 | 
//! Take rules and apply them to a word, trying to find a match in an
//! existing wordlist.
//!
//! NOTE(review): every function here is an unimplemented placeholder for a
//! reverse (affix-stripping) lookup strategy.
#![allow(unused)]

use crate::affix::CompoundConfig;
use crate::Dictionary;

/// Try to create a word
///
/// Intended entry point for the reverse-rule check; not yet implemented.
fn entrypoint(dict: &Dictionary, word: &str) -> bool {
    todo!()
}

/// Placeholder: strip candidate prefixes from a word.
fn try_strip_pfx() {
    todo!()
}

/// Placeholder: strip candidate suffixes from a word. Currently a no-op.
fn try_strip_sfx() {}

/// Try splitting the word at each position and testing the parts according to
/// compound rules
fn compound_thing(cfg: &CompoundConfig) {}
Tests for a dict file 2 | 3 | use std::fs; 4 | 5 | use indoc::indoc; 6 | use pretty_assertions::assert_eq; 7 | use test_util::workspace_root; 8 | 9 | use super::*; 10 | 11 | #[test] 12 | fn test_update_personal() { 13 | let personal_str = r" 14 | abcd po:verb 15 | efgh st:something 16 | *ijkl 17 | mnop 18 | qrst 19 | uvwx st:something 20 | *yz12 po:verb 21 | 3456 22 | "; 23 | 24 | let mut d = Dictionary::new(ParsedCfg::default()).unwrap(); 25 | d.parse_update_personal(personal_str, &[]).unwrap(); 26 | assert!(d.wordlist.0.contains_key("abcd")); 27 | assert!(d.wordlist.0.contains_key("efgh")); 28 | assert!(!d.wordlist.0.contains_key("ijkl")); 29 | assert!(d.wordlist_forbidden.0.contains_key("ijkl")); 30 | assert!(d.check("abcd")); 31 | assert!(d.check("uvwx")); 32 | assert!(!d.check("ijkl")); 33 | 34 | let entry = d.entry("efgh"); 35 | let stems: Vec<_> = entry.stems().unwrap().collect(); 36 | assert_eq!(stems, vec!["efgh", "something"]); 37 | } 38 | 39 | #[test] 40 | #[cfg(not(miri))] // slow! 
/// Build a dictionary from the full `en_US` files if they are available.
#[test]
fn test_builder_large_file() {
    let mut aff_path = workspace_root();
    aff_path.push("dictionaries");
    let mut dic_path = aff_path.clone();
    aff_path.push("en_US.aff");
    dic_path.push("en_US.dic");

    // Skip (rather than fail) when the large dictionaries are not checked out.
    // Fixed: "test flies" typo in the skip message.
    let Ok(aff_content) = fs::read_to_string(aff_path) else {
        eprintln!("skipping large test files; not found");
        return;
    };

    let dic_content = fs::read_to_string(dic_path).unwrap();
    let dict = DictBuilder::new()
        .config_str(&aff_content)
        .dict_str(&dic_content)
        .build()
        .unwrap();

    // `assert!` instead of `assert_eq!(_, true/false)` per clippy's
    // `bool_assert_comparison` (this crate enables pedantic lints).
    assert!(dict.check("reptiles pillow bananas"));
    assert!(!dict.check("pine missssspelled"));
}
/// Check `DictEntry::parse_single` against every sample line for each of the
/// three flag types.
#[test]
fn test_dict_entry_ok() {
    let f1 = FlagType::Utf8;
    let f2 = FlagType::Ascii;
    let f3 = FlagType::Long;

    let s_0f0m_1 = "abcd";
    let s_0f0m_2 = "abcd # comment";
    let s_4f0m_1 = "abcd/ABCD";
    let s_4f0m_2 = "abcd/ABCD # comment";
    let s_4f2m_1 = "abcd/ABCD ip:m1 tp:m2";
    let s_4f2m_2 = "abcd/ABCD ip:m1 tp:m2 # comment";
    let s_0f2m_1 = "abcd ip:m1 tp:m2";
    let s_0f2m_2 = "abcd ip:m1 tp:m2 # comment";

    // No flags
    let r_0f0m = DictEntry::new("abcd", &[], &[]);

    // All flags
    let r_4f0m = DictEntry::new(
        "abcd",
        &[
            Flag::new_ascii(b'A'),
            Flag::new_ascii(b'B'),
            Flag::new_ascii(b'C'),
            Flag::new_ascii(b'D'),
        ],
        &[],
    );

    // The same four characters parsed as two long (2-character) flags
    let r_2f0m = DictEntry::new("abcd", &[Flag::new_long("AB"), Flag::new_long("CD")], &[]);

    // All flags plus morph info
    let r_4f2m = DictEntry::new(
        "abcd",
        &[
            Flag::new_ascii(b'A'),
            Flag::new_ascii(b'B'),
            Flag::new_ascii(b'C'),
            Flag::new_ascii(b'D'),
        ],
        &[
            MorphInfo::InflecPfx("m1".into()),
            MorphInfo::TermPfx("m2".into()),
        ],
    );

    let r_2f2m = DictEntry::new(
        "abcd",
        &[Flag::new_long("AB"), Flag::new_long("CD")],
        &[
            MorphInfo::InflecPfx("m1".into()),
            MorphInfo::TermPfx("m2".into()),
        ],
    );

    // No flags, including morph info
    let r_0f2m = DictEntry::new(
        "abcd",
        &[],
        &[
            MorphInfo::InflecPfx("m1".into()),
            MorphInfo::TermPfx("m2".into()),
        ],
    );

    // Fixed: the original assertion list duplicated some lines (`s_4f2m_1` and
    // `s_0f2m_2` twice for ASCII, `s_4f2m_1` twice for long flags) and so never
    // checked `s_4f2m_2` or `s_0f2m_1` for those flag types. Explicit case
    // tables make the coverage visible.
    let common_cases = [
        (s_0f0m_1, &r_0f0m),
        (s_0f0m_2, &r_0f0m),
        (s_0f2m_1, &r_0f2m),
        (s_0f2m_2, &r_0f2m),
    ];
    let char_flag_cases = [
        (s_4f0m_1, &r_4f0m),
        (s_4f0m_2, &r_4f0m),
        (s_4f2m_1, &r_4f2m),
        (s_4f2m_2, &r_4f2m),
    ];
    let long_flag_cases = [
        (s_4f0m_1, &r_2f0m),
        (s_4f0m_2, &r_2f0m),
        (s_4f2m_1, &r_2f2m),
        (s_4f2m_2, &r_2f2m),
    ];

    // UTF-8 and ASCII flag types treat single-character flags identically.
    for flag_type in [f1, f2] {
        for &(input, expected) in common_cases.iter().chain(&char_flag_cases) {
            assert_eq!(
                DictEntry::parse_single(input, flag_type, 0).as_ref(),
                Ok(expected),
                "input: {input:?}"
            );
        }
    }

    // The long flag type groups pairs of ASCII characters.
    for &(input, expected) in common_cases.iter().chain(&long_flag_cases) {
        assert_eq!(
            DictEntry::parse_single(input, f3, 0).as_ref(),
            Ok(expected),
            "input: {input:?}"
        );
    }
}
/// Exercise `apply_pattern` for suffix and prefix rules, with and without a
/// match condition.
#[test]
fn test_apply_pattern() {
    // Suffix rule: strip a trailing "y" and append "zzz".
    let mut rule = AfxRulePattern::new("zzz", Some("y"));
    rule.set_pattern("[^aeiou]y", RuleType::Suffix).unwrap();
    assert_eq!(
        rule.apply_pattern("xxxy", RuleType::Suffix),
        Some("xxxzzz".to_owned())
    );

    // Prefix rule: strip a leading "y" and prepend "zzz".
    rule.set_pattern("y[^aeiou]", RuleType::Prefix).unwrap();
    assert_eq!(
        rule.apply_pattern("yxxx", RuleType::Prefix),
        Some("zzzxxx".to_owned())
    );

    // The universal "." condition applies to everything.
    rule.set_pattern(".", RuleType::Suffix).unwrap();
    assert_eq!(
        rule.apply_pattern("xxx", RuleType::Suffix),
        Some("xxxzzz".to_owned())
    );
}
assert_eq!(group.apply_pattern("coy").unwrap(), "coyness"); 63 | // assert_eq!(group.apply_pattern("acute").unwrap(), "acuteness"); 64 | // } 65 | -------------------------------------------------------------------------------- /zspell/src/helpers.rs: -------------------------------------------------------------------------------- 1 | //! Various functions that are helpful throughout the crate 2 | 3 | use core::fmt::Display; 4 | use std::borrow::Cow; 5 | use std::hash::Hash; 6 | use std::ops::Deref; 7 | use std::rc::Rc; 8 | use std::sync::Arc; 9 | 10 | use hashbrown::Equivalent; 11 | use regex::Regex; 12 | 13 | use crate::affix::RuleType; 14 | 15 | /// Wrap `Regex` objects so they can be hashed 16 | #[derive(Clone, Debug)] 17 | pub struct ReWrapper(Regex); 18 | 19 | impl ReWrapper { 20 | pub fn new(re: &str) -> Result { 21 | Ok(Self(Regex::new(re)?)) 22 | } 23 | } 24 | 25 | impl Eq for ReWrapper {} 26 | 27 | impl PartialEq for ReWrapper { 28 | fn eq(&self, other: &Self) -> bool { 29 | self.0.as_str() == other.0.as_str() 30 | } 31 | } 32 | 33 | impl Hash for ReWrapper { 34 | fn hash(&self, state: &mut H) { 35 | self.0.as_str().hash(state); 36 | } 37 | } 38 | 39 | impl Deref for ReWrapper { 40 | type Target = Regex; 41 | 42 | fn deref(&self) -> &Self::Target { 43 | &self.0 44 | } 45 | } 46 | 47 | /// Convert any integer to a u32, panic if it does not fit 48 | #[inline] 49 | pub fn convertu32 + Display + Copy>(value: T) -> u32 { 50 | value 51 | .try_into() 52 | .unwrap_or_else(|_| panic!("value {value} overflows u32 max of {}", u32::MAX)) 53 | } 54 | 55 | /// Compile a regex pattern in the context of an affix. Returns None if 56 | /// the universal pattern "." is provided 57 | pub fn compile_re_pattern( 58 | condition: &str, 59 | kind: RuleType, 60 | ) -> Result, regex::Error> { 61 | if condition == "." 
/// Replace every `from` with `to` in `s`, allocating only when at least one
/// occurrence exists.
#[allow(unused)]
pub fn replace_cow<'a>(s: &'a str, from: char, to: &str) -> Cow<'a, str> {
    match s.find(from) {
        // At least one occurrence: build the replaced string.
        Some(_) => Cow::Owned(s.replace(from, to)),
        // Untouched input can be borrowed as-is.
        None => Cow::Borrowed(s),
    }
}
You will need to know the location of dictionary files on your system, or 15 | //! obtain them yourself. A repository exists that has dictionaries for many 16 | //! different languages, if you don't have any available: 17 | //! . 18 | //! 19 | //! This library requires specifying the input from these files, then building a 20 | //! [`Dictionary`] object that can be used to perform all other operations. 21 | //! Usage will typically look like the following: 22 | //! 23 | //! ``` 24 | //! # #![cfg(not(miri))] 25 | //! use std::fs; 26 | //! 27 | //! use zspell::Dictionary; 28 | //! 29 | //! // This example just uses some shortened files. Load them to a string 30 | //! let aff_content = 31 | //! fs::read_to_string("tests/files/w1_eng_short.aff").expect("failed to load config file"); 32 | //! let dic_content = 33 | //! fs::read_to_string("tests/files/w1_eng_short.dic").expect("failed to load wordlist file"); 34 | //! 35 | //! // Use the builder pattern to create our `Dictionary` object 36 | //! let dict: Dictionary = zspell::builder() 37 | //! .config_str(&aff_content) 38 | //! .dict_str(&dic_content) 39 | //! .build() 40 | //! .expect("failed to build dictionary!"); 41 | //! 42 | //! // The `.check(&str)` method is useful for quickly verifying entire strings 43 | //! assert_eq!(dict.check("reptiles pillow: bananas"), true); 44 | //! assert_eq!(dict.check("well, I misspelled soemthing this tiem"), false); 45 | //! 46 | //! // Or use `.check_word(&str)` to validate the input as a single word 47 | //! assert_eq!(dict.check_word("okay"), true); 48 | //! assert_eq!(dict.check_word("okay okay"), false); 49 | //! 50 | //! // `.check_indices(&str)` provides more useful information for anything other than trivial 51 | //! // checks. It returns an iterator over `(usize, &str)`, which gives the byte offset and 52 | //! // string reference of any spelling errors. 53 | //! let input = "okay, I misspelled soemthing this tiem"; 54 | //! 
let errors: Vec<(usize, &str)> = dict.check_indices(input).collect(); 55 | //! let expected = vec![(19, "soemthing"), (34, "tiem")]; 56 | //! assert_eq!(errors, expected); 57 | //! ``` 58 | //! 59 | //! There is also a powerful entry-based API that allows for stemming and analysis, as well as 60 | //! suggestions (which are currently unstable). 61 | //! 62 | //! ``` 63 | //! # #![cfg(not(miri))] 64 | //! 65 | //! # use std::fs; 66 | //! # use zspell::Dictionary; 67 | //! # let aff_content = 68 | //! # fs::read_to_string("tests/files/w1_eng_short.aff").expect("failed to load config file"); 69 | //! # let dic_content = 70 | //! # fs::read_to_string("tests/files/w1_eng_short.dic").expect("failed to load wordlist file"); 71 | //! # let dict: Dictionary = zspell::builder() 72 | //! # .config_str(&aff_content) 73 | //! # .dict_str(&dic_content) 74 | //! # .build() 75 | //! # .expect("failed to build dictionary!"); 76 | //! let input = "bananas rusting"; 77 | //! let mut entries = dict.entries(input); 78 | //! 79 | //! // We can use the entry API to do the standard checks (word position and correctness), 80 | //! // but also to find word roots. 81 | //! let banana_entry = entries.next().unwrap(); 82 | //! let banana_stems: Vec<&str> = banana_entry.stems().unwrap().collect(); 83 | //! assert_eq!(banana_entry.word(), "bananas"); 84 | //! assert_eq!(banana_entry.index(), 0); 85 | //! assert_eq!(banana_entry.correct(), true); 86 | //! assert_eq!(banana_stems, ["banana"]); 87 | //! 88 | //! let rust_entry = entries.next().unwrap(); 89 | //! let rust_stems: Vec<&str> = rust_entry.stems().unwrap().collect(); 90 | //! assert_eq!(rust_stems, ["rust"]); 91 | //! ``` 92 | //! 93 | //! See [`Dictionary`] and [`DictBuilder`] to get started. 94 | //! 95 | //! # Stability & Feature Flags 96 | //! 97 | //! At the moment, the only public functions available are `check`, 98 | //! `check_word`, and `check_indices`. These three functions are more or less 99 | //! 
guaranteed to have stable interfaces, though the internals may change. 100 | //! 101 | //! There are also some unstable components to this library: 102 | //! 103 | //! - `unstable-suggestions`: Needed for providing suggestions, this is 104 | //! currently disabled because it is slow. 105 | //! - `unstable-system`: Needed for system interfaces like locating existing 106 | //! dictionaries 107 | //! - `zspell-unstable`: Enable all of these options 108 | //! 109 | //! These flags can be enabled in your `Cargo.toml` if you would like to 110 | //! experiment with these features. Any APIs protected behind these feature 111 | //! flags are subject to change, but the need for these flags will be removed as 112 | //! they are stabilized. 113 | //! 114 | //! [Hunspell]: http://hunspell.github.io/ 115 | //! [CLI docs]: https://pluots.github.io/zspell/ 116 | #![forbid(unsafe_code)] 117 | #![warn(clippy::pedantic)] 118 | // #![warn(clippy::cargo)] 119 | #![warn(clippy::nursery)] 120 | #![warn(clippy::str_to_string)] 121 | #![warn(clippy::missing_inline_in_public_items)] 122 | #![warn(clippy::disallowed_types)] 123 | #![allow(clippy::use_self)] 124 | #![allow(clippy::match_same_arms)] 125 | #![allow(clippy::struct_excessive_bools)] 126 | #![allow(clippy::missing_panics_doc)] 127 | #![allow(clippy::must_use_candidate)] 128 | // #![allow(clippy::redundant_pub_crate)] 129 | #![allow(clippy::module_name_repetitions)] 130 | #![allow(clippy::missing_const_for_fn)] 131 | #![allow(clippy::derive_partial_eq_without_eq)] 132 | 133 | mod affix; 134 | mod dict; 135 | pub mod error; 136 | mod helpers; 137 | mod meta; 138 | mod morph; 139 | mod suggestions; 140 | 141 | #[cfg(feature = "unstable-system")] 142 | pub mod system; 143 | 144 | pub(crate) use affix::ParsedCfg; 145 | pub use affix::PartOfSpeech; 146 | #[doc(inline)] 147 | pub use dict::{DictBuilder, Dictionary, WordEntry, WordList}; 148 | #[doc(inline)] 149 | pub use error::Error; 150 | pub use morph::{MorphInfo, MorphStr}; 151 | 152 
| // Make some things public when benchmarking 153 | #[cfg(feature = "unstable-bench")] 154 | pub mod bench { 155 | pub use super::affix::{affix_from_str, FlagType}; 156 | pub use super::dict::DictEntry; 157 | } 158 | 159 | /// Create a new [`DictBuilder`] instance (shortcut for [`DictBuilder::new`]) 160 | #[inline] 161 | pub fn builder<'a>() -> DictBuilder<'a> { 162 | DictBuilder::new() 163 | } 164 | -------------------------------------------------------------------------------- /zspell/src/meta.rs: -------------------------------------------------------------------------------- 1 | //! Meta-related logic 2 | -------------------------------------------------------------------------------- /zspell/src/morph.rs: -------------------------------------------------------------------------------- 1 | //! Types and implementation of morphological analysis 2 | 3 | use std::fmt; 4 | 5 | use crate::affix::PartOfSpeech; 6 | 7 | /// Morphological information about a word, used by analysis methods 8 | #[non_exhaustive] 9 | #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] 10 | pub enum MorphInfo { 11 | /// `st:` stem word 12 | Stem(MorphStr), 13 | /// `ph:` better phonetic transliteration if available 14 | Phonetic(MorphStr), 15 | /// `al:` allomorphs (e.g. sing -> sang, sung) 16 | Allomorph(MorphStr), 17 | /// `po:` part of speech 18 | Part(PartOfSpeech), 19 | /// `ds:` derivational suffix 20 | DerivSfx(MorphStr), 21 | /// `is:` inflectional suffix 22 | InflecSfx(MorphStr), 23 | /// `ts:` terminal suffix 24 | TerminalSfx(MorphStr), 25 | /// `dp:` derivational suffix 26 | DerivPfx(MorphStr), 27 | /// `ip:` inflectional suffix 28 | InflecPfx(MorphStr), 29 | /// `tp:` terminal suffix 30 | TermPfx(MorphStr), 31 | /// `sp:` surface prefix 32 | SurfacePfx(MorphStr), 33 | /// `pa:` parts of compound words 34 | CompPart(MorphStr), 35 | /// Any unrecognized tag. This will be stored as written (e.g. `foo:bar` is stored as 36 | /// `foo:bar`, not just `bar`). 
37 | Other(MorphStr), 38 | } 39 | 40 | impl MorphInfo { 41 | /// Parse the kind of string that a dictionary file has, usually something like: 42 | /// 43 | /// ```text 44 | /// po:verb st:rootword ts:abcd 45 | /// ``` 46 | #[inline] 47 | #[allow(clippy::unnecessary_wraps)] 48 | pub(crate) fn many_from_str(s: &str) -> impl Iterator + '_ { 49 | s.split_whitespace().map(MorphInfo::from) 50 | // FIXME:dict-parser we should be able to handle the hungarian dictionary that 51 | // has entries like this: 52 | // üzletág/UmôŇyiYcÇ üzletágak 53 | // but I am not sure what that means if it is not morph info... 54 | // res.push(MorphInfo::try_from(morph).map_err(|e| ParseError::new_nospan(e, morph))?); 55 | } 56 | } 57 | 58 | impl From<&str> for MorphInfo { 59 | #[inline] 60 | fn from(value: &str) -> Self { 61 | let Some((tag, val)) = value.split_once(':') else { 62 | return Self::Other(value.into()); 63 | }; 64 | 65 | match tag { 66 | "st" => Self::Stem(val.into()), 67 | "ph" => Self::Phonetic(val.into()), 68 | "al" => Self::Allomorph(val.into()), 69 | "po" => Self::Part(val.into()), 70 | "ds" => Self::DerivSfx(val.into()), 71 | "is" => Self::InflecSfx(val.into()), 72 | "ts" => Self::TerminalSfx(val.into()), 73 | "dp" => Self::DerivPfx(val.into()), 74 | "ip" => Self::InflecPfx(val.into()), 75 | "tp" => Self::TermPfx(val.into()), 76 | "sp" => Self::SurfacePfx(val.into()), 77 | "pa" => Self::CompPart(val.into()), 78 | _ => Self::Other(value.into()), 79 | } 80 | } 81 | } 82 | 83 | impl fmt::Display for MorphInfo { 84 | #[inline] 85 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 86 | match self { 87 | MorphInfo::Stem(v) => write!(f, "st:{v}"), 88 | MorphInfo::Phonetic(v) => write!(f, "ph:{v}"), 89 | MorphInfo::Allomorph(v) => write!(f, "al:{v}"), 90 | MorphInfo::Part(v) => write!(f, "po:{v}"), 91 | MorphInfo::DerivSfx(v) => write!(f, "ds:{v}"), 92 | MorphInfo::InflecSfx(v) => write!(f, "is:{v}"), 93 | MorphInfo::TerminalSfx(v) => write!(f, "ts:{v}"), 94 | 
MorphInfo::DerivPfx(v) => write!(f, "dp:{v}"), 95 | MorphInfo::InflecPfx(v) => write!(f, "ip:{v}"), 96 | MorphInfo::TermPfx(v) => write!(f, "tp:{v}"), 97 | MorphInfo::SurfacePfx(v) => write!(f, "sp:{v}"), 98 | MorphInfo::CompPart(v) => write!(f, "pa:{v}"), 99 | MorphInfo::Other(v) => write!(f, "{v}"), 100 | } 101 | } 102 | } 103 | 104 | /// A string used as part of morphological analysis 105 | /// 106 | /// This is a thin wrapper over a native string type to allow us to change 107 | /// the implementation as needed. 108 | #[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 109 | pub struct MorphStr(Box); 110 | 111 | impl AsRef for MorphStr { 112 | #[inline] 113 | fn as_ref(&self) -> &str { 114 | self.0.as_ref() 115 | } 116 | } 117 | 118 | impl From<&str> for MorphStr { 119 | #[inline] 120 | fn from(value: &str) -> Self { 121 | Self(value.into()) 122 | } 123 | } 124 | 125 | impl fmt::Display for MorphStr { 126 | #[inline] 127 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 128 | self.0.fmt(f) 129 | } 130 | } 131 | 132 | impl fmt::Debug for MorphStr { 133 | #[inline] 134 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 135 | self.0.fmt(f) 136 | } 137 | } 138 | 139 | #[cfg(test)] 140 | mod tests { 141 | use super::*; 142 | 143 | #[test] 144 | fn morph_single_ok() { 145 | let tests = [ 146 | ("st:stem", MorphInfo::Stem("stem".into())), 147 | ("ip:abc", MorphInfo::InflecPfx("abc".into())), 148 | ("pa:xyz", MorphInfo::CompPart("xyz".into())), 149 | ("foo:xyz", MorphInfo::Other("foo:xyz".into())), 150 | ]; 151 | 152 | for (input, expected) in tests { 153 | assert_eq!(MorphInfo::from(input), expected, "failure parsing {input}"); 154 | } 155 | } 156 | 157 | #[test] 158 | fn morph_string_ok() { 159 | let input = "st:stem ip:abcd pa:xyz st:some-stem\tal:def"; 160 | let output = MorphInfo::many_from_str(input); 161 | let expected = [ 162 | MorphInfo::Stem("stem".into()), 163 | MorphInfo::InflecPfx("abcd".into()), 164 | 
MorphInfo::CompPart("xyz".into()), 165 | MorphInfo::Stem("some-stem".into()), 166 | MorphInfo::Allomorph("def".into()), 167 | ]; 168 | 169 | assert_eq!(&output.collect::>(), &expected); 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /zspell/src/suggestions.rs: -------------------------------------------------------------------------------- 1 | //! Types and implementation of suggestion logic 2 | -------------------------------------------------------------------------------- /zspell/src/system/tests.rs: -------------------------------------------------------------------------------- 1 | //! Tests for the `system` module 2 | 3 | // use std::{fs, io}; 4 | 5 | // use tempfile::tempdir; 6 | 7 | // use super::*; 8 | // use crate::errors; 9 | 10 | // #[test] 11 | // #[cfg(windows)] 12 | // fn test_raw_paths() { 13 | // // Just spot check what we have here 14 | // let paths = create_raw_paths(); 15 | 16 | // assert!(paths.contains(&PathBuf::from( 17 | // r"C:\Program files\OpenOffice.org*\share\dict\ooo" 18 | // ))); 19 | // assert!(paths.contains(&PathBuf::from( 20 | // r"C:\Program files\OpenOffice.org*\share\dict\ooo\hunspell" 21 | // ))); 22 | // } 23 | 24 | // #[test] 25 | // #[cfg(not(windows))] 26 | // fn test_raw_paths() { 27 | // // Just spot check what we have here 28 | // let paths = create_raw_paths(); 29 | 30 | // assert!(paths.contains(&PathBuf::from("/usr/share"))); 31 | // assert!(paths.contains(&PathBuf::from("/usr/share/zspell"))); 32 | // assert!(paths.contains(&PathBuf::from("/usr/share/myspell"))); 33 | // assert!(paths.contains(&PathBuf::from("/usr/share/hunspell"))); 34 | // assert!(paths.contains(&PathBuf::from("/Library/Spelling/hunspell"))); 35 | // assert!(paths.contains(&PathBuf::from("/Library/Spelling/hunspell"))); 36 | // } 37 | 38 | // #[test] 39 | // fn test_matching_dirs() { 40 | // // Create a temporary directory with contents 41 | // // Ensure the function locates them using 
wildcards 42 | // let dir = tempdir().unwrap(); 43 | 44 | // let mut paths = vec![ 45 | // dir.path().join("a").join("b").join("c-x-cxd"), 46 | // dir.path().join("a").join("b").join("c-yz-cxd"), 47 | // dir.path().join("a").join("b").join("c-.abc-cxd"), 48 | // ]; 49 | // paths.sort(); 50 | 51 | // for path in &paths { 52 | // fs::create_dir_all(path).unwrap(); 53 | // } 54 | 55 | // let mut ret = find_matching_dirs(&dir.path().join("a").join("b"), "c-*-c?d"); 56 | // ret.sort(); 57 | 58 | // assert_eq!(paths, ret); 59 | // } 60 | 61 | // #[test] 62 | // fn test_expand_dir_wildcards() { 63 | // let dir = tempdir().unwrap(); 64 | 65 | // let paths = vec![ 66 | // dir.path().join("aaa").join("bbb-x").join("ccc"), 67 | // dir.path().join("aaa").join("bbb-y").join("ccc"), 68 | // dir.path().join("ddd"), 69 | // ]; 70 | 71 | // for path in &paths { 72 | // fs::create_dir_all(path).unwrap(); 73 | // } 74 | 75 | // let mut input = vec![ 76 | // dir.path().join("aaa").join("bbb*").join("ccc"), 77 | // dir.path().join("ddd"), 78 | // ]; 79 | 80 | // let mut expanded = Vec::from_iter(expand_dir_wildcards(&mut input)); 81 | // expanded.sort_unstable(); 82 | 83 | // assert_eq!(paths, expanded); 84 | // } 85 | 86 | // #[test] 87 | // fn test_find_dict_from_path() { 88 | // let dir = tempdir().unwrap(); 89 | 90 | // let fnames = vec![ 91 | // dir.path().join("test_found.dic"), 92 | // dir.path().join("test_found.aff"), 93 | // dir.path().join("test_found.afx"), 94 | // dir.path().join("test.dict"), 95 | // dir.path().join("test.affix"), 96 | // dir.path().join("notfound.dic"), 97 | // dir.path().join("notfound.aff"), 98 | // dir.path().join("test"), 99 | // ]; 100 | 101 | // let mut expected = vec![ 102 | // DictPaths { 103 | // dictionary: fnames[0].clone(), 104 | // affix: fnames[1].clone(), 105 | // }, 106 | // DictPaths { 107 | // dictionary: fnames[0].clone(), 108 | // affix: fnames[2].clone(), 109 | // }, 110 | // DictPaths { 111 | // dictionary: fnames[3].clone(), 112 | 
// affix: fnames[4].clone(), 113 | // }, 114 | // ]; 115 | // expected.sort(); 116 | 117 | // for fname in fnames { 118 | // fs::File::create(fname).unwrap(); 119 | // } 120 | // fs::read_dir(dir.path()).unwrap(); 121 | 122 | // let mut res = find_dicts_from_path(dir.path(), "test_found").unwrap(); 123 | // res.sort(); 124 | 125 | // assert_eq!(res, expected); 126 | // } 127 | 128 | // #[test] 129 | // fn test_find_dict_from_path_err() { 130 | // let fakepath = tempdir().unwrap().path().join("fake"); 131 | // let res = find_dicts_from_path(&fakepath, "test_found"); 132 | 133 | // assert_eq!( 134 | // Err(errors::SystemError::IOError { 135 | // name: fakepath.to_string_lossy().to_string(), 136 | // e: io::ErrorKind::NotFound 137 | // }), 138 | // res 139 | // ); 140 | // } 141 | -------------------------------------------------------------------------------- /zspell/test-suite/0-example.test: -------------------------------------------------------------------------------- 1 | %% Example test file. '%%' is our comment indicator since '#' gets passed directly 2 | %% to the input 3 | 4 | ==== afx ==== 5 | %% This section contains contents of the .aff file 6 | 7 | ==== dic ==== 8 | %% This section contains contents of the .dic file 9 | 10 | === personal === 11 | %% This section contains contents of a personal file 12 | 13 | ==== valid ==== 14 | %% Each line will get checked with `.check()`, i.e. treated as sentences 15 | 16 | ==== invalid ==== 17 | %% These words should not be valid 18 | 19 | ==== wordlist ==== 20 | %% Verify the word list contains exactly the listed words 21 | 22 | ==== nosuggest ==== 23 | %% Exact expected contents of the never suggested wordlist 24 | 25 | ==== forbidden ==== 26 | %% Exact expected contents of the non-accepted wordlist 27 | 28 | ==== stem ==== 29 | %% List the expected stem for a given word 30 | rooted > root | rooted 31 | 32 | ==== morph ==== 33 | %% List the expected morph info for a given word 34 | apple > po:noun | ... 
35 | 36 | ==== suggest ==== 37 | %% Something like the following 38 | appl > apple | Apfel | app 39 | -------------------------------------------------------------------------------- /zspell/test-suite/b-affix-forward-gen-num-flags.test: -------------------------------------------------------------------------------- 1 | %% Test numeric flags 2 | 3 | ==== afx ==== 4 | SET UTF-8 5 | FLAG num 6 | 7 | SFX 1 N 1 8 | SFX 1 0 aa . 9 | 10 | SFX 999 N 1 11 | SFX 999 0 bb . 12 | 13 | SFX 12345 N 1 14 | SFX 12345 0 cc . 15 | 16 | SFX 1234 N 1 17 | SFX 1234 0 dd . 18 | 19 | ==== dic ==== 20 | 4 21 | www/1 22 | xxx/1,999,12345 23 | yyy/999,12345 24 | zzz/999,1234 25 | 26 | 27 | ==== valid ==== 28 | www 29 | xxx 30 | yyy 31 | zzz 32 | wwwaa 33 | xxxaa 34 | xxxbb 35 | xxxcc 36 | yyybb 37 | yyycc 38 | zzzbb 39 | zzzdd 40 | 41 | 42 | ==== invalid ==== 43 | %% Nothing to see here 44 | nothing 45 | 46 | ==== wordlist ==== 47 | www 48 | xxx 49 | yyy 50 | zzz 51 | wwwaa 52 | xxxaa 53 | xxxbb 54 | xxxcc 55 | yyybb 56 | yyycc 57 | zzzbb 58 | zzzdd 59 | -------------------------------------------------------------------------------- /zspell/test-suite/b-affix-forward-gen.test: -------------------------------------------------------------------------------- 1 | %% Test basic affix, including: 2 | %% - Only prefix 3 | %% - Only suffix 4 | %% - Stripping characters 5 | %% - Patterns 6 | %% - Combined prefix and suffix 7 | %% - Noncombining prefix and suffix 8 | 9 | ==== afx ==== 10 | SET UTF-8 11 | 12 | PFX A Y 1 13 | PFX A 0 aa . 14 | 15 | SFX B Y 3 16 | SFX B 0 bb . 17 | SFX B y cc y 18 | SFX B 0 dd [^y] 19 | 20 | PFX C N 2 21 | PFX C yy ee . 22 | PFX C 0 ff . 
23 | 24 | 25 | ==== dic ==== 26 | 4 27 | xxx/A 28 | yyy/B 29 | zzz/AB 30 | yyyy/AC 31 | 32 | 33 | ==== valid ==== 34 | xxx 35 | yyy 36 | zzz 37 | yyyy 38 | aaxxx 39 | yyybb 40 | yycc 41 | aazzz 42 | zzzbb 43 | zzzdd 44 | aazzzbb 45 | aazzzdd 46 | aayyyy 47 | eeyy 48 | ffyyyy 49 | 50 | 51 | ==== invalid ==== 52 | %% Nothing to see here 53 | nothing 54 | 55 | ==== wordlist ==== 56 | xxx 57 | yyy 58 | zzz 59 | yyyy 60 | aaxxx 61 | yyybb 62 | yycc 63 | aazzz 64 | zzzbb 65 | zzzdd 66 | aazzzbb 67 | aazzzdd 68 | aayyyy 69 | eeyy 70 | ffyyyy 71 | -------------------------------------------------------------------------------- /zspell/test-suite/b-flag-long.test: 1 | %% Verify that multicharacter flags work 2 | 3 | ==== afx ==== 4 | FLAG long 5 | 6 | NEEDAFFIX () 7 | FORBIDDENWORD {} 8 | KEEPCASE || 9 | NOSUGGEST -- 10 | 11 | %% Test same first character but different second 12 | SFX -+ Y 1 13 | SFX -+ 0 aa . 14 | 15 | ==== dic ==== 16 | foo/-- 17 | bar/||-- 18 | baz/-+ 19 | 20 | ==== valid ==== 21 | foo bar baz bazaa 22 | 23 | ==== wordlist ==== 24 | baz 25 | bazaa 26 | 27 | ==== nosuggest ==== 28 | foo 29 | bar 30 | -------------------------------------------------------------------------------- /zspell/test-suite/b-nosuggest-forbid.test: 1 | %% Verify our nosuggest and forbid flags work 2 | 3 | ==== afx ==== 4 | NOSUGGEST ! 5 | FORBIDDENWORD * 6 | 7 | SFX A Y 1 8 | SFX A 0 aaa . 9 | 10 | ==== dic ==== 11 | nosuggest/A! 12 | forbid/A* 13 | nosuggest2/!
14 | forbid2/* 15 | ok 16 | 17 | ==== valid ==== 18 | nosuggest 19 | nosuggestaaa 20 | nosuggest2 21 | ok 22 | 23 | ==== invalid ==== 24 | forbid 25 | forbidaaa 26 | forbid2 27 | 28 | ==== wordlist ==== 29 | ok 30 | 31 | ==== nosuggest ==== 32 | nosuggest 33 | nosuggestaaa 34 | nosuggest2 35 | 36 | ==== forbidden ==== 37 | forbid 38 | forbidaaa 39 | forbid2 40 | -------------------------------------------------------------------------------- /zspell/test-suite/b-stemming-morph.test: -------------------------------------------------------------------------------- 1 | ==== afx ==== 2 | SFX X Y 1 3 | SFX X 0 able . ds:able 4 | 5 | ==== dic ==== 6 | %% mice st:mouse 7 | drink/X po:verb 8 | 9 | ==== wordlist ==== 10 | %% attr: allow-extra 11 | %% mice 12 | 13 | ==== stem ==== 14 | %% mice > mouse | miced 15 | drink > drink 16 | 17 | %% FIXME: hunspell lists these as `> drinkable`, not `> drink`. Why? 18 | drinkable > drink 19 | Drinkable > drink 20 | 21 | ==== morph ==== 22 | drink > po:verb 23 | drinkable > po:verb ds:able 24 | -------------------------------------------------------------------------------- /zspell/test-suite/h-circumfix.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/3cfd539b5b1033620b12663ee3f1d673d193add0/tests/circumfix.aff 3 | 4 | ==== afx ==== 5 | # circumfixes: ~ obligate prefix/suffix combinations 6 | # superlative in Hungarian: leg- (prefix) AND -bb (suffix) 7 | 8 | CIRCUMFIX X 9 | 10 | PFX A Y 1 11 | PFX A 0 leg/X . 12 | 13 | PFX B Y 1 14 | PFX B 0 legesleg/X . 15 | 16 | SFX C Y 3 17 | SFX C 0 obb . is:COMPARATIVE 18 | SFX C 0 obb/AX . is:SUPERLATIVE 19 | SFX C 0 obb/BX . 
is:SUPERSUPERLATIVE 20 | 21 | ==== dic ==== 22 | 1 23 | nagy/C po:adj 24 | 25 | ==== valid ==== 26 | nagy 27 | nagyobb 28 | 29 | %% FIXME(circumfix): xfail 30 | %% legnagyobb 31 | %% legeslegnagyobb 32 | 33 | 34 | ==== stem ==== 35 | nagy > nagy 36 | nagyobb > nagy 37 | 38 | %% FIXME(circumfix): xfail 39 | %% legnagyobb > nagy 40 | %% legeslegnagyobb > nagy 41 | 42 | ==== morph ==== 43 | nagy > po:adj 44 | nagyobb > po:adj is:COMPARATIVE 45 | 46 | %% FIXME(circumfix): xfail 47 | %% legnagyobb > fl:A po:adj is:SUPERLATIVE 48 | %% legeslegnagyobb > fl:B po:adj is:SUPERSUPERLATIVE 49 | -------------------------------------------------------------------------------- /zspell/test-suite/h-ignore-sug.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/ignoresug.aff 3 | 4 | ==== afx ==== 5 | # Ignore punctuation marks used inside of Armenian words, and produce correct 6 | # suggestion (see issue #570). However, "suggestion test" cannot be used because 7 | # words in `.wrong` file are not wrong realy, so it leads to an error. 8 | # Therefore, we apply "morphological analysis" which has a similar result. 
9 | 10 | SET UTF-8 11 | WORDCHARS ֊՛՜՞՚ 12 | IGNORE ֊՛՜՞՚ 13 | 14 | ==== dic ==== 15 | 3 16 | ինչ 17 | մնաս 18 | որտեղ 19 | 20 | ==== valid ==== 21 | %% FIXME:ignore 22 | ինչ 23 | %% ի՞նչ 24 | մնաս 25 | %% մնա՜ս 26 | որտեղ 27 | %% որտե՞ղ 28 | 29 | %% FIXME:morph 30 | %% ==== morph ==== 31 | %% > ինչ 32 | %% analyze(ինչ) = st:ինչ 33 | %% stem(ինչ) = ինչ 34 | %% > ի՞նչ 35 | %% analyze(ի՞նչ) = st:ինչ 36 | %% stem(ի՞նչ) = ինչ 37 | %% > մնաս 38 | %% analyze(մնաս) = st:մնաս 39 | %% stem(մնաս) = մնաս 40 | %% > մնա՜ս 41 | %% analyze(մնա՜ս) = st:մնաս 42 | %% stem(մնա՜ս) = մնաս 43 | %% > որտեղ 44 | %% analyze(որտեղ) = st:որտեղ 45 | %% stem(որտեղ) = որտեղ 46 | %% > որտե՞ղ 47 | %% analyze(որտե՞ղ) = st:որտեղ 48 | %% stem(որտե՞ղ) = որտեղ 49 | -------------------------------------------------------------------------------- /zspell/test-suite/h-ignore-utf.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/ignoreutf.aff 3 | 4 | ==== afx ==== 5 | # Arabic test for feature ignoring diacritics 6 | SET UTF-8 7 | # Arabic diacritics (harakat): 8 | # sukun, shadda, kasra, damma, fatha, kasratan, dammantan, fathatan (left to right) 9 | IGNORE ًٌٍَُِّْ 10 | WORDCHARS ًٌٍَُِّْ 11 | 12 | ==== dic ==== 13 | 9 14 | طِير 15 | فَتحة 16 | ضُمة 17 | كِسرة 18 | فتحًتان 19 | ضمتانٌ 20 | كسرتاٍن 21 | شدّة 22 | سكوْن 23 | 24 | %% FIXME:ignore 25 | %% ==== valid ==== 26 | %% طير 27 | %% فتحة 28 | %% ضمة 29 | %% كسرة 30 | %% فتحتان 31 | %% ضمتان 32 | %% كسرتان 33 | %% شدة 34 | %% سكون 35 | -------------------------------------------------------------------------------- /zspell/test-suite/h-keepcase.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/keepcase.aff 3 | 
4 | ==== afx ==== 5 | # keep case in signed words 6 | KEEPCASE A 7 | WORDCHARS . 8 | 9 | ==== dic ==== 10 | 4 11 | foo/A 12 | Bar/A 13 | baz./A 14 | Quux./A 15 | 16 | ==== valid ==== 17 | foo 18 | Bar 19 | %% FIXME:unknown 20 | %% baz. 21 | %% Quux. 22 | 23 | ==== invalid ==== 24 | %% FIXME:keepcase 25 | %% Foo 26 | %% FOO 27 | BAR 28 | bar 29 | Baz. 30 | BAZ. 31 | quux. 32 | QUUX. 33 | 34 | %% FIXME:suggestions 35 | %% ==== suggest ==== 36 | %% Foo > foo 37 | %% FOO > foo 38 | %% BAR > Bar 39 | %% bar > Bar, baz. 40 | %% Baz. > baz. 41 | %% BAZ. > baz. 42 | %% quux. > Quux. 43 | %% QUUX. > Quux. 44 | -------------------------------------------------------------------------------- /zspell/test-suite/h-korean.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/korean.aff 3 | 4 | ==== afx ==== 5 | SET UTF-8 6 | 7 | ==== dic ==== 8 | 3 9 | 들어오세요 10 | 안녕하세요 11 | 김수한무거북이와두루미삼천갑자동방삭치치카포사리사리세ᅡ워리워리세브리캉무드셀ᅡ구름위허ᅵ케ᅵᆫᅦ담벼락서생원에ᄀ양 12 | 13 | ==== valid ==== 14 | 들어오세요 15 | 안녕하세요 16 | 17 | ==== invalid ==== 18 | 들어오세 19 | -------------------------------------------------------------------------------- /zspell/test-suite/h-limit-multiple-compounding.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/limit-multiple-compounding.aff 3 | 4 | ==== afx ==== 5 | # "foo+bar" accepted, but not "foo+bar+baz" 6 | # because 3-or-more-word compounds got a typo check, i.e. 
7 | # "foobarbaz" is rejected, because it is a typo of the dictionary word "goobarbaz" 8 | # (but not "foobar" as typo of the dictionary word "goobar") 9 | TRY esianrtolcdugmphbyfvkwz' 10 | COMPOUNDFLAG x 11 | 12 | ==== dic ==== 13 | 3 14 | foo/x 15 | bar/x 16 | baz/x 17 | goobar 18 | goobarbaz 19 | 20 | %% FIXME:compound 21 | %% ==== valid ==== 22 | %% foobar 23 | %% foobaz 24 | %% barfoo 25 | %% bazfoo 26 | %% barbaz 27 | %% bazbar 28 | %% foobazbar 29 | %% barfoobaz 30 | %% bazfoobar 31 | %% barbazfoo 32 | %% bazbarfoo 33 | 34 | ==== invalid ==== 35 | foobarbaz 36 | -------------------------------------------------------------------------------- /zspell/test-suite/h-map-utf.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/maputf.aff 3 | 4 | ==== afx ==== 5 | # With MAP suggestion, Hunspell can add missing accents to a word. 
6 | 7 | SET UTF-8 8 | 9 | # switch off ngram suggestion for testing 10 | MAXNGRAMSUGS 0 11 | 12 | MAP 3 13 | MAP uúü 14 | MAP öóo 15 | MAP ß(ss) 16 | 17 | ==== dic ==== 18 | 3 19 | Frühstück 20 | tükörfúró 21 | groß 22 | 23 | ==== valid ==== 24 | 25 | ==== invalid ==== 26 | Fruhstuck 27 | tukorfuro 28 | gross 29 | 30 | %% FIXME:suggestions 31 | %% ==== suggest ==== 32 | %% Fruhstuck > Frühstück 33 | %% tukorfuro > tükörfúró 34 | %% gross > groß 35 | -------------------------------------------------------------------------------- /zspell/test-suite/h-map.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/map.aff 3 | 4 | %% skipped because we really have no reason to not support utf8 5 | -------------------------------------------------------------------------------- /zspell/test-suite/h-morph.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/morph.aff 3 | 4 | ==== afx ==== 5 | # example for morphological analysis, stemming and generation 6 | PFX P Y 1 7 | PFX P 0 un . dp:pfx_un sp:un 8 | 9 | SFX S Y 1 10 | SFX S 0 s . is:plur 11 | 12 | SFX Q Y 1 13 | SFX Q 0 s . is:sg_3 14 | 15 | SFX R Y 1 16 | SFX R 0 able/PS . ds:der_able 17 | 18 | ==== dic ==== 19 | 9 20 | drink/S po:noun 21 | drink/RQ po:verb al:drank al:drunk ts:present 22 | drank po:verb st:drink is:past_1 23 | drunk po:verb st:drink is:past_2 24 | eat/RQ po:verb al:ate al:eaten ts:present 25 | ate po:verb st:eat is:past_1 26 | eaten po:verb st:eat is:past_2 27 | phenomenon po:noun al:phenomena 28 | phenomena po:noun st:phenomenon is:plur 29 | 30 | ==== valid ==== 31 | %% FIXME:duplicates should these double words be tested as one or two? 
32 | drink 33 | drinks 34 | %% drinkable 35 | %% drinkables 36 | %% undrinkable 37 | %% undrinkables 38 | drank 39 | drunk 40 | phenomenon 41 | phenomena 42 | drink eat 43 | drink eats 44 | drink ate 45 | drink eaten 46 | %% drink eatable 47 | %% drink eatables 48 | drink phenomena 49 | drinks eat 50 | drinks eats 51 | drinks ate 52 | drinks eaten 53 | %% drinks eatable 54 | %% drinks eatables 55 | drinks phenomena 56 | %% undrinkable phenomena 57 | phenomenon drinks 58 | 59 | ==== stem ==== 60 | %% FIXME:stemming 61 | drink > drink 62 | drinks > drink 63 | %% drinkable > drinkable 64 | %% drinkables > drinkable 65 | %% undrinkable > undrinkable 66 | %% undrinkables > undrinkable 67 | drank > drink 68 | drunk > drink 69 | phenomenon > phenomenon 70 | phenomena > phenomenon 71 | 72 | %% ==== morph ==== 73 | %% FIXME:morph. We should probably be storing these duplicates as separate meta entries (vec in our map) 74 | %% drink > st:drink po:noun 75 | %% drink > st:drink po:verb al:drank al:drunk ts:present 76 | %% drinks > st:drink po:verb al:drank al:drunk ts:present is:sg_3 77 | %% drinks > st:drink po:noun is:plur 78 | %% drinkable > st:drink po:verb al:drank al:drunk ts:present ds:der_able 79 | %% drinkables > st:drink po:verb al:drank al:drunk ts:present ds:der_able is:plur 80 | %% undrinkable > dp:pfx_un sp:un st:drink po:verb al:drank al:drunk ts:present ds:der_able 81 | %% undrinkables > dp:pfx_un sp:un st:drink po:verb al:drank al:drunk ts:present ds:der_able is:plur 82 | %% drank > po:verb st:drink is:past_1 83 | %% drunk > po:verb st:drink is:past_2 84 | %% phenomenon > st:phenomenon po:noun al:phenomena 85 | %% phenomena > po:noun st:phenomenon is:plur 86 | 87 | %% ==== gen ==== 88 | %% generate(drink, eat) = drink 89 | %% generate(drink, eats) = drinks 90 | %% generate(drink, ate) = drank 91 | %% generate(drink, eaten) = drunk 92 | %% generate(drink, eatable) = drinkable 93 | %% generate(drink, eatables) = drinkables 94 | %% generate(drink, phenomena) = 
drinks 95 | %% generate(drinks, eat) = drink 96 | %% generate(drinks, eats) = drinks 97 | %% generate(drinks, ate) = drank 98 | %% generate(drinks, eaten) = drunk 99 | %% generate(drinks, eatable) = drinkable 100 | %% generate(drinks, eatables) = drinkables 101 | %% generate(drinks, phenomena) = drinks 102 | %% generate(undrinkable, phenomena) = undrinkables 103 | %% generate(phenomenon, drinks) = phenomena 104 | -------------------------------------------------------------------------------- /zspell/test-suite/h-needaffix.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/needaffix.aff 3 | 4 | ==== afx ==== 5 | NEEDAFFIX X 6 | COMPOUNDFLAG Y 7 | 8 | SFX A Y 1 9 | SFX A 0 s/Y . 10 | 11 | ==== dic ==== 12 | 2 13 | foo/YXA 14 | bar/Y 15 | 16 | ==== valid ==== 17 | bar 18 | %% FIXME:compound 19 | %% foos 20 | %% barfoos 21 | 22 | ==== invalid ==== 23 | foo 24 | -------------------------------------------------------------------------------- /zspell/test-suite/h-needaffix2.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/needaffix2.aff 3 | 4 | ==== afx ==== 5 | NEEDAFFIX X 6 | COMPOUNDFLAG Y 7 | 8 | ==== dic ==== 9 | 4 10 | foo st:foo id:1 11 | foo/YX st:foo id:2 12 | foo/Y st:foo id:3 13 | bar/Y 14 | 15 | ==== valid ==== 16 | foo 17 | bar 18 | 19 | %% FIXME:compound 20 | %% foobar 21 | %% barfoo 22 | 23 | ==== stem ==== 24 | foo > foo 25 | bar > bar 26 | 27 | %% FIXME:compound 28 | %% foobar > foo 29 | %% barfoo > barfoo 30 | 31 | %% ==== morph ==== 32 | %% foo > st:foo id:1 33 | %% foo > st:foo id:3 34 | %% FIXME:morph:show-stem: seems like morph should always show a `st` entry? 
35 | %% bar > st:bar 36 | 37 | %% FIXME:compound 38 | %% foobar > pa:foo st:foo id:3 pa:bar 39 | %% barfoo > pa:bar st:bar pa:foo st:foo id:3 40 | -------------------------------------------------------------------------------- /zspell/test-suite/h-needaffix3.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/needaffix3.aff 3 | 4 | ==== afx ==== 5 | # needaffix on affixes 6 | NEEDAFFIX X 7 | 8 | SFX A Y 1 9 | SFX A 0 s/XB . 10 | 11 | SFX B Y 1 12 | SFX B 0 baz . 13 | 14 | ==== dic ==== 15 | 2 16 | foo/A 17 | 18 | ==== valid ==== 19 | foo 20 | %% FIXME:unknown 21 | %% foosbaz 22 | 23 | ==== invalid ==== 24 | foos 25 | -------------------------------------------------------------------------------- /zspell/test-suite/h-needaffix4.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/needaffix4.aff 3 | 4 | ==== afx ==== 5 | NEEDAFFIX X 6 | COMPOUNDFLAG Y 7 | 8 | ==== dic ==== 9 | 4 10 | foo/X [1] 11 | foo/Y [2] 12 | foo/YX [3] 13 | bar/Y [4] 14 | 15 | ==== valid ==== 16 | foo 17 | bar 18 | %% FIXME:compound 19 | %% foobar 20 | %% barfoo 21 | -------------------------------------------------------------------------------- /zspell/test-suite/h-needaffix5.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/needaffix5.aff 3 | 4 | ==== afx ==== 5 | # on affixes 6 | NEEDAFFIX X 7 | 8 | SFX A Y 2 9 | SFX A 0 suf/B . 10 | SFX A 0 pseudosuf/XB . 11 | 12 | SFX B Y 1 13 | SFX B 0 bar . 14 | 15 | PFX C Y 2 16 | PFX C 0 pre . 17 | PFX C 0 pseudopre/X . 
18 | 19 | ==== dic ==== 20 | 1 21 | foo/AC 22 | 23 | ==== valid ==== 24 | foo 25 | prefoo 26 | %% FIXME:unknown 27 | %% foosuf 28 | %% prefoosuf 29 | %% foosufbar 30 | %% prefoosufbar 31 | %% pseudoprefoosuf 32 | %% pseudoprefoosufbar 33 | %% pseudoprefoopseudosufbar 34 | %% prefoopseudosuf 35 | %% prefoopseudosufbar 36 | 37 | ==== invalid ==== 38 | pseudoprefoo 39 | foopseudosuf 40 | pseudoprefoopseudosuf 41 | -------------------------------------------------------------------------------- /zspell/test-suite/h-nepali.test: -------------------------------------------------------------------------------- 1 | ==== afx ==== 2 | SET UTF-8 3 | IGNORE ￰ 4 | WORDCHARS ःािीॉॊोौॎॏॕॖॗ‌‍ 5 | 6 | 7 | ICONV 5 8 | ICONV ‌_ ‌ 9 | ICONV र्‌य र्‌य 10 | ICONV र्‌व र्‌व 11 | ICONV ‌ ￰ 12 | ICONV ‍_ ￰ 13 | 14 | 15 | ==== dic ==== 16 | 4 17 | अलम् 18 | क्यार 19 | न्न 20 | र्‌य 21 | 22 | ==== valid ==== 23 | न्न 24 | %% FIXME:unknown 25 | %% न्‌न 26 | %% अलम्‍ 27 | र्‌य 28 | 29 | ==== invalid ==== 30 | र्य 31 | क्‍यार 32 | अलम्‌ 33 | -------------------------------------------------------------------------------- /zspell/test-suite/h-nosuggest.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/nosuggest.aff 3 | 4 | ==== afx ==== 5 | # don't suggest word with NOSUGGEST flag (for example vulgar or obscene words) 6 | # See OpenOffice.org Issue #55498 7 | # (nosuggest.sug is an empty file) 8 | NOSUGGEST A 9 | COMPOUNDFLAG B 10 | 11 | ==== dic ==== 12 | 1 13 | foo/AB 14 | bar/B 15 | 16 | ==== valid ==== 17 | foo 18 | %% FIXME:compound 19 | %% foobar 20 | %% barfoo 21 | 22 | ==== invalid ==== 23 | foox 24 | foobarx 25 | barfoox 26 | 27 | %% FIXME:suggestions 28 | %% ==== suggest ==== 29 | -------------------------------------------------------------------------------- /zspell/test-suite/h-oconv.test: 
-------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/oconv.aff 3 | 4 | ==== afx ==== 5 | # output conversion 6 | SET UTF-8 7 | 8 | # Testing also whitespace and comments. 9 | OCONV 7 # space, space 10 | OCONV a A # tab, space, space 11 | OCONV á Á # tab, tab, space 12 | OCONV b B # tab, tab, tab 13 | OCONV c C # 2xspace, 2xspace, 2xtab 14 | OCONV d D # tab+space, space+tab, space 15 | OCONV e E # 16 | OCONV é É 17 | # Only comment. Note that line above ends with space+tab. 18 | 19 | # space 20 | # 2xspace 21 | # tab 22 | # 2xtab 23 | # space+tab 24 | # tab+space 25 | 26 | ==== dic ==== 27 | 3 28 | bébé 29 | dádá 30 | aábcdeé 31 | 32 | ==== valid ==== 33 | bébé 34 | dádá 35 | 36 | ==== invalid ==== 37 | béb 38 | dád 39 | aábcde 40 | 41 | %% FIXME:suggestions 42 | %% ==== suggest ==== 43 | %% béb > BÉBÉ 44 | %% dád > DÁDÁ 45 | %% aábcde > AÁBCDEÉ 46 | -------------------------------------------------------------------------------- /zspell/test-suite/h-slash.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/slash.aff 3 | 4 | ==== afx ==== 5 | # slashes in words (\/) 6 | 7 | # (only for tokenization) 8 | WORDCHARS /: 9 | 10 | ==== dic ==== 11 | 4 12 | / 13 | 1\/2 14 | http:\/\/ 15 | \/usr\/share\/myspell\/ 16 | 17 | ==== valid ==== 18 | / 19 | %% FIXME:dict-parser 20 | %% 1/2 21 | %% http:// 22 | %% /usr/share/myspell/ 23 | -------------------------------------------------------------------------------- /zspell/test-suite/h-timelimit.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% 
https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/timelimit.aff 3 | 4 | ==== afx ==== 5 | WORDCHARS 01 6 | COMPOUNDMIN 1 7 | COMPOUNDFLAG Y 8 | 9 | ==== dic ==== 10 | 4 11 | 0/Y 12 | 00/Y 13 | 000/Y 14 | 1/Y 15 | 16 | %% FIXME:compound 17 | %% ==== valid ==== 18 | %% 1000000000000000000000 19 | -------------------------------------------------------------------------------- /zspell/test-suite/h-utf8.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/utf8.aff 3 | 4 | ==== afx ==== 5 | SET UTF-8 6 | 7 | SFX A Y 7 8 | SFX A 0 őő . 9 | SFX A 0 ő o 10 | SFX A 0 ő ó 11 | SFX A ó ő ó 12 | SFX A ó őoo ó 13 | SFX A o őo o 14 | SFX A 0 ó [abcdó] 15 | 16 | ==== dic ==== 17 | 2 18 | foo/A 19 | foó/A 20 | 21 | ==== valid ==== 22 | foo 23 | foó 24 | fooőő 25 | fooő 26 | foóő 27 | foő 28 | foőo 29 | foőoo 30 | foóó 31 | -------------------------------------------------------------------------------- /zspell/test-suite/h-utfcoumpound.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/utfcompound.aff 3 | 4 | %% FIXME:compound 5 | %% ==== afx ==== 6 | %% SET UTF-8 7 | %% COMPOUNDMIN 3 8 | %% COMPOUNDFLAG A 9 | %% 10 | %% ==== dic ==== 11 | %% 8 12 | %% foo/A 13 | %% bar/A 14 | %% fóó/A 15 | %% áár/A 16 | %% xy/A 17 | %% yz/A 18 | %% éé/A 19 | %% őő/A 20 | %% 21 | %% ==== valid ==== 22 | %% foobar 23 | %% barfoo 24 | %% foobarfoo 25 | %% fóóáár 26 | %% áárfóó 27 | %% 28 | %% ==== invalid ==== 29 | %% xyyz 30 | %% fooxy 31 | %% xyfoo 32 | %% fooxybar 33 | %% ééőő 34 | %% fóóéé 35 | %% őőáár 36 | -------------------------------------------------------------------------------- 
/zspell/test-suite/h-zeroaffix.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: MPL-1.1 2 | %% https://github.com/hunspell/hunspell/blob/fbf0ce7e4737084fe84e733b031634f4a2c7559f/tests/zeroaffix.aff 3 | 4 | ==== afx ==== 5 | PSEUDOROOT X 6 | COMPOUNDFLAG Y 7 | SFX A Y 1 8 | SFX A 0 0 . > 9 | SFX B Y 1 10 | SFX B 0 0 . > 11 | SFX C Y 2 12 | SFX C 0 0/XAB . 13 | SFX C 0 baz/XAB . 14 | 15 | ==== dic ==== 16 | 2 17 | foo/XA foo 29 | bar > bar 30 | 31 | %% FIXME: compound 32 | %% barbaz > bar 33 | 34 | %% ==== morph ==== 35 | %% > bar 36 | %% analyze(bar) = st:bar > 37 | %% analyze(bar) = st:bar 38 | %% analyze(bar) = st:bar > 39 | %% analyze(bar) = st:bar > 40 | %% stem(bar) = bar 41 | %% > foo 42 | %% analyze(foo) = st:foo 43 | %% stem(foo) = foo 44 | %% > barbaz 45 | %% analyze(barbaz) = st:bar > 46 | %% analyze(barbaz) = st:bar > 47 | %% stem(barbaz) = bar 48 | -------------------------------------------------------------------------------- /zspell/test-suite/i071-number-affixes.test: -------------------------------------------------------------------------------- 1 | %% Based on https://github.com/pluots/zspell/issues/71 2 | 3 | ==== afx ==== 4 | FLAG num 5 | SFX 10 Y 3 6 | SFX 10 0 0 . is:tens 7 | SFX 10 0 00 . is:hundreds 8 | SFX 10 0 000 . 
is:thousands 9 | 10 | ==== dic ==== 11 | 10 12 | 0 po:number 13 | 1/10 po:number 14 | 2/10 po:number 15 | 3/10 po:number 16 | 4/10 po:number 17 | 5/10 po:number 18 | 6/10 po:number 19 | 7/10 po:number 20 | 8/10 po:number 21 | 9/10 po:number 22 | 23 | ==== valid ==== 24 | 1 25 | 10 26 | 100 27 | 1000 28 | 9 29 | 90 30 | 900 31 | 32 | ==== morph ==== 33 | 9 > po:number 34 | 10 > po:number is:tens 35 | 600 > po:number is:hundreds 36 | -------------------------------------------------------------------------------- /zspell/test-suite/i093-separate-dict-afx-flags.test: -------------------------------------------------------------------------------- 1 | %% SPDX-License-Identifier: GPL-2.0-or-later 2 | %% Flags that apply to dictionary items and flags that apply to other things in 3 | %% the affix file have separate namespaces. 4 | %% 5 | %% Examples come from the German dictionary 6 | %% 7 | 8 | 9 | ==== afx ==== 10 | # Affix using `f` 11 | SFX f Y 4 12 | SFX f ph f ph 13 | SFX f ph fen ph 14 | SFX f phie fie phie 15 | SFX f phie fien phie 16 | 17 | # Meta flag using `f` 18 | # Affixes signed with CIRCUMFIX flag may be on a word when this word also has 19 | # a prefix with CIRCUMFIX flag and vice versa. 
20 | # for decapitalizing nouns with fogemorphemes 21 | CIRCUMFIX f 22 | 23 | # Affix that adds the flag `f` 24 | # for Uppercased end-words to prepend - and lowercase: (Tier/EPSm) (EX: Bettbezüge und *-laken*) 25 | # AND 26 | # for lowercased end-words to prepend - and re-uppercase : (tier/EPSozm) (EX: Arbeits*-Tier*) 27 | PFX m Y 1 28 | PFX m f -F/co f 29 | 30 | %% FIXME(added flags): affixes need to be able to add flags 31 | -------------------------------------------------------------------------------- /zspell/test-util/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "test-util" 3 | version = "0.5.5" 4 | edition = "2021" 5 | publish = false 6 | 7 | [dependencies] 8 | zspell = { path = "../", features = ["zspell-unstable"] } 9 | regex = "1.10" 10 | pretty_assertions = "1.4" 11 | -------------------------------------------------------------------------------- /zspell/tests/files/tortoise_hare_misspelled.txt: -------------------------------------------------------------------------------- 1 | A Hare was mking fun of the Tortoise one day for being so slow. 2 | 3 | Do you ever get anywhere? he asked with a mocking laugh. 4 | 5 | Yes, replied the Tortoise, and I get there sooner than you think. I'll 6 | run you a race and prove it. 7 | 8 | The Hare was much amused at the iea of running a race with the Tortise, 9 | but for the fun of the thing he agreed. So the Fox, who had consented to 10 | act as judge, maarked the distance and started the runners off. 11 | 12 | The Hare was soon far out of sight, and to make the Tortoise feel very 13 | deeply how ridiculous it was for him to try a race with a Hare, he lay 14 | down beside the course to take a nap until the Tortoise should catch up. 15 | 16 | The Tortoise meanwhile kept going sloly but steadily, and, after a time, 17 | passed the place where the Hare was sleeping. 
But the Hare slept on very 18 | peacefully; and when at last he did wake up, the Tortoise was near the goal. 19 | The Hare now ran his swiftest, but he could not overtaake the Tortoise 20 | in time. 21 | -------------------------------------------------------------------------------- /zspell/tests/files/w1_eng_short.aff: -------------------------------------------------------------------------------- 1 | # This is a shortened en_US affix file 2 | SET UTF-8 3 | TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' 4 | ICONV 1 5 | ICONV ’ ' 6 | NOSUGGEST ! 7 | 8 | # ordinal numbers 9 | COMPOUNDMIN 1 10 | # only in compounds: 1th, 2th, 3th 11 | ONLYINCOMPOUND c 12 | # compound rules: 13 | # 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.) 14 | # 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.) 15 | COMPOUNDRULE 2 16 | COMPOUNDRULE n*1t 17 | COMPOUNDRULE n*mp 18 | WORDCHARS 0123456789 19 | 20 | PFX A Y 1 21 | PFX A 0 re . 22 | 23 | SFX V N 2 24 | SFX V e ive e 25 | SFX V 0 ive [^e] 26 | 27 | SFX N Y 3 28 | SFX N e ion e 29 | SFX N y ication y 30 | SFX N 0 en [^ey] 31 | 32 | SFX G Y 2 33 | SFX G e ing e 34 | SFX G 0 ing [^e] 35 | 36 | SFX D Y 4 37 | SFX D 0 d e 38 | SFX D y ied [^aeiou]y 39 | SFX D 0 ed [^ey] 40 | SFX D 0 ed [aeiou]y 41 | 42 | SFX T N 4 43 | SFX T 0 st e 44 | SFX T y iest [^aeiou]y 45 | SFX T 0 est [aeiou]y 46 | SFX T 0 est [^ey] 47 | 48 | SFX R Y 4 49 | SFX R 0 r e 50 | SFX R y ier [^aeiou]y 51 | SFX R 0 er [aeiou]y 52 | SFX R 0 er [^ey] 53 | 54 | SFX S Y 4 55 | SFX S y ies [^aeiou]y 56 | SFX S 0 s [aeiou]y 57 | SFX S 0 es [sxzh] 58 | SFX S 0 s [^sxzhy] 59 | 60 | SFX M Y 1 61 | SFX M 0 's . 
62 | 63 | REP 12 64 | REP a ei 65 | REP ei a 66 | REP a ey 67 | REP ey a 68 | REP ai ie 69 | REP ie ai 70 | REP alot a_lot 71 | REP are air 72 | REP are ear 73 | REP are eir 74 | REP air are 75 | REP air ere 76 | -------------------------------------------------------------------------------- /zspell/tests/files/w1_eng_short.dic: -------------------------------------------------------------------------------- 1 | 4 2 | banana/SM 3 | pine/AGDS 4 | pillow/GMDS 5 | reptile/SM 6 | rust/MDGS 7 | okay 8 | I 9 | misspelled 10 | alright 11 | something 12 | well 13 | this 14 | time 15 | -------------------------------------------------------------------------------- /zspell/tests/suite.rs: -------------------------------------------------------------------------------- 1 | // include tests generated by `build.rs`, one test per file in tests/suite 2 | include!(concat!(env!("OUT_DIR"), "/auto_suite.rs")); 3 | --------------------------------------------------------------------------------