├── .gitignore ├── .cargo └── config.toml ├── support ├── accuracy-viewer │ ├── .gitignore │ ├── src │ │ ├── main.js │ │ └── App.svelte │ ├── public │ │ ├── index.html │ │ └── global.css │ ├── package.json │ ├── rollup.config.js │ └── README.md ├── schemas │ └── tokenizer.fbs ├── divvunspell.hpp └── divvunspell.h ├── src ├── ffi │ ├── fbs │ │ └── mod.rs │ └── mod.rs ├── constants.rs ├── archive │ ├── error.rs │ ├── boxf.rs │ ├── mod.rs │ ├── zip.rs │ └── meta.rs ├── lib.rs ├── transducer │ ├── symbol_transition.rs │ ├── hfst │ │ ├── header.rs │ │ ├── index_table.rs │ │ ├── transition_table.rs │ │ ├── alphabet.rs │ │ └── mod.rs │ ├── convert.rs │ ├── alphabet.rs │ ├── mod.rs │ ├── thfst │ │ ├── index_table.rs │ │ ├── transition_table.rs │ │ ├── mod.rs │ │ └── chunked.rs │ └── tree_node.rs ├── speller │ └── suggestion.rs ├── paths.rs ├── vfs.rs ├── tokenizer │ ├── case_handling.rs │ └── mod.rs └── types.rs ├── crates ├── regtest │ ├── Cargo.toml │ └── src │ │ └── main.rs ├── thfst-tools │ ├── Cargo.toml │ └── src │ │ └── main.rs └── accuracy │ ├── Cargo.toml │ └── src │ └── main.rs ├── shell.nix ├── examples └── find-path.rs ├── cli └── Cargo.toml ├── LICENSE-MIT ├── Cargo.toml ├── .taskcluster.yml ├── README.md └── LICENSE-APACHE /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .DS_Store 3 | report.json 4 | tmp 5 | -------------------------------------------------------------------------------- /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [alias] 2 | accuracy-test = "run --bin accuracy --release" 3 | -------------------------------------------------------------------------------- /support/accuracy-viewer/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | public/bundle.* 4 | -------------------------------------------------------------------------------- 
/src/ffi/fbs/mod.rs: -------------------------------------------------------------------------------- 1 | pub(crate) mod tokenizer; 2 | 3 | #[doc(hidden)] 4 | pub trait IntoFlatbuffer { 5 | fn into_flatbuffer(self) -> Vec; 6 | } 7 | -------------------------------------------------------------------------------- /support/accuracy-viewer/src/main.js: -------------------------------------------------------------------------------- 1 | import App from './App.svelte'; 2 | 3 | const app = new App({ 4 | target: document.body, 5 | }); 6 | 7 | export default app; -------------------------------------------------------------------------------- /crates/regtest/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "regtest" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | clap = { version = "4.5.32", features = ["derive"] } 8 | csv = "1.3.1" 9 | divvunspell = { features = ["internal_convert", "compression"], path = "../.." 
} 10 | -------------------------------------------------------------------------------- /support/schemas/tokenizer.fbs: -------------------------------------------------------------------------------- 1 | table IndexedWord { 2 | index: uint64; 3 | value: string; 4 | } 5 | 6 | table WordContext { 7 | current: IndexedWord (required); 8 | first_before: IndexedWord; 9 | second_before: IndexedWord; 10 | first_after: IndexedWord; 11 | second_after: IndexedWord; 12 | } 13 | 14 | root_type WordContext; 15 | -------------------------------------------------------------------------------- /support/accuracy-viewer/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Accuracy test 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | fenix = import (fetchTarball "https://github.com/nix-community/fenix/archive/main.tar.gz") { }; 3 | pkgs = import {}; 4 | in 5 | pkgs.mkShell { 6 | buildInputs = [ 7 | (fenix.complete.withComponents [ 8 | "cargo" 9 | "clippy" 10 | "rust-src" 11 | "rustc" 12 | "rustfmt" 13 | ]) 14 | pkgs.openssl 15 | pkgs.pkg-config 16 | pkgs.libtorch-bin 17 | ]; 18 | } 19 | -------------------------------------------------------------------------------- /examples/find-path.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let tag_arg = match std::env::args().skip(1).next() { 3 | Some(v) => v, 4 | None => { 5 | eprintln!("No tag passed."); 6 | return; 7 | } 8 | }; 9 | 10 | let tag = tag_arg.parse().expect("Invalid tag"); 11 | 12 | match divvunspell::paths::find_speller_path(tag) { 13 | Some(v) => println!("Found: {}", v.display()), 14 | None => println!("Not found!"), 15 | } 16 | } 17 | 
-------------------------------------------------------------------------------- /crates/thfst-tools/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thfst-tools" 3 | description = "Support tools for DivvunSpell - convert ZHFST files to BHFST." 4 | version = "1.0.0-beta.3" 5 | authors = ["Brendan Molloy "] 6 | edition = "2024" 7 | license = "GPL-3.0" 8 | repository = "https://github.com/divvun/divvunspell" 9 | 10 | [dependencies] 11 | serde_json = "1.0.57" 12 | divvunspell = { features = ["internal_convert", "compression"], path = "../.." } 13 | box-format = "0.3.2" 14 | clap = { version = "4.5", features = ["derive"] } 15 | tempfile = "3" 16 | -------------------------------------------------------------------------------- /crates/accuracy/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "accuracy" 3 | version = "1.0.0-beta.1" 4 | authors = ["Brendan Molloy "] 5 | edition = "2024" 6 | license = "GPL-3.0" 7 | publish = false 8 | 9 | [dependencies] 10 | tracing-subscriber = "0.3" 11 | serde = { version = "1.0.116", features = ["derive"] } 12 | serde_json = "1.0.57" 13 | divvunspell = { features = ["internal_convert", "compression"], path = "../.." } 14 | csv = { version = "1.1" } 15 | rayon = { version = "1.4.0" } 16 | indicatif = { version = "0.15", features = ["with_rayon"] } 17 | clap = { version = "4.5", features = ["derive"] } 18 | distance = "0.4.0" 19 | chrono = "0.4.19" 20 | -------------------------------------------------------------------------------- /cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "divvunspell-cli" 3 | description = "Spellchecker for ZHFST/BHFST spellers, with case handling and tokenization support." 
4 | version = "1.0.0" 5 | authors = ["Brendan Molloy "] 6 | edition = "2024" 7 | license = "GPL-3.0" 8 | repository = "https://github.com/divvun/divvunspell" 9 | 10 | [[bin]] 11 | name = "divvunspell" 12 | path = "src/main.rs" 13 | 14 | [dependencies] 15 | divvunspell = { features = ["internal_convert", "compression"], path = ".." } 16 | tracing-subscriber = "0.3" 17 | 18 | serde.workspace = true 19 | serde_json.workspace = true 20 | box-format.workspace = true 21 | clap.workspace = true 22 | anyhow.workspace = true 23 | -------------------------------------------------------------------------------- /support/accuracy-viewer/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "svelte-app", 3 | "version": "1.0.0", 4 | "devDependencies": { 5 | "npm-run-all": "^4.1.5", 6 | "rollup": "^1.21.2", 7 | "rollup-plugin-commonjs": "^10.1.0", 8 | "rollup-plugin-livereload": "^1.0.0", 9 | "rollup-plugin-node-resolve": "^5.2.0", 10 | "rollup-plugin-svelte": "^5.0.3", 11 | "rollup-plugin-terser": "^4.0.4", 12 | "svelte": "^3.12.1" 13 | }, 14 | "dependencies": { 15 | "sirv-cli": "^0.4.4" 16 | }, 17 | "scripts": { 18 | "build": "rollup -c", 19 | "autobuild": "rollup -c -w", 20 | "dev": "run-p start:dev autobuild", 21 | "start": "sirv public --single", 22 | "start:dev": "sirv public --single --dev" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/constants.rs: -------------------------------------------------------------------------------- 1 | use crate::types::TransitionTableIndex; 2 | 3 | pub(crate) const INDEX_TABLE_SIZE: usize = 6; 4 | pub(crate) const TRANS_TABLE_SIZE: usize = 12; 5 | pub(crate) const TARGET_TABLE: TransitionTableIndex = TransitionTableIndex(2_147_483_648); 6 | 7 | #[cfg(test)] 8 | mod tests { 9 | #![allow(non_snake_case)] 10 | use super::*; 11 | 12 | #[test] 13 | fn test_INDEX_TABLE_SIZE() { 14 | use crate::types::*; 15 | use std::mem; 16 | 17 | 
let c = mem::size_of::() + mem::size_of::(); 18 | 19 | assert!(INDEX_TABLE_SIZE == c); 20 | } 21 | 22 | #[test] 23 | fn test_TRANS_TABLE_SIZE() { 24 | use crate::types::*; 25 | use std::mem; 26 | 27 | let c = 2 * mem::size_of::() 28 | + mem::size_of::() 29 | + mem::size_of::(); 30 | 31 | assert!(TRANS_TABLE_SIZE == c); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /support/accuracy-viewer/public/global.css: -------------------------------------------------------------------------------- 1 | html, body { 2 | position: relative; 3 | width: 100%; 4 | height: 100%; 5 | } 6 | 7 | body { 8 | color: #333; 9 | margin: 0; 10 | padding: 8px; 11 | box-sizing: border-box; 12 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", sans-serif; 13 | } 14 | 15 | a { 16 | color: rgb(0,100,200); 17 | text-decoration: none; 18 | } 19 | 20 | a:hover { 21 | text-decoration: underline; 22 | } 23 | 24 | a:visited { 25 | color: rgb(0,80,160); 26 | } 27 | 28 | label { 29 | display: block; 30 | } 31 | 32 | input, button, select, textarea { 33 | font-family: inherit; 34 | font-size: inherit; 35 | padding: 0.4em; 36 | margin: 0 0 0.5em 0; 37 | box-sizing: border-box; 38 | border: 1px solid #ccc; 39 | border-radius: 2px; 40 | } 41 | 42 | input:disabled { 43 | color: #ccc; 44 | } 45 | 46 | input[type="range"] { 47 | height: 0; 48 | } 49 | 50 | button { 51 | color: #333; 52 | background-color: #f4f4f4; 53 | outline: none; 54 | } 55 | 56 | button:active { 57 | background-color: #ddd; 58 | } 59 | 60 | button:focus { 61 | border-color: #666; 62 | } 63 | -------------------------------------------------------------------------------- /src/archive/error.rs: -------------------------------------------------------------------------------- 1 | //! Archive-related errors. 
2 | use std::{ffi::OsString, io::Error}; 3 | 4 | use crate::transducer::TransducerError; 5 | 6 | /// Errors that can occur when opening or using a speller archive. 7 | #[derive(Debug, thiserror::Error)] 8 | #[non_exhaustive] 9 | pub enum SpellerArchiveError { 10 | /// Error opening or reading the archive file 11 | #[error("File error")] 12 | File(#[source] Error), 13 | 14 | /// I/O error while reading archive contents 15 | #[error("IO error")] 16 | Io(String, #[source] eieio::Error), 17 | 18 | /// Error loading or parsing a transducer from the archive 19 | #[error("Transducer error")] 20 | Transducer(#[source] TransducerError), 21 | 22 | /// Archive is missing required metadata 23 | #[error("Missing metadata")] 24 | NoMetadata, 25 | 26 | /// Archive uses unsupported compression 27 | #[error("Unsupported compression")] 28 | UnsupportedCompressed, 29 | 30 | /// Unknown error code encountered 31 | #[error("Unknown error code {0}")] 32 | Unknown(u8), 33 | 34 | /// File has an unsupported extension (expected .zhfst or .bhfst) 35 | #[error("Unsupported file extension: {0:?}")] 36 | UnsupportedExt(OsString), 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017-2025 Brendan Molloy 2 | Copyright (c) 2018-2025 UiT The Arctic University of Norway 3 | Copyright (c) 2018-2025 Sámediggi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 
all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! Spell-checking and correction with Finite-State Automata. 2 | 3 | Implements spell-checking and correction using weighted finite-state 4 | automata. The automata can be compiled using [`HFST`], 5 | this library is originally based on C++ code in [`HFST 6 | ospell`] 7 | 8 | [`HFST`]: (https://hfst.github.io) 9 | [`HFST ospell`]: (https://github.com/hfst/hfst-ospell) 10 | 11 | Further examples of how to use divvunspell library can be found in the 12 | [`cli`] in the same repository. 13 | 14 | [`cli`]: (https://github.com/divvun/divvunspell) 15 | */ 16 | 17 | // #![warn(missing_docs)] 18 | 19 | #![deny(unsafe_op_in_unsafe_fn)] 20 | 21 | pub mod archive; 22 | #[cfg(feature = "internal_ffi")] 23 | pub mod ffi; 24 | 25 | pub mod paths; 26 | pub mod speller; 27 | pub mod tokenizer; 28 | pub mod transducer; 29 | 30 | /// Virtual filesystem abstraction (internal use only) 31 | /// 32 | /// **Warning:** This module is only for internal tooling use and should not be used in normal applications. 33 | /// It may be removed or significantly changed in a future version without a major version bump. 34 | /// Use the higher-level [`archive`] module APIs instead. 
35 | #[doc(hidden)] 36 | pub mod vfs; 37 | 38 | pub(crate) mod constants; 39 | /// Core types for transducers and spell-checking. 40 | /// 41 | /// This module contains type aliases and enums used throughout the transducer API. 42 | pub mod types; 43 | -------------------------------------------------------------------------------- /src/transducer/symbol_transition.rs: -------------------------------------------------------------------------------- 1 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 2 | 3 | /// Represents a transition in a finite-state transducer. 4 | /// 5 | /// A transition connects states in the FST and carries a symbol and weight. 6 | #[derive(Debug, Clone)] 7 | pub struct SymbolTransition { 8 | /// Target state index, or None if this is a final state 9 | pub target: Option, 10 | /// Input/output symbol number 11 | pub symbol: Option, 12 | /// Transition weight 13 | pub weight: Option, 14 | } 15 | 16 | impl SymbolTransition { 17 | pub fn new( 18 | target: Option, 19 | symbol: Option, 20 | weight: Option, 21 | ) -> SymbolTransition { 22 | SymbolTransition { 23 | target, 24 | symbol, 25 | weight, 26 | } 27 | } 28 | 29 | #[inline(always)] 30 | pub fn target(&self) -> Option { 31 | self.target 32 | } 33 | 34 | #[inline(always)] 35 | pub fn symbol(&self) -> Option { 36 | self.symbol 37 | } 38 | 39 | #[inline(always)] 40 | pub fn weight(&self) -> Option { 41 | self.weight 42 | } 43 | 44 | #[inline(always)] 45 | pub fn clone_with_epsilon_symbol(&self) -> SymbolTransition { 46 | SymbolTransition { 47 | target: self.target, 48 | symbol: Some(SymbolNumber(0)), 49 | weight: self.weight, 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /support/accuracy-viewer/rollup.config.js: -------------------------------------------------------------------------------- 1 | import svelte from 'rollup-plugin-svelte'; 2 | import resolve from 'rollup-plugin-node-resolve'; 3 | import commonjs from 
'rollup-plugin-commonjs'; 4 | import livereload from 'rollup-plugin-livereload'; 5 | import { terser } from 'rollup-plugin-terser'; 6 | 7 | const production = !process.env.ROLLUP_WATCH; 8 | 9 | export default { 10 | input: 'src/main.js', 11 | output: { 12 | sourcemap: true, 13 | format: 'iife', 14 | name: 'app', 15 | file: 'public/bundle.js' 16 | }, 17 | plugins: [ 18 | svelte({ 19 | // enable run-time checks when not in production 20 | dev: !production, 21 | // we'll extract any component CSS out into 22 | // a separate file — better for performance 23 | css: css => { 24 | css.write('public/bundle.css'); 25 | } 26 | }), 27 | 28 | // If you have external dependencies installed from 29 | // npm, you'll most likely need these plugins. In 30 | // some cases you'll need additional configuration — 31 | // consult the documentation for details: 32 | // https://github.com/rollup/rollup-plugin-commonjs 33 | resolve({ 34 | browser: true, 35 | dedupe: importee => importee === 'svelte' || importee.startsWith('svelte/') 36 | }), 37 | commonjs(), 38 | 39 | // Watch the `public` directory and refresh the 40 | // browser on changes when not in production 41 | !production && livereload('public'), 42 | 43 | // If we're building for production (npm run build 44 | // instead of npm run dev), minify 45 | production && terser() 46 | ], 47 | watch: { 48 | clearScreen: false 49 | } 50 | }; 51 | -------------------------------------------------------------------------------- /support/accuracy-viewer/README.md: -------------------------------------------------------------------------------- 1 | *Psst — looking for a shareable component template? Go here --> [sveltejs/component-template](https://github.com/sveltejs/component-template)* 2 | 3 | --- 4 | 5 | # svelte app 6 | 7 | This is a project template for [Svelte](https://svelte.dev) apps. It lives at https://github.com/sveltejs/template. 
8 | 9 | To create a new project based on this template using [degit](https://github.com/Rich-Harris/degit): 10 | 11 | ```bash 12 | npx degit sveltejs/template svelte-app 13 | cd svelte-app 14 | ``` 15 | 16 | *Note that you will need to have [Node.js](https://nodejs.org) installed.* 17 | 18 | 19 | ## Get started 20 | 21 | Install the dependencies... 22 | 23 | ```bash 24 | cd svelte-app 25 | npm install 26 | ``` 27 | 28 | ...then start [Rollup](https://rollupjs.org): 29 | 30 | ```bash 31 | npm run dev 32 | ``` 33 | 34 | Navigate to [localhost:5000](http://localhost:5000). You should see your app running. Edit a component file in `src`, save it, and reload the page to see your changes. 35 | 36 | 37 | ## Deploying to the web 38 | 39 | ### With [now](https://zeit.co/now) 40 | 41 | Install `now` if you haven't already: 42 | 43 | ```bash 44 | npm install -g now 45 | ``` 46 | 47 | Then, from within your project folder: 48 | 49 | ```bash 50 | cd public 51 | now 52 | ``` 53 | 54 | As an alternative, use the [Now desktop client](https://zeit.co/download) and simply drag the unzipped project folder to the taskbar icon. 55 | 56 | ### With [surge](https://surge.sh/) 57 | 58 | Install `surge` if you haven't already: 59 | 60 | ```bash 61 | npm install -g surge 62 | ``` 63 | 64 | Then, from within your project folder: 65 | 66 | ```bash 67 | npm run build 68 | surge public 69 | ``` 70 | -------------------------------------------------------------------------------- /src/speller/suggestion.rs: -------------------------------------------------------------------------------- 1 | //! Suggestion for a spelling correction. 
2 | use crate::types::Weight; 3 | use serde::{Deserialize, Serialize}; 4 | use smol_str::SmolStr; 5 | use std::cmp::Ordering; 6 | use std::cmp::Ordering::Equal; 7 | 8 | #[derive(Clone, Debug, Serialize, Deserialize)] 9 | /// Suggestion for a spelling correction 10 | pub struct Suggestion { 11 | /// the suggested word-form 12 | pub value: SmolStr, 13 | /// total penalty weight of the word-form 14 | pub weight: Weight, 15 | /// whether the word is completed or partial 16 | #[serde(skip_serializing_if = "Option::is_none")] 17 | pub completed: Option, 18 | } 19 | 20 | impl Suggestion { 21 | /// creates a spelling correction suggestion 22 | pub fn new(value: SmolStr, weight: Weight, completed: Option) -> Suggestion { 23 | Suggestion { 24 | value, 25 | weight, 26 | completed, 27 | } 28 | } 29 | 30 | /// gets the suggested word-form 31 | pub fn value(&self) -> &str { 32 | &self.value 33 | } 34 | 35 | /// gets the penalty weight of the suggestion 36 | pub fn weight(&self) -> Weight { 37 | self.weight 38 | } 39 | 40 | /// returns whether this suggestion is a full word or partial 41 | pub fn completed(&self) -> Option { 42 | self.completed 43 | } 44 | } 45 | 46 | impl PartialOrd for Suggestion { 47 | fn partial_cmp(&self, other: &Self) -> Option { 48 | Some(self.cmp(other)) 49 | } 50 | } 51 | 52 | impl Ord for Suggestion { 53 | fn cmp(&self, other: &Self) -> Ordering { 54 | let x = self.weight.partial_cmp(&other.weight).unwrap_or(Equal); 55 | 56 | if let Equal = x { 57 | return self.value.cmp(&other.value); 58 | } 59 | 60 | x 61 | } 62 | } 63 | 64 | impl PartialEq for Suggestion { 65 | fn eq(&self, other: &Self) -> bool { 66 | self.value == other.value && self.weight == other.weight 67 | } 68 | } 69 | 70 | impl Eq for Suggestion {} 71 | -------------------------------------------------------------------------------- /src/transducer/hfst/header.rs: -------------------------------------------------------------------------------- 1 | use byteorder::{LittleEndian, ReadBytesExt}; 
2 | use std::io::Cursor; 3 | 4 | use crate::types::{HeaderFlag, SymbolNumber, TransitionTableIndex}; 5 | 6 | #[derive(Debug)] 7 | pub struct TransducerHeader { 8 | symbols: SymbolNumber, 9 | input_symbols: SymbolNumber, 10 | trans_index_table: TransitionTableIndex, 11 | trans_target_table: TransitionTableIndex, 12 | states: TransitionTableIndex, 13 | transitions: TransitionTableIndex, 14 | 15 | properties: [bool; 9], 16 | header_size: usize, 17 | } 18 | 19 | #[allow(clippy::len_without_is_empty)] 20 | impl TransducerHeader { 21 | pub fn new(buf: &[u8]) -> TransducerHeader { 22 | let mut rdr = Cursor::new(buf); 23 | 24 | // Skip HFST string 25 | rdr.set_position(5); 26 | 27 | let header_len = rdr.read_u16::().unwrap(); 28 | 29 | rdr.set_position(8); 30 | 31 | let pos = rdr.position() + u64::from(header_len); 32 | rdr.set_position(pos); 33 | 34 | let input_symbols = SymbolNumber(rdr.read_u16::().unwrap()); 35 | let symbols = SymbolNumber(rdr.read_u16::().unwrap()); 36 | let trans_index_table = TransitionTableIndex(rdr.read_u32::().unwrap()); 37 | let trans_target_table = TransitionTableIndex(rdr.read_u32::().unwrap()); 38 | let states = TransitionTableIndex(rdr.read_u32::().unwrap()); 39 | let transitions = TransitionTableIndex(rdr.read_u32::().unwrap()); 40 | 41 | let mut props = [false; 9]; 42 | 43 | for prop in props.iter_mut() { 44 | let v = rdr.read_u32::().unwrap(); 45 | *prop = v != 0 46 | } 47 | 48 | TransducerHeader { 49 | symbols, 50 | input_symbols, 51 | trans_index_table, 52 | trans_target_table, 53 | states, 54 | transitions, 55 | properties: props, 56 | header_size: rdr.position() as usize, 57 | } 58 | } 59 | 60 | pub fn symbol_count(&self) -> SymbolNumber { 61 | self.symbols 62 | } 63 | 64 | pub fn input_symbol_count(&self) -> SymbolNumber { 65 | self.input_symbols 66 | } 67 | 68 | pub fn index_table_size(&self) -> TransitionTableIndex { 69 | self.trans_index_table 70 | } 71 | 72 | pub fn target_table_size(&self) -> TransitionTableIndex { 73 | 
self.trans_target_table 74 | } 75 | 76 | pub fn has_flag(&self, flag: HeaderFlag) -> bool { 77 | self.properties[flag as usize] 78 | } 79 | 80 | pub fn states(&self) -> TransitionTableIndex { 81 | self.states 82 | } 83 | 84 | pub fn transitions(&self) -> TransitionTableIndex { 85 | self.transitions 86 | } 87 | 88 | pub fn properties(&self) -> &[bool; 9] { 89 | &self.properties 90 | } 91 | 92 | pub fn len(&self) -> usize { 93 | self.header_size as usize 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "divvunspell" 3 | description = "Spell checking library for ZHFST/BHFST spellers, with case handling and tokenization support." 4 | version = "1.0.0-beta.7" 5 | authors = ["Brendan Molloy "] 6 | edition = "2024" 7 | license = "MIT OR Apache-2.0" 8 | repository = "https://github.com/divvun/divvunspell" 9 | 10 | [lib] 11 | name = "divvunspell" 12 | crate-type = ["rlib", "staticlib", "cdylib"] 13 | 14 | [workspace.dependencies] 15 | libc = "0.2" 16 | memmap2 = "0.9.4" 17 | byteorder = "1.3.4" 18 | serde = { version = "1.0.116", features = ["derive"] } 19 | serde_json = "1.0.57" 20 | serde-xml-rs = { version = "=0.6.0", default-features = false } 21 | zip = { version = "6.0.0", default-features = false } 22 | unic-segment = "0.9.0" 23 | unic-char-range = "0.9.0" 24 | unic-char-property = "0.9.0" 25 | unic-ucd-category = "0.9.0" 26 | unic-emoji-char = "0.9.0" 27 | parking_lot = "0.12.5" 28 | hashbrown = { version = "0.16", features = ["serde"] } 29 | lifeguard = "0.6.1" 30 | smol_str = { version = "0.3.2", features = ["serde"] } 31 | box-format = { version = "0.3.2", features = ["reader"], default-features = false } 32 | itertools = "0.14.0" 33 | strsim = "0.11.1" 34 | log = "0.4.11" 35 | unic-ucd-common = "0.9.0" 36 | # env_logger = { version = "0.11.8", optional = true } 37 | thiserror = "2.0.17" 38 | 
tempfile = "3.3.0" 39 | fs_extra = "1.2.0" 40 | eieio = "1.0.0" 41 | pathos = "0.3.0" 42 | language-tags = "0.3.2" 43 | globwalk = "0.9.1" 44 | tracing = "0.1.37" 45 | clap = { version = "4.5", features = ["derive"] } 46 | anyhow = "1" 47 | 48 | [dependencies] 49 | cffi = { git = "https://github.com/cffi-rs/cffi", branch = "next", optional = true } 50 | flatbuffers = { version = "25.9.23", optional = true } 51 | libc.workspace = true 52 | memmap2.workspace = true 53 | byteorder.workspace = true 54 | serde.workspace = true 55 | serde_json.workspace = true 56 | serde-xml-rs.workspace = true 57 | zip.workspace = true 58 | unic-segment.workspace = true 59 | unic-char-range.workspace = true 60 | unic-char-property.workspace = true 61 | unic-ucd-category.workspace = true 62 | unic-emoji-char.workspace = true 63 | parking_lot.workspace = true 64 | hashbrown.workspace = true 65 | lifeguard.workspace = true 66 | smol_str.workspace = true 67 | box-format.workspace = true 68 | itertools.workspace = true 69 | strsim.workspace = true 70 | unic-ucd-common.workspace = true 71 | thiserror.workspace = true 72 | tempfile.workspace = true 73 | fs_extra.workspace = true 74 | eieio.workspace = true 75 | pathos.workspace = true 76 | language-tags.workspace = true 77 | globwalk.workspace = true 78 | tracing.workspace = true 79 | 80 | [features] 81 | compression = ["zip/deflate"] 82 | 83 | # Internal features: unstable, not for external use! 84 | internal_convert = [] 85 | internal_ffi = ["flatbuffers", "cffi"] 86 | 87 | [workspace] 88 | resolver = "2" 89 | members = [ 90 | ".", 91 | "cli", 92 | "crates/*" 93 | ] 94 | 95 | [profile.dev] 96 | opt-level = 1 97 | 98 | [profile.release] 99 | debug = true 100 | codegen-units = 1 101 | lto = "fat" 102 | -------------------------------------------------------------------------------- /src/paths.rs: -------------------------------------------------------------------------------- 1 | //! 
Platform-specific paths for locating installed spell-checkers. 2 | //! 3 | //! Provides functions to find spell-checker files in standard system locations 4 | //! based on language tags. 5 | #[cfg(target_os = "macos")] 6 | use std::path::PathBuf; 7 | #[cfg(target_os = "windows")] 8 | use std::path::PathBuf; 9 | #[cfg(target_os = "linux")] 10 | use std::path::PathBuf; 11 | 12 | #[cfg(target_os = "macos")] 13 | use language_tags::LanguageTag; 14 | #[cfg(target_os = "windows")] 15 | use language_tags::LanguageTag; 16 | #[cfg(target_os = "linux")] 17 | use language_tags::LanguageTag; 18 | 19 | #[cfg(target_os = "macos")] 20 | /// Find a spell-checker file for the given language tag on macOS. 21 | /// 22 | /// Searches for `.bhfst` or `.zhfst` files matching the language tag in: 23 | /// 1. User services directory (`~/Library/Services`) 24 | /// 2. System services directory (`/Library/Services`) 25 | /// 26 | /// # Arguments 27 | /// 28 | /// * `tag` - BCP 47 language tag (e.g., "en-US", "se") 29 | /// 30 | /// # Returns 31 | /// 32 | /// The path to the spell-checker file if found, or `None` if not found. 33 | pub fn find_speller_path(tag: LanguageTag) -> Option { 34 | let tag = tag.to_string(); 35 | let pattern = format!("{tag}.{{bhfst,zhfst}}"); 36 | if let Ok(path) = pathos::macos::user::services_dir() { 37 | match globwalk::GlobWalkerBuilder::new(path, &pattern) 38 | .build() 39 | .unwrap() 40 | .into_iter() 41 | .filter_map(Result::ok) 42 | .next() 43 | { 44 | Some(v) => return Some(v.path().to_path_buf()), 45 | None => {} 46 | } 47 | } 48 | 49 | globwalk::GlobWalkerBuilder::new(pathos::macos::system::services_dir(), &pattern) 50 | .build() 51 | .unwrap() 52 | .into_iter() 53 | .filter_map(Result::ok) 54 | .next() 55 | .map(|v| v.path().to_path_buf()) 56 | } 57 | 58 | #[cfg(windows)] 59 | /// Find a spell-checker file for the given language tag on Windows. 
60 | /// 61 | /// Searches for `.bhfst` or `.zhfst` files matching the language tag in 62 | /// `C:\Program Files\WinDivvun\spellers`. 63 | /// 64 | /// # Arguments 65 | /// 66 | /// * `tag` - BCP 47 language tag (e.g., "en-US", "se") 67 | /// 68 | /// # Returns 69 | /// 70 | /// The path to the spell-checker file if found, or `None` if not found. 71 | pub fn find_speller_path(tag: LanguageTag) -> Option { 72 | let tag = tag.to_string(); 73 | let pattern = format!("{tag}.{{bhfst,zhfst}}"); 74 | 75 | globwalk::GlobWalkerBuilder::new(r"C:\Program Files\WinDivvun\spellers", &pattern) 76 | .build() 77 | .unwrap() 78 | .into_iter() 79 | .filter_map(Result::ok) 80 | .next() 81 | .map(|x| x.path().to_path_buf()) 82 | } 83 | 84 | #[cfg(target_os = "linux")] 85 | /// Find a spell-checker file for the given language tag on Linux. 86 | /// 87 | /// # Arguments 88 | /// 89 | /// * `tag` - BCP 47 language tag (e.g., "en-US", "se") 90 | /// 91 | /// # Returns 92 | /// 93 | /// Currently always returns `None` as no standard paths are defined for Linux. 
94 | pub fn find_speller_path(tag: LanguageTag) -> Option { 95 | None 96 | } 97 | -------------------------------------------------------------------------------- /support/divvunspell.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef _DIVVUNSPELL_H 3 | #define _DIVVUNSPELL_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "divvunspell.h" 11 | 12 | namespace divvunspell { 13 | 14 | struct Suggestion { 15 | std::string value; 16 | float weight; 17 | 18 | Suggestion(std::string value, float weight) : value(value), weight(weight) {} 19 | }; 20 | 21 | class SpellerError : public std::exception { 22 | std::string message; 23 | 24 | public: 25 | SpellerError(std::string message) : message(message.c_str()) { 26 | } 27 | 28 | const char* what() const throw () { 29 | return message.c_str(); 30 | } 31 | }; 32 | 33 | struct SpellerConfig { 34 | std::size_t nBest; 35 | float maxWeight; 36 | float beam; 37 | 38 | SpellerConfig(std::size_t nBest, float maxWeight, float beam) : nBest(nBest), maxWeight(maxWeight), beam(beam) {} 39 | SpellerConfig() : nBest(5), maxWeight(20000.0), beam(0.0) {} 40 | }; 41 | 42 | class SpellerArchive { 43 | private: 44 | speller_t* handle; 45 | 46 | SpellerArchive(std::string path); 47 | public: 48 | ~SpellerArchive(); 49 | static std::shared_ptr create(std::string path); 50 | std::string locale(); 51 | bool isCorrect(std::string word); 52 | std::vector suggest(std::string word); 53 | std::vector suggest(std::string word, SpellerConfig config); 54 | }; 55 | 56 | std::shared_ptr SpellerArchive::create(std::string path) { 57 | return std::shared_ptr(new SpellerArchive(path)); 58 | } 59 | 60 | std::string SpellerArchive::locale() { 61 | auto c_locale = speller_meta_get_locale(handle); 62 | std::string locale = std::string(c_locale); 63 | speller_str_free(c_locale); 64 | return locale; 65 | } 66 | 67 | bool SpellerArchive::isCorrect(std::string word) { 68 | return 
speller_is_correct(handle, word.c_str()); 69 | } 70 | 71 | std::vector SpellerArchive::suggest(std::string word, SpellerConfig config) { 72 | auto vec_handle = speller_suggest(handle, word.c_str(), config.nBest, config.maxWeight, config.beam); 73 | auto len = suggest_vec_len(vec_handle); 74 | 75 | std::vector out_vector; 76 | 77 | for (auto i = 0; i < len; ++i) { 78 | auto c_value = suggest_vec_get_value(vec_handle, i); 79 | auto weight = suggest_vec_get_weight(vec_handle, i); 80 | std::string value(c_value); 81 | suggest_vec_value_free(c_value); 82 | out_vector.push_back(Suggestion(value, weight)); 83 | } 84 | 85 | suggest_vec_free(vec_handle); 86 | 87 | return out_vector; 88 | } 89 | 90 | std::vector SpellerArchive::suggest(std::string word) { 91 | return suggest(word, SpellerConfig()); 92 | } 93 | 94 | SpellerArchive::SpellerArchive(std::string path) { 95 | char* error = nullptr; 96 | speller_t* handle = speller_archive_new(path.c_str(), &error); 97 | 98 | if (handle == NULL) { 99 | auto msg = std::string(error); 100 | speller_str_free(error); 101 | throw new SpellerError(msg); 102 | } 103 | 104 | this->handle = handle; 105 | } 106 | 107 | SpellerArchive::~SpellerArchive() { 108 | speller_archive_free(handle); 109 | } 110 | 111 | } 112 | #endif -------------------------------------------------------------------------------- /.taskcluster.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | reporting: checks-v1 3 | policy: 4 | pullRequests: public 5 | tasks: 6 | $let: 7 | head_rev: 8 | $if: tasks_for == "github-pull-request" 9 | then: ${event.pull_request.head.sha} 10 | else: ${event.after} 11 | repository: 12 | $if: tasks_for == "github-pull-request" 13 | then: ${event.pull_request.head.repo.html_url} 14 | else: ${event.repository.html_url} 15 | repo_name: 16 | $if: tasks_for == "github-pull-request" 17 | then: ${event.pull_request.head.repo.name} 18 | else: ${event.repository.name} 19 | repo_full_name: 20 | 
$if: tasks_for == "github-pull-request" 21 | then: ${event.pull_request.head.repo.full_name} 22 | else: ${event.repository.full_name} 23 | branch: 24 | $if: tasks_for == "github-pull-request" 25 | then: "pull-request" 26 | else: 27 | $if: "event.ref[:11] == 'refs/heads/'" 28 | then: "branch:${event.ref[11:]}" 29 | else: 30 | $if: "event.ref[:10] == 'refs/tags/'" 31 | then: "tag:${event.ref[10:]}" 32 | else: "unknown" 33 | git_ref: 34 | $if: tasks_for == "github-pull-request" 35 | then: "refs/pull/${event.pull_request.number}" 36 | else: ${event.ref} 37 | should_run: 38 | $match: 39 | (tasks_for == "github-push") || (tasks_for == "github-pull-request" && event["action"] in ["opened","reopened","synchronize"]): 40 | in: 41 | $if: should_run 42 | then: 43 | - metadata: 44 | name: Decision task 45 | description: '' 46 | owner: ${event.sender.login}@users.noreply.github.com 47 | source: 48 | $if: tasks_for == "github-pull-request" 49 | then: ${event.pull_request.html_url} 50 | else: ${event.compare} 51 | tags: 52 | git_ref: ${git_ref} 53 | projectId: "divvun" 54 | taskQueueId: divvun/linux 55 | deadline: 56 | $fromNow: 1 day 57 | scopes: 58 | - "assume:repo:github.com/${repo_full_name}:${branch}" 59 | - "object:upload:divvun:*" 60 | payload: 61 | maxRunTime: 3600 62 | image: "ghcr.io/divvun/taskcluster-decision-task-image:main" 63 | features: 64 | # Needed for the decision task to create other tasks 65 | taskclusterProxy: true 66 | command: 67 | - /bin/bash 68 | - '--login' 69 | - '-e' 70 | - '-c' 71 | - >- 72 | git init ci && 73 | cd ci && 74 | git fetch --depth 1 "$CI_REPO_URL" "$CI_REPO_REF" && 75 | git reset --hard FETCH_HEAD && 76 | python3 decision_task.py 77 | env: 78 | GIT_URL: ${event.repository.clone_url} 79 | TASK_FOR: ${tasks_for} 80 | GIT_REF: ${git_ref} 81 | GITHUB_REF: ${git_ref} 82 | GIT_SHA: 83 | $if: tasks_for == "github-pull-request" 84 | then: ${event.pull_request.head.sha} 85 | else: ${event.after} 86 | TASK_OWNER: 
${event.sender.login}@users.noreply.github.com 87 | TASK_SOURCE: 88 | $if: tasks_for == "github-pull-request" 89 | then: ${event.pull_request.html_url} 90 | else: ${event.compare} 91 | REPO_NAME: ${repo_name} 92 | REPO_FULL_NAME: ${repo_full_name} 93 | CI_REPO_URL: https://github.com/divvun/taskcluster-scripts.git 94 | CI_REPO_REF: main 95 | -------------------------------------------------------------------------------- /src/transducer/convert.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::{BufWriter, prelude::*}; 3 | use std::path::Path; 4 | 5 | use byteorder::{LittleEndian, WriteBytesExt}; 6 | 7 | use super::hfst; 8 | use super::thfst; 9 | use crate::transducer::Transducer; 10 | use crate::types::{SymbolNumber, TransitionTableIndex}; 11 | 12 | pub trait ConvertFile { 13 | fn convert_file(transducer: &T, path: &Path) -> Result<(), std::io::Error>; 14 | } 15 | 16 | pub trait ConvertFrom { 17 | fn convert_from(from: &T, writer: &mut W) -> Result<(), std::io::Error>; 18 | } 19 | 20 | impl ConvertFile> 21 | for thfst::MemmapThfstTransducer 22 | { 23 | fn convert_file( 24 | transducer: &hfst::HfstTransducer, 25 | path: &Path, 26 | ) -> Result<(), std::io::Error> { 27 | let thfst_path = path.with_extension("thfst"); 28 | std::fs::create_dir_all(&thfst_path)?; 29 | 30 | let transition_path = thfst_path.join("transition"); 31 | let index_path = thfst_path.join("index"); 32 | let alphabet_path = thfst_path.join("alphabet"); 33 | 34 | let mut writer = BufWriter::new(File::create(transition_path)?); 35 | thfst::MemmapTransitionTable::convert_from(&transducer.transition_table, &mut writer)?; 36 | 37 | let mut writer = BufWriter::new(File::create(index_path)?); 38 | thfst::MemmapIndexTable::convert_from(&transducer.index_table, &mut writer)?; 39 | 40 | let writer = BufWriter::new(File::create(alphabet_path)?); 41 | serde_json::to_writer_pretty(writer, transducer.alphabet())?; 42 | 43 | Ok(()) 44 | } 
45 | } 46 | 47 | impl ConvertFrom for thfst::MemmapIndexTable { 48 | fn convert_from( 49 | table: &hfst::MappedIndexTable, 50 | writer: &mut W, 51 | ) -> Result<(), std::io::Error> { 52 | for index in 0..table.size.0 { 53 | let input_symbol = table 54 | .input_symbol(TransitionTableIndex(index)) 55 | .unwrap_or(SymbolNumber::MAX); 56 | let targetish = table 57 | .target(TransitionTableIndex(index)) 58 | .unwrap_or(TransitionTableIndex::MAX); 59 | 60 | writer.write_u16::(input_symbol.0).unwrap(); 61 | writer.write_u16::(0).unwrap(); 62 | writer.write_u32::(targetish.0).unwrap(); 63 | } 64 | 65 | Ok(()) 66 | } 67 | } 68 | 69 | impl ConvertFrom for thfst::MemmapTransitionTable { 70 | fn convert_from( 71 | table: &hfst::MappedTransitionTable, 72 | writer: &mut W, 73 | ) -> Result<(), std::io::Error> { 74 | for index in 0..table.size.0 { 75 | let index = TransitionTableIndex(index); 76 | let input_symbol = table.input_symbol(index).unwrap_or(SymbolNumber::MAX); 77 | let output_symbol = table.output_symbol(index).unwrap_or(SymbolNumber::MAX); 78 | let target = table.target(index).unwrap_or(TransitionTableIndex::MAX); 79 | let weight = table.weight(index).unwrap(); 80 | 81 | writer.write_u16::(input_symbol.0).unwrap(); 82 | writer.write_u16::(output_symbol.0).unwrap(); 83 | writer.write_u32::(target.0).unwrap(); 84 | writer 85 | .write_u32::(weight.0.to_bits()) 86 | .unwrap(); 87 | } 88 | 89 | Ok(()) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/archive/boxf.rs: -------------------------------------------------------------------------------- 1 | //! Box-based archive stuff. 
2 | use std::sync::Arc; 3 | 4 | use box_format::BoxFileReader; 5 | 6 | use super::error::SpellerArchiveError; 7 | use super::{SpellerArchive, meta::SpellerMetadata}; 8 | use crate::speller::{HfstSpeller, Speller}; 9 | use crate::transducer::{ 10 | Transducer, 11 | thfst::{MemmapThfstChunkedTransducer, MemmapThfstTransducer}, 12 | }; 13 | use crate::vfs::Filesystem; 14 | use crate::vfs::boxf::Filesystem as BoxFilesystem; 15 | 16 | /// An archive with mmaped language and error model THFST automata archive. 17 | pub type ThfstBoxSpellerArchive = BoxSpellerArchive< 18 | MemmapThfstTransducer, 19 | MemmapThfstTransducer, 20 | >; 21 | 22 | /// An archive with mmaped chunked language and error model THFST automata 23 | /// file. 24 | pub type ThfstChunkedBoxSpeller = HfstSpeller< 25 | crate::vfs::boxf::File, 26 | MemmapThfstChunkedTransducer, 27 | MemmapThfstChunkedTransducer, 28 | >; 29 | 30 | /// An archive with mmaped language and error model THFST automata file. 31 | pub type ThfstBoxSpeller = HfstSpeller< 32 | crate::vfs::boxf::File, 33 | MemmapThfstTransducer, 34 | MemmapThfstTransducer, 35 | >; 36 | 37 | /// An archive with mmaped chunked language and error model THFST automata 38 | /// archive. 39 | pub type ThfstChunkedBoxSpellerArchive = BoxSpellerArchive< 40 | MemmapThfstChunkedTransducer, 41 | MemmapThfstChunkedTransducer, 42 | >; 43 | 44 | /// Speller in box archive. 
45 | pub struct BoxSpellerArchive 46 | where 47 | T: Transducer, 48 | U: Transducer, 49 | { 50 | metadata: Option, 51 | speller: Arc>, 52 | } 53 | 54 | impl BoxSpellerArchive 55 | where 56 | T: Transducer + Send + Sync + 'static, 57 | U: Transducer + Send + Sync + 'static, 58 | { 59 | /// get the spell-checking component 60 | pub fn hfst_speller(&self) -> Arc> { 61 | self.speller.clone() 62 | } 63 | } 64 | 65 | impl SpellerArchive for BoxSpellerArchive 66 | where 67 | T: Transducer + Send + Sync + 'static, 68 | U: Transducer + Send + Sync + 'static, 69 | { 70 | fn open(file_path: &std::path::Path) -> Result, SpellerArchiveError> { 71 | let archive = BoxFileReader::open(file_path).map_err(|e| { 72 | SpellerArchiveError::File(std::io::Error::new(std::io::ErrorKind::Other, e)) 73 | })?; 74 | 75 | let fs = BoxFilesystem::new(&archive); 76 | 77 | let metadata = fs 78 | .open_file("meta.json") 79 | .ok() 80 | .and_then(|x| serde_json::from_reader(x).ok()); 81 | let errmodel = 82 | T::from_path(&fs, "errmodel.default.thfst").map_err(SpellerArchiveError::Transducer)?; 83 | let acceptor = 84 | U::from_path(&fs, "acceptor.default.thfst").map_err(SpellerArchiveError::Transducer)?; 85 | 86 | let speller = HfstSpeller::new(errmodel, acceptor); 87 | Ok(BoxSpellerArchive { speller, metadata }) 88 | } 89 | 90 | fn speller(&self) -> Arc { 91 | self.speller.clone() 92 | } 93 | 94 | fn metadata(&self) -> Option<&SpellerMetadata> { 95 | self.metadata.as_ref() 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/ffi/mod.rs: -------------------------------------------------------------------------------- 1 | use libc::c_char; 2 | use std::ffi::{CStr, CString}; 3 | 4 | use crate::tokenizer::{Tokenize, WordIndices}; 5 | 6 | pub(crate) mod fbs; 7 | 8 | #[unsafe(no_mangle)] 9 | pub extern "C" fn divvun_word_indices<'a>(utf8_string: *const c_char) -> *mut WordIndices<'a> { 10 | let c_str = unsafe { CStr::from_ptr(utf8_string) }; 11 | let 
string = c_str.to_str().unwrap(); 12 | let iterator = string.word_indices(); 13 | Box::into_raw(Box::new(iterator)) as *mut _ 14 | } 15 | 16 | #[unsafe(no_mangle)] 17 | pub extern "C" fn divvun_word_indices_next<'a>( 18 | iterator: *mut WordIndices<'a>, 19 | out_index: *mut u64, 20 | out_string: *mut *const c_char, 21 | ) -> u8 { 22 | let iterator = unsafe { &mut *iterator }; 23 | 24 | match iterator.next() { 25 | Some((index, word)) => { 26 | let c_word = CString::new(word).unwrap(); 27 | unsafe { 28 | *out_index = index as u64; 29 | *out_string = c_word.into_raw(); 30 | } 31 | 1 32 | } 33 | None => 0, 34 | } 35 | } 36 | 37 | #[unsafe(no_mangle)] 38 | pub extern "C" fn divvun_word_indices_free<'a>(handle: *mut WordIndices<'a>) { 39 | drop(unsafe { Box::from_raw(handle) }); 40 | } 41 | 42 | #[unsafe(no_mangle)] 43 | pub extern "C" fn divvun_cstr_free(handle: *mut c_char) { 44 | drop(unsafe { CString::from_raw(handle) }); 45 | } 46 | 47 | use crate::ffi::fbs::IntoFlatbuffer; 48 | use crate::tokenizer::WordContext; 49 | use cffi::{FromForeign, Slice, ToForeign}; 50 | use std::convert::Infallible; 51 | 52 | pub struct FbsMarshaler; 53 | 54 | impl cffi::ReturnType for FbsMarshaler { 55 | type Foreign = Slice; 56 | type ForeignTraitObject = (); 57 | 58 | fn foreign_default() -> Self::Foreign { 59 | Slice::default() 60 | } 61 | } 62 | 63 | impl ToForeign> for FbsMarshaler { 64 | type Error = Infallible; 65 | 66 | fn to_foreign(bufferable: T) -> Result, Self::Error> { 67 | let vec = bufferable.into_flatbuffer(); 68 | cffi::VecMarshaler::to_foreign(vec) 69 | } 70 | } 71 | 72 | #[unsafe(no_mangle)] 73 | pub unsafe extern "C" fn divvun_fbs_free(slice: Slice) { 74 | unsafe { 75 | cffi::VecMarshaler::from_foreign(slice) 76 | .map(|_| ()) 77 | .unwrap_or(()) 78 | }; 79 | } 80 | 81 | #[doc(hidden)] 82 | /// This makes the cffi go brrr. 
83 | pub unsafe extern "C" fn _cffi_string_free(ptr: Slice) { 84 | unsafe { cffi::ffi::cffi_string_free(ptr) }; 85 | } 86 | 87 | #[cffi::marshal(return_marshaler = "FbsMarshaler")] 88 | pub extern "C" fn divvun_cursor_context( 89 | #[marshal(cffi::StrMarshaler)] first_half: &str, 90 | #[marshal(cffi::StrMarshaler)] second_half: &str, 91 | ) -> WordContext { 92 | crate::tokenizer::cursor_context(first_half, second_half) 93 | } 94 | 95 | #[cfg(all(test, feature = "internal_ffi"))] 96 | mod tests { 97 | use crate::ffi::fbs::IntoFlatbuffer; 98 | 99 | #[test] 100 | fn fbs() { 101 | let word_context = crate::tokenizer::cursor_context("this is some", " text"); 102 | println!("{:?}", &word_context); 103 | 104 | let buf = word_context.into_flatbuffer(); 105 | println!("{:?}", &buf); 106 | 107 | let word_context = crate::ffi::fbs::tokenizer::get_root_as_word_context(&buf); 108 | println!( 109 | "{:?} {:?}", 110 | &word_context.current().index(), 111 | &word_context.current().value() 112 | ); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/transducer/alphabet.rs: -------------------------------------------------------------------------------- 1 | use hashbrown::HashMap; 2 | use serde::{Deserialize, Serialize}; 3 | use smol_str::SmolStr; 4 | 5 | use crate::transducer::Transducer; 6 | use crate::types::{OperationsMap, SymbolNumber}; 7 | 8 | #[derive(Debug, Serialize, Deserialize)] 9 | pub struct TransducerAlphabet { 10 | pub(crate) key_table: Vec, 11 | pub(crate) initial_symbol_count: SymbolNumber, 12 | pub(crate) flag_state_size: SymbolNumber, 13 | pub(crate) length: usize, 14 | pub(crate) string_to_symbol: HashMap, 15 | pub(crate) operations: OperationsMap, 16 | pub(crate) identity_symbol: Option, 17 | pub(crate) unknown_symbol: Option, 18 | } 19 | 20 | impl TransducerAlphabet { 21 | #[inline(always)] 22 | pub fn string_from_symbols(&self, syms: &[SymbolNumber]) -> SmolStr { 23 | syms.iter() 24 | .map(|s| 
&*self.key_table[s.0 as usize]) 25 | .collect() 26 | } 27 | 28 | #[inline(always)] 29 | pub fn key_table(&self) -> &Vec { 30 | &self.key_table 31 | } 32 | 33 | #[inline(always)] 34 | pub fn state_size(&self) -> SymbolNumber { 35 | self.flag_state_size 36 | } 37 | 38 | #[inline(always)] 39 | pub fn operations(&self) -> &OperationsMap { 40 | &self.operations 41 | } 42 | 43 | #[inline(always)] 44 | pub fn string_to_symbol(&self) -> &HashMap { 45 | &self.string_to_symbol 46 | } 47 | 48 | #[inline(always)] 49 | pub fn is_flag(&self, symbol: SymbolNumber) -> bool { 50 | self.operations.contains_key(&symbol) 51 | } 52 | 53 | #[inline(always)] 54 | pub fn add_symbol(&mut self, string: &str) { 55 | self.string_to_symbol.insert( 56 | string.into(), 57 | SymbolNumber(self.key_table.len().try_into().expect("too many symbols")), 58 | ); 59 | self.key_table.push(string.into()); 60 | } 61 | 62 | #[inline(always)] 63 | pub fn identity(&self) -> Option { 64 | self.identity_symbol 65 | } 66 | 67 | #[inline(always)] 68 | pub fn unknown(&self) -> Option { 69 | self.unknown_symbol 70 | } 71 | 72 | #[inline(always)] 73 | pub fn initial_symbol_count(&self) -> SymbolNumber { 74 | self.initial_symbol_count 75 | } 76 | 77 | #[inline(always)] 78 | pub fn len(&self) -> usize { 79 | self.length 80 | } 81 | 82 | #[inline(always)] 83 | pub fn is_empty(&self) -> bool { 84 | self.length == 0 85 | } 86 | 87 | #[inline(always)] 88 | pub fn create_translator_from(&mut self, mutator: &T) -> Vec 89 | where 90 | F: crate::vfs::File, 91 | T: Transducer, 92 | { 93 | tracing::trace!("create_translator_from"); 94 | let from = mutator.alphabet(); 95 | let from_keys = from.key_table(); 96 | 97 | let mut translator = Vec::with_capacity(64); 98 | translator.push(SymbolNumber::ZERO); 99 | 100 | for from_sym in from_keys.iter().skip(1) { 101 | tracing::trace!("key {}", from_sym); 102 | if let Some(sym) = self.string_to_symbol.get(from_sym) { 103 | translator.push(*sym); 104 | } else { 105 | let lexicon_key = 
SymbolNumber(self.key_table.len() as u16); 106 | translator.push(lexicon_key); 107 | self.add_symbol(from_sym); 108 | } 109 | } 110 | 111 | translator 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /crates/regtest/src/main.rs: -------------------------------------------------------------------------------- 1 | /*! Regression testing for Finite-State Spell-Checkers 2 | 3 | A tool to help testing updates in finite-state spell-checkers. Simply shows the 4 | differences between two spell-checker language models. Can be used in automated 5 | scripts to cap amount of changes between two versions. 6 | 7 | # Usage examples 8 | 9 | It's a command-line tool: 10 | ```console 11 | $ cargo run -- --old old.zhfst --new new.zhfst --words typos.txt --threshold 0.9 12 | ``` 13 | will expect there to be less than 10 % regressions between `old.zhfst` and 14 | `new.zhfst`. 15 | */ 16 | 17 | use std::error::Error; 18 | use std::path::PathBuf; 19 | 20 | use divvunspell::archive; 21 | 22 | use clap::Parser; 23 | 24 | #[derive(Parser)] 25 | #[command(author, version, about, long_about = None)] 26 | struct Cli { 27 | #[arg(short, long, value_name = "OLDFILE")] 28 | old: PathBuf, 29 | #[arg(short, long, value_name = "NEWFILE")] 30 | new: PathBuf, 31 | #[arg(short, long, value_name = "WORDFILE")] 32 | words: PathBuf, 33 | #[arg(short, long, value_name = "THOLD")] 34 | threshold: f32, 35 | } 36 | 37 | fn load_words(path: PathBuf) -> Result, Box> { 38 | let mut rdr = csv::ReaderBuilder::new() 39 | .comment(Some(b'#')) 40 | .delimiter(b'\t') 41 | .has_headers(false) 42 | .flexible(true) 43 | .from_path(path)?; 44 | Ok(rdr 45 | .records() 46 | .filter_map(Result::ok) 47 | .filter_map(|r| { 48 | r.get(0) 49 | .and_then(|x| r.get(1).map(|y| (x.to_string(), y.to_string()))) 50 | }) 51 | .collect()) 52 | } 53 | 54 | fn main() -> Result<(), Box> { 55 | let cli = Cli::parse(); 56 | let oldarch = archive::open(cli.old)?; 57 | let newarch = 
archive::open(cli.new)?; 58 | let words = load_words(cli.words)?; 59 | let mut regressions = 0; 60 | for word in &words { 61 | let oldsuggs = oldarch.speller().suggest(&word.0); 62 | let newsuggs = newarch.speller().suggest(&word.0); 63 | let oldpos = oldsuggs.iter().position(|x| x.value == word.1); 64 | let newpos = newsuggs.iter().position(|x| x.value == word.1); 65 | if oldpos != newpos { 66 | match (oldpos, newpos) { 67 | (None, Some(y)) => { 68 | println!( 69 | "Regression: {} -> {} was uncorrected now {}", 70 | word.0, word.1, y 71 | ); 72 | } 73 | (Some(x), None) => { 74 | println!( 75 | "Regression: {} -> {} was {} now uncorrectable!", 76 | word.0, word.1, x 77 | ); 78 | } 79 | (Some(x), Some(y)) => { 80 | println!("REGRESSION: {} -> {} was {} now {}", word.0, word.1, x, y); 81 | } 82 | (None, None) => { 83 | panic!("Shouldn't happen lol"); 84 | } 85 | } 86 | regressions = regressions + 1; 87 | } else { 88 | print!("."); 89 | } 90 | } 91 | if words.len() == 0 { 92 | Err("Could not find any words from {}")? 93 | } 94 | let regressionrate = regressions as f32 / words.len() as f32; 95 | if cli.threshold > regressionrate { 96 | Ok(()) 97 | } else { 98 | Err(format!( 99 | "regressions more than threshold {} > {}", 100 | regressionrate, cli.threshold 101 | ))? 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/archive/mod.rs: -------------------------------------------------------------------------------- 1 | //! Handling of archives of spell-checking models. 
2 | use memmap2::Mmap; 3 | use std::{ffi::OsString, path::Path, sync::Arc}; 4 | 5 | pub mod boxf; 6 | pub mod error; 7 | pub mod meta; 8 | pub mod zip; 9 | 10 | pub use self::{boxf::BoxSpellerArchive, zip::ZipSpellerArchive}; 11 | 12 | use self::{ 13 | boxf::ThfstChunkedBoxSpellerArchive, error::SpellerArchiveError, meta::SpellerMetadata, 14 | }; 15 | use crate::speller::Speller; 16 | 17 | pub(crate) struct TempMmap { 18 | mmap: Arc, 19 | 20 | // Not really dead, needed to drop when TempMmap drops 21 | _tempdir: tempfile::TempDir, 22 | } 23 | 24 | pub(crate) enum MmapRef { 25 | Direct(Arc), 26 | Temp(TempMmap), 27 | } 28 | 29 | impl MmapRef { 30 | pub fn map(&self) -> Arc { 31 | match self { 32 | MmapRef::Direct(mmap) => Arc::clone(mmap), 33 | MmapRef::Temp(tmmap) => Arc::clone(&tmmap.mmap), 34 | } 35 | } 36 | } 37 | 38 | /// Speller archive is a file read into spell-checker with metadata. 39 | pub trait SpellerArchive { 40 | /// Read and parse a speller archive. 41 | fn open(path: &Path) -> Result 42 | where 43 | Self: Sized; 44 | 45 | /// Retrieve spell-checker. 46 | /// 47 | /// The returned speller can perform both spell checking and morphological analysis 48 | /// depending on the `OutputMode` passed to `suggest()`. 49 | fn speller(&self) -> Arc; 50 | 51 | /// Retrieve metadata. 52 | fn metadata(&self) -> Option<&SpellerMetadata>; 53 | } 54 | 55 | /// Reads a speller archive. 56 | pub fn open

(path: P) -> Result, SpellerArchiveError> 57 | where 58 | P: AsRef, 59 | { 60 | match path.as_ref().extension() { 61 | Some(x) if x == "bhfst" => { 62 | ThfstChunkedBoxSpellerArchive::open(path.as_ref()).map(|x| Arc::new(x) as _) 63 | } 64 | Some(x) if x == "zhfst" => ZipSpellerArchive::open(path.as_ref()).map(|x| Arc::new(x) as _), 65 | unknown => Err(SpellerArchiveError::UnsupportedExt( 66 | unknown 67 | .map(|x| x.to_owned()) 68 | .unwrap_or_else(|| OsString::new()), 69 | )), 70 | } 71 | } 72 | 73 | #[cfg(feature = "internal_ffi")] 74 | pub(crate) mod ffi { 75 | use super::*; 76 | use cffi::{FromForeign, ToForeign}; 77 | use std::error::Error; 78 | 79 | #[cffi::marshal(return_marshaler = cffi::ArcMarshaler::)] 80 | pub extern "C" fn divvun_speller_archive_open( 81 | #[marshal(cffi::PathBufMarshaler)] path: std::path::PathBuf, 82 | ) -> Result, Box> { 83 | open(&path).map_err(|e| Box::new(e) as _) 84 | } 85 | 86 | #[cffi::marshal(return_marshaler = "cffi::ArcMarshaler::")] 87 | pub extern "C" fn divvun_speller_archive_speller( 88 | #[marshal(cffi::ArcRefMarshaler::)] handle: Arc< 89 | dyn SpellerArchive + Send + Sync, 90 | >, 91 | ) -> Arc { 92 | handle.speller() 93 | } 94 | 95 | #[cffi::marshal(return_marshaler = "cffi::StringMarshaler")] 96 | pub extern "C" fn divvun_speller_archive_locale( 97 | #[marshal(cffi::ArcRefMarshaler::)] handle: Arc< 98 | dyn SpellerArchive + Send + Sync, 99 | >, 100 | ) -> Result> { 101 | match handle.metadata() { 102 | Some(v) => Ok(v.info().locale().to_string()), 103 | None => Err(Box::new(SpellerArchiveError::NoMetadata) as _), 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/transducer/hfst/index_table.rs: -------------------------------------------------------------------------------- 1 | // We manually ensure alignment of reads in this file. 
2 | #![allow(clippy::cast_ptr_alignment)] 3 | 4 | use byteorder::{LittleEndian, ReadBytesExt}; 5 | use std::fmt; 6 | use std::io::Cursor; 7 | use std::mem; 8 | use std::ptr; 9 | 10 | use crate::constants::INDEX_TABLE_SIZE; 11 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 12 | use memmap2::Mmap; 13 | use std::sync::Arc; 14 | 15 | pub struct MappedIndexTable { 16 | pub(crate) size: TransitionTableIndex, 17 | pub(crate) mmap: Arc, 18 | pub(crate) offset: usize, 19 | pub(crate) len: usize, 20 | } 21 | 22 | impl fmt::Debug for MappedIndexTable { 23 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 24 | write!(f, "Index table index: {}", self.size)?; 25 | Ok(()) 26 | } 27 | } 28 | 29 | #[allow(clippy::len_without_is_empty)] 30 | impl MappedIndexTable { 31 | pub fn new( 32 | buf: Arc, 33 | offset: usize, 34 | len: usize, 35 | size: TransitionTableIndex, 36 | ) -> MappedIndexTable { 37 | MappedIndexTable { 38 | size, 39 | mmap: buf, 40 | offset, 41 | len, 42 | } 43 | } 44 | 45 | #[inline(always)] 46 | pub fn len(&self) -> usize { 47 | self.len - self.offset 48 | } 49 | 50 | #[inline(always)] 51 | fn make_cursor<'a>(&'a self) -> Cursor<&'a [u8]> { 52 | Cursor::new(&self.mmap) 53 | } 54 | 55 | #[inline(always)] 56 | pub fn input_symbol(&self, i: TransitionTableIndex) -> Option { 57 | if i >= self.size { 58 | return None; 59 | } 60 | 61 | let index = self.offset + INDEX_TABLE_SIZE * i.0 as usize; 62 | 63 | let input_symbol = if cfg!(all(target_arch = "arm", target_pointer_width = "32")) { 64 | let mut cursor = self.make_cursor(); 65 | cursor.set_position(index as u64); 66 | SymbolNumber(cursor.read_u16::().unwrap()) 67 | } else { 68 | SymbolNumber(unsafe { ptr::read(self.mmap.as_ptr().add(index) as *const _) }) 69 | }; 70 | 71 | if input_symbol == SymbolNumber::MAX { 72 | None 73 | } else { 74 | Some(input_symbol) 75 | } 76 | } 77 | 78 | #[inline(always)] 79 | pub fn target(&self, i: TransitionTableIndex) -> Option { 80 | if i >= self.size { 81 | 
return None; 82 | } 83 | 84 | let index = self.offset + INDEX_TABLE_SIZE * i.0 as usize; 85 | let target = if cfg!(all(target_arch = "arm", target_pointer_width = "32")) { 86 | let mut cursor = self.make_cursor(); 87 | cursor.set_position((index + mem::size_of::()) as u64); 88 | TransitionTableIndex(cursor.read_u32::().unwrap()) 89 | } else { 90 | TransitionTableIndex(unsafe { 91 | ptr::read(self.mmap.as_ptr().add(index + 2) as *const _) 92 | }) 93 | }; 94 | 95 | if target == TransitionTableIndex::MAX { 96 | None 97 | } else { 98 | Some(target) 99 | } 100 | } 101 | 102 | // Final weight reads from the same position as target, but for a different tuple 103 | // This can probably be abstracted out more nicely 104 | #[inline(always)] 105 | pub fn final_weight(&self, i: TransitionTableIndex) -> Option { 106 | if i >= self.size { 107 | return None; 108 | } 109 | 110 | let index = self.offset + INDEX_TABLE_SIZE * i.0 as usize; 111 | let weight: Weight = { 112 | let mut cursor = self.make_cursor(); 113 | cursor.set_position((index + mem::size_of::()) as u64); 114 | Weight(cursor.read_f32::().unwrap()) 115 | }; 116 | 117 | Some(weight) 118 | } 119 | 120 | #[inline(always)] 121 | pub fn is_final(&self, i: TransitionTableIndex) -> bool { 122 | self.input_symbol(i) == None && self.target(i) != None 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/archive/zip.rs: -------------------------------------------------------------------------------- 1 | //! Zip archive stuff. 
2 | use ::zip::{CompressionMethod, ZipArchive}; 3 | use memmap2::MmapOptions; 4 | use std::fs::File; 5 | use std::io::Seek; 6 | use std::io::prelude::*; 7 | use std::sync::Arc; 8 | 9 | use super::error::SpellerArchiveError; 10 | use super::meta::SpellerMetadata; 11 | use super::{MmapRef, SpellerArchive, TempMmap}; 12 | use crate::speller::{HfstSpeller, Speller}; 13 | use crate::transducer::hfst::HfstTransducer; 14 | 15 | /// Type alias for HFST-based speller loaded from a zip archive. 16 | /// 17 | /// Uses memory-mapped HFST transducers for both the error model and lexicon. 18 | pub type HfstZipSpeller = 19 | HfstSpeller, HfstTransducer>; 20 | 21 | /// Speller archive backed by a zip file. 22 | /// 23 | /// This is the standard format for distributing spell-checkers (`.zhfst` files). 24 | /// The archive contains metadata, an error model transducer, and a lexicon transducer. 25 | pub struct ZipSpellerArchive { 26 | metadata: SpellerMetadata, 27 | speller: Arc, 28 | } 29 | 30 | fn mmap_by_name( 31 | zipfile: &mut File, 32 | archive: &mut ZipArchive, 33 | name: &str, 34 | ) -> Result { 35 | let mut index = archive.by_name(name)?; 36 | 37 | if index.compression() != CompressionMethod::Stored { 38 | let tempdir = tempfile::tempdir()?; 39 | let outpath = tempdir.path().join(index.mangled_name()); 40 | 41 | let mut outfile = File::create(&outpath)?; 42 | std::io::copy(&mut index, &mut outfile)?; 43 | 44 | let outfile = File::open(&outpath)?; 45 | 46 | let mmap = unsafe { MmapOptions::new().map(&outfile) }; 47 | 48 | return match mmap { 49 | Ok(v) => Ok(MmapRef::Temp(TempMmap { 50 | mmap: Arc::new(v), 51 | _tempdir: tempdir, 52 | })), 53 | Err(err) => return Err(err), 54 | }; 55 | } 56 | 57 | let mmap = unsafe { 58 | MmapOptions::new() 59 | .offset(index.data_start()) 60 | .len(index.size() as usize) 61 | .map(&*zipfile) 62 | }; 63 | 64 | match mmap { 65 | Ok(v) => Ok(MmapRef::Direct(Arc::new(v))), 66 | Err(err) => Err(err), 67 | } 68 | } 69 | 70 | impl ZipSpellerArchive 
{ 71 | /// Get a reference to the HFST speller. 72 | /// 73 | /// Returns the underlying `HfstSpeller` with its concrete transducer types. 74 | /// This is useful when you need access to HFST-specific functionality. 75 | pub fn hfst_speller( 76 | &self, 77 | ) -> Arc, HfstTransducer>> 78 | { 79 | self.speller.clone() 80 | } 81 | } 82 | 83 | impl SpellerArchive for ZipSpellerArchive { 84 | fn open(file_path: &std::path::Path) -> Result { 85 | let file = File::open(&file_path).map_err(SpellerArchiveError::File)?; 86 | let reader = std::io::BufReader::new(&file); 87 | let mut archive = ZipArchive::new(reader).expect("zip"); 88 | 89 | // // Open file a second time to get around borrow checker 90 | let mut file = File::open(file_path).map_err(SpellerArchiveError::File)?; 91 | 92 | let metadata_mmap = mmap_by_name(&mut file, &mut archive, "index.xml") 93 | .map_err(|e| SpellerArchiveError::Io("index.xml".into(), e.into()))?; 94 | let metadata = SpellerMetadata::from_bytes(&*metadata_mmap.map()).expect("meta"); 95 | 96 | let acceptor_id = metadata.acceptor().id(); 97 | let errmodel_id = metadata.errmodel().id(); 98 | 99 | let acceptor_mmap = mmap_by_name(&mut file, &mut archive, &acceptor_id) 100 | .map_err(|e| SpellerArchiveError::Io(acceptor_id.into(), e.into()))?; 101 | let errmodel_mmap = mmap_by_name(&mut file, &mut archive, &errmodel_id) 102 | .map_err(|e| SpellerArchiveError::Io(errmodel_id.into(), e.into()))?; 103 | drop(archive); 104 | 105 | let acceptor = HfstTransducer::from_mapped_memory(acceptor_mmap.map()); 106 | let errmodel = HfstTransducer::from_mapped_memory(errmodel_mmap.map()); 107 | 108 | let speller = HfstSpeller::new(errmodel, acceptor); 109 | 110 | Ok(ZipSpellerArchive { metadata, speller }) 111 | } 112 | 113 | fn speller(&self) -> Arc { 114 | self.speller.clone() 115 | } 116 | 117 | fn metadata(&self) -> Option<&SpellerMetadata> { 118 | Some(&self.metadata) 119 | } 120 | } 121 | 
-------------------------------------------------------------------------------- /src/transducer/hfst/transition_table.rs: -------------------------------------------------------------------------------- 1 | // We manually ensure alignment of reads in this file. 2 | #![allow(clippy::cast_ptr_alignment)] 3 | 4 | use byteorder::{LittleEndian, ReadBytesExt}; 5 | use memmap2::Mmap; 6 | use std::fmt; 7 | use std::io::Cursor; 8 | use std::mem; 9 | use std::ptr; 10 | use std::sync::Arc; 11 | 12 | use crate::constants::TRANS_TABLE_SIZE; 13 | use crate::transducer::symbol_transition::SymbolTransition; 14 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 15 | 16 | pub struct MappedTransitionTable { 17 | pub(crate) size: TransitionTableIndex, 18 | pub(crate) mmap: Arc, 19 | pub(crate) offset: usize, 20 | } 21 | 22 | impl fmt::Debug for MappedTransitionTable { 23 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 24 | write!(f, "Transition table index: {}", self.size)?; 25 | Ok(()) 26 | } 27 | } 28 | 29 | impl MappedTransitionTable { 30 | #[inline(always)] 31 | pub fn new( 32 | mmap: Arc, 33 | offset: usize, 34 | size: TransitionTableIndex, 35 | ) -> MappedTransitionTable { 36 | MappedTransitionTable { size, mmap, offset } 37 | } 38 | 39 | #[inline(always)] 40 | fn make_cursor(&self) -> Cursor<&[u8]> { 41 | Cursor::new(&self.mmap) 42 | } 43 | 44 | #[inline(always)] 45 | fn read_symbol_from_cursor(&self, index: usize) -> Option { 46 | let index = self.offset + index; 47 | let x = if cfg!(all(target_arch = "arm", target_pointer_width = "32")) { 48 | let mut cursor = self.make_cursor(); 49 | cursor.set_position(index as u64); 50 | SymbolNumber(cursor.read_u16::().unwrap()) 51 | } else { 52 | SymbolNumber(unsafe { ptr::read(self.mmap.as_ptr().add(index) as *const _) }) 53 | }; 54 | if x == SymbolNumber::MAX { 55 | None 56 | } else { 57 | Some(x) 58 | } 59 | } 60 | 61 | #[inline(always)] 62 | pub fn input_symbol(&self, i: TransitionTableIndex) -> Option { 63 | 
if i >= self.size {
            return None;
        }

        let index = TRANS_TABLE_SIZE as usize * i.0 as usize;
        self.read_symbol_from_cursor(index)
    }

    /// Output symbol of transition `i`, or `None` if out of range or sentinel.
    #[inline(always)]
    pub fn output_symbol(&self, i: TransitionTableIndex) -> Option<SymbolNumber> {
        if i >= self.size {
            return None;
        }

        // The output symbol is stored directly after the input symbol.
        let index = (TRANS_TABLE_SIZE * i.0 as usize) + mem::size_of::<SymbolNumber>();
        self.read_symbol_from_cursor(index)
    }

    /// Target state index of transition `i`, or `None` if out of range or sentinel.
    #[inline(always)]
    pub fn target(&self, i: TransitionTableIndex) -> Option<TransitionTableIndex> {
        if i >= self.size {
            return None;
        }

        let index =
            self.offset + (TRANS_TABLE_SIZE * i.0 as usize) + (2 * mem::size_of::<SymbolNumber>());

        let x: TransitionTableIndex = if cfg!(all(target_arch = "arm", target_pointer_width = "32"))
        {
            let mut cursor = self.make_cursor();
            cursor.set_position(index as u64);
            TransitionTableIndex(cursor.read_u32::<LittleEndian>().unwrap())
        } else {
            // SAFETY: `index` lies within the mapped region for `i < self.size`.
            TransitionTableIndex(unsafe { ptr::read(self.mmap.as_ptr().add(index) as *const _) })
        };
        if x == TransitionTableIndex::MAX {
            None
        } else {
            Some(x)
        }
    }

    /// Weight of transition `i`, or `None` if out of range.
    #[inline(always)]
    pub fn weight(&self, i: TransitionTableIndex) -> Option<Weight> {
        if i >= self.size {
            return None;
        }

        let index = self.offset
            + (TRANS_TABLE_SIZE * i.0 as usize)
            + (2 * mem::size_of::<SymbolNumber>())
            + mem::size_of::<TransitionTableIndex>();

        let x: Weight = if cfg!(all(target_arch = "arm", target_pointer_width = "32")) {
            let mut cursor = self.make_cursor();
            cursor.set_position(index as u64);
            Weight(cursor.read_f32::<LittleEndian>().unwrap())
        } else {
            // SAFETY: `index` lies within the mapped region for `i < self.size`.
            Weight(unsafe { ptr::read(self.mmap.as_ptr().add(index) as *const _) })
        };
        Some(x)
    }

    /// A transition entry marks a final state when both symbols are the
    /// sentinel and the target is ONE.
    #[inline(always)]
    pub fn is_final(&self, i: TransitionTableIndex) -> bool {
        self.input_symbol(i).is_none()
            && self.output_symbol(i).is_none()
            && self.target(i) == Some(TransitionTableIndex::ONE)
| } 132 | 133 | #[inline(always)] 134 | pub fn symbol_transition(&self, i: TransitionTableIndex) -> SymbolTransition { 135 | SymbolTransition::new(self.target(i), self.output_symbol(i), self.weight(i)) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /support/divvunspell.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #pragma once 12 | 13 | #ifndef __APPLE__ 14 | #define _Nonnull 15 | #define _Nullable 16 | #endif 17 | 18 | // Rust FFI required types 19 | typedef uint8_t rust_bool_t; 20 | typedef uintptr_t rust_usize_t; 21 | 22 | typedef struct rust_slice_s { 23 | void *_Nullable data; 24 | uintptr_t len; 25 | } rust_slice_t; 26 | 27 | #if _WIN32 28 | typedef wchar_t rust_path_t; 29 | #else 30 | typedef char rust_path_t; 31 | #endif 32 | 33 | // Rust error handling constructs 34 | char*_Nullable divvunspell_err = NULL; 35 | 36 | static void divvunspell_err_callback(const char *_Nonnull msg) { 37 | size_t sz = strlen(msg) + 1; 38 | divvunspell_err = (char*)calloc(1, sz); 39 | memcpy(divvunspell_err, msg, sz); 40 | } 41 | 42 | static void divvunspell_err_print() { 43 | if (divvunspell_err != NULL) { 44 | printf("Err: %s\n", divvunspell_err); 45 | } 46 | } 47 | 48 | static void divvunspell_err_free() { 49 | if (divvunspell_err != NULL) { 50 | free(divvunspell_err); 51 | divvunspell_err = NULL; 52 | } 53 | } 54 | 55 | #define ERR_CALLBACK void (*_Nonnull exception)(const char *_Nonnull) 56 | 57 | struct CaseHandlingConfig { 58 | float start_penalty; 59 | float end_penalty; 60 | float mid_penalty; 61 | }; 62 | 63 | struct SpellerConfig { 64 | rust_usize_t n_best; 65 | float max_weight; 66 | float beam; 67 | struct CaseHandlingConfig case_handling; 68 | rust_usize_t node_pool_size; 69 | }; 70 | 71 | extern const void *_Nullable 72 | 
divvun_thfst_chunked_box_speller_archive_open(const rust_path_t *_Nonnull path, ERR_CALLBACK); 73 | 74 | extern const void *_Nullable 75 | divvun_thfst_chunked_box_speller_archive_speller(const void *_Nonnull handle, ERR_CALLBACK); 76 | 77 | extern rust_bool_t 78 | divvun_thfst_chunked_box_speller_is_correct(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 79 | 80 | extern const rust_slice_t 81 | divvun_thfst_chunked_box_speller_suggest(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 82 | 83 | extern const void *_Nullable 84 | divvun_thfst_chunked_box_speller_suggest_with_config( 85 | const void *_Nonnull speller, 86 | const char *_Nonnull word, 87 | struct SpellerConfig *_Nonnull config, 88 | ERR_CALLBACK); 89 | 90 | extern const void *_Nullable 91 | divvun_thfst_box_speller_archive_open(const rust_path_t *_Nonnull path, ERR_CALLBACK); 92 | 93 | extern const void *_Nullable 94 | divvun_thfst_box_speller_archive_speller(const void *_Nonnull handle, ERR_CALLBACK); 95 | 96 | extern rust_bool_t 97 | divvun_thfst_box_speller_is_correct(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 98 | 99 | extern const rust_slice_t 100 | divvun_thfst_box_speller_suggest(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 101 | 102 | extern const void *_Nullable 103 | divvun_thfst_box_speller_suggest_with_config( 104 | const void *_Nonnull speller, 105 | const char *_Nonnull word, 106 | struct SpellerConfig *_Nonnull config, 107 | ERR_CALLBACK); 108 | 109 | extern const void *_Nullable 110 | divvun_hfst_zip_speller_archive_open(const rust_path_t *_Nonnull path, ERR_CALLBACK); 111 | 112 | extern const void *_Nullable 113 | divvun_hfst_zip_speller_archive_speller(const void *_Nonnull handle, ERR_CALLBACK); 114 | 115 | extern const char *_Nullable 116 | divvun_hfst_zip_speller_archive_locale(const void *_Nonnull handle, ERR_CALLBACK); 117 | 118 | extern rust_bool_t 119 | 
divvun_hfst_zip_speller_is_correct(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 120 | 121 | extern const rust_slice_t 122 | divvun_hfst_zip_speller_suggest(const void *_Nonnull speller, const char *_Nonnull word, ERR_CALLBACK); 123 | 124 | extern const void *_Nullable 125 | divvun_hfst_zip_speller_suggest_with_config( 126 | const void *_Nonnull speller, 127 | const char *_Nonnull word, 128 | struct SpellerConfig *_Nonnull config, 129 | ERR_CALLBACK); 130 | 131 | extern rust_usize_t 132 | divvun_vec_suggestion_len(const rust_slice_t suggestions, ERR_CALLBACK); 133 | 134 | extern const char *_Nullable 135 | divvun_vec_suggestion_get_value( 136 | const rust_slice_t suggestions, 137 | rust_usize_t index, 138 | ERR_CALLBACK); 139 | 140 | extern void 141 | divvun_string_free(const char *_Nullable value); 142 | 143 | 144 | // TODO: this is temporary until a better tokenizer impl is written 145 | extern void *_Nonnull 146 | word_bound_indices(const char *_Nonnull utf8_string); 147 | 148 | extern rust_bool_t 149 | word_bound_indices_next(const void *_Nonnull handle, uint64_t *_Nonnull out_index, char *_Nonnull *_Nonnull out_string); 150 | 151 | extern void 152 | word_bound_indices_free(void *_Nonnull handle); 153 | 154 | #ifdef __cplusplus 155 | } 156 | #endif 157 | 158 | -------------------------------------------------------------------------------- /src/transducer/hfst/alphabet.rs: -------------------------------------------------------------------------------- 1 | use hashbrown::HashMap; 2 | use smol_str::SmolStr; 3 | 4 | use crate::types::{ 5 | FlagDiacriticOperation, FlagDiacriticOperator, OperationsMap, SymbolNumber, ValueNumber, 6 | }; 7 | 8 | use crate::transducer::alphabet::TransducerAlphabet; 9 | 10 | pub struct TransducerAlphabetParser { 11 | key_table: Vec, 12 | flag_state_size: SymbolNumber, 13 | length: usize, 14 | string_to_symbol: HashMap, 15 | operations: OperationsMap, 16 | feature_bucket: HashMap, 17 | value_bucket: HashMap, 18 
val_n: ValueNumber,
    feat_n: SymbolNumber,
    identity_symbol: Option<SymbolNumber>,
    unknown_symbol: Option<SymbolNumber>,
}

impl std::default::Default for TransducerAlphabetParser {
    fn default() -> Self {
        TransducerAlphabetParser {
            key_table: Vec::with_capacity(64),
            flag_state_size: SymbolNumber::ZERO,
            length: 0,
            string_to_symbol: HashMap::new(),
            operations: HashMap::new(),
            feature_bucket: HashMap::new(),
            value_bucket: HashMap::new(),
            val_n: ValueNumber::ZERO,
            feat_n: SymbolNumber::ZERO,
            identity_symbol: None,
            unknown_symbol: None,
        }
    }
}

impl TransducerAlphabetParser {
    pub fn new() -> TransducerAlphabetParser {
        Self::default()
    }

    /// Record a flag-diacritic key of the form `@X.FEATURE.VALUE@` (value
    /// optional) as a `FlagDiacriticOperation` attached to symbol `i`.
    fn handle_special_symbol(&mut self, i: SymbolNumber, key: &str) {
        use std::str::FromStr;
        let mut chunks = key.split('.');

        // The first chunk is `@X`; strip the `@` to parse the operator letter.
        let fdo = FlagDiacriticOperator::from_str(&chunks.next().unwrap()[1..]).unwrap();
        let feature: SmolStr = chunks
            .next()
            .unwrap_or("")
            .chars()
            .filter(|x| x != &'@')
            .collect();
        let value: SmolStr = chunks
            .next()
            .unwrap_or("")
            .chars()
            .filter(|x| x != &'@')
            .collect();

        // Entry API: one hash lookup per bucket instead of the previous
        // contains_key + insert + index triple of lookups.
        let feat_n = &mut self.feat_n;
        let feature_id = *self.feature_bucket.entry(feature).or_insert_with(|| {
            let id = *feat_n;
            *feat_n = id.incr();
            id
        });

        let val_n = &mut self.val_n;
        let value_id = *self.value_bucket.entry(value).or_insert_with(|| {
            let id = *val_n;
            *val_n = id.incr();
            id
        });

        let op = FlagDiacriticOperation {
            operation: fdo,
            feature: feature_id,
            value: value_id,
        };

        self.operations.insert(i, op);
        self.key_table.push(key.into());
    }

    /// Walk `symbols` NUL-terminated symbol strings in `buf`, populating the
    /// key table, symbol map and flag-diacritic buckets.
    fn parse_inner(&mut self, buf: &[u8], symbols: SymbolNumber) {
        let mut offset = 0usize;

        for i in 0..symbols.0 {
            let i = SymbolNumber(i);
            let mut end = 0usize;

| while buf[offset + end] != 0 { 93 | end += 1; 94 | } 95 | 96 | let key: SmolStr = String::from_utf8_lossy(&buf[offset..offset + end]).into(); 97 | 98 | if key.len() > 1 && key.starts_with('@') && key.ends_with('@') { 99 | if key.chars().nth(2).unwrap() == '.' { 100 | self.handle_special_symbol(i, &key); 101 | } else if key == "@_EPSILON_SYMBOL_@" { 102 | self.value_bucket.insert("".into(), self.val_n); 103 | self.key_table.push("".into()); 104 | self.val_n = self.val_n.incr(); 105 | } else if key == "@_IDENTITY_SYMBOL_@" { 106 | self.identity_symbol = Some(i); 107 | self.key_table.push(key); 108 | } else if key == "@_UNKNOWN_SYMBOL_@" { 109 | self.unknown_symbol = Some(i); 110 | self.key_table.push(key); 111 | } else { 112 | // No idea, skip. 113 | eprintln!("Unhandled alphabet key: {}", &key); 114 | self.key_table.push(SmolStr::from("")); 115 | } 116 | } else { 117 | self.key_table.push(key.clone()); 118 | self.string_to_symbol.insert(key.clone(), i); 119 | } 120 | 121 | offset += end + 1; 122 | } 123 | 124 | self.flag_state_size = SymbolNumber( 125 | self.feature_bucket 126 | .len() 127 | .try_into() 128 | .expect("Too many features in the alphabet, cannot fit into SymbolNumber"), 129 | ); 130 | 131 | // Count remaining null padding bytes 132 | while buf[offset] == b'\0' { 133 | offset += 1; 134 | } 135 | 136 | self.length = offset; 137 | } 138 | 139 | pub fn parse(buf: &[u8], symbols: SymbolNumber) -> TransducerAlphabet { 140 | let mut p = TransducerAlphabetParser::new(); 141 | p.parse_inner(buf, symbols); 142 | 143 | TransducerAlphabet { 144 | key_table: p.key_table, 145 | initial_symbol_count: symbols, 146 | length: p.length, 147 | flag_state_size: p.flag_state_size, 148 | string_to_symbol: p.string_to_symbol, 149 | operations: p.operations, 150 | identity_symbol: p.identity_symbol, 151 | unknown_symbol: p.unknown_symbol, 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/transducer/mod.rs: 
-------------------------------------------------------------------------------- 1 | //! Transducer is a Finite-State Automaton with two tapes / two symbols per 2 | //! transition. 3 | //! 4 | //! Transducer in divvunspell is modeled after the C++ transducer in the 5 | //! hfst-ospell library. It may contain some complex optimisations and 6 | //! specifics to underlying finite-state systems and lot of this is 7 | //! pretty hacky. 8 | pub mod hfst; 9 | pub mod thfst; 10 | 11 | mod alphabet; 12 | mod symbol_transition; 13 | pub(crate) mod tree_node; 14 | 15 | pub use self::alphabet::TransducerAlphabet; 16 | pub use self::symbol_transition::SymbolTransition; 17 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 18 | use crate::vfs::{self, Filesystem}; 19 | 20 | /// Error with transducer reading or processing. 21 | #[derive(Debug, thiserror::Error)] 22 | pub enum TransducerError { 23 | /// Error with mmapping 24 | #[error("Memory mapping error")] 25 | Memmap(#[source] std::io::Error), 26 | /// Error with input/output. 27 | #[error("IO error")] 28 | Io(#[source] std::io::Error), 29 | /// Error with FSA alphabets. 30 | #[error("Alphabet error")] 31 | Alphabet(#[source] Box), 32 | } 33 | 34 | impl TransducerError { 35 | /// Wrap into i/o error. 36 | pub fn into_io_error(self) -> std::io::Error { 37 | match self { 38 | TransducerError::Memmap(v) => v, 39 | TransducerError::Io(v) => v, 40 | TransducerError::Alphabet(v) => { 41 | std::io::Error::new(std::io::ErrorKind::Other, format!("{}", v)) 42 | } 43 | } 44 | } 45 | } 46 | 47 | /// A file-based finite-state transducer. 48 | /// 49 | /// This trait defines the interface for finite-state transducers that can be loaded 50 | /// from files and used for spell-checking and morphological analysis. 51 | /// 52 | /// Implementors can provide custom transducer formats beyond the built-in HFST and THFST formats. 53 | pub trait Transducer: Sized { 54 | /// file extension. 
55 | const FILE_EXT: &'static str; 56 | 57 | /// read a transducer from a file. 58 | fn from_path(fs: &FS, path: P) -> Result 59 | where 60 | P: AsRef, 61 | FS: Filesystem; 62 | 63 | /// get transducer's alphabet. 64 | fn alphabet(&self) -> &TransducerAlphabet; 65 | /// get transducer's alphabet as mutable reference. 66 | fn mut_alphabet(&mut self) -> &mut TransducerAlphabet; 67 | 68 | /// get input symbol number of given transition arc. 69 | fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option; 70 | /// check if there are transitions at given index. 71 | fn has_transitions(&self, i: TransitionTableIndex, s: Option) -> bool; 72 | /// get next transition with a symbol. 73 | fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option; 74 | /// check if there are free transitions at index. 75 | fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool; 76 | /// follow free transitions. 77 | fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option; 78 | /// follow epsilon transitions. 79 | fn take_epsilons(&self, i: TransitionTableIndex) -> Option; 80 | /// follow transitions with given symbol. 81 | fn take_non_epsilons( 82 | &self, 83 | i: TransitionTableIndex, 84 | symbol: SymbolNumber, 85 | ) -> Option; 86 | /// check if given index is an end state. 87 | fn is_final(&self, i: TransitionTableIndex) -> bool; 88 | /// get end state weight of a state. 89 | fn final_weight(&self, i: TransitionTableIndex) -> Option; 90 | } 91 | 92 | /// Transition table contains the arcs of the automaton (and states). 93 | pub trait TransitionTable: Sized { 94 | /// read transition table from a file. 95 | fn from_path(fs: &FS, path: P) -> Result 96 | where 97 | P: AsRef, 98 | FS: Filesystem; 99 | /// get input symbol of a transition. 100 | fn input_symbol(&self, i: TransitionTableIndex) -> Option; 101 | /// get output symbol of a transition. 
102 | fn output_symbol(&self, i: TransitionTableIndex) -> Option; 103 | /// get the target state in the index. 104 | fn target(&self, i: TransitionTableIndex) -> Option; 105 | /// get the weight of the transition. 106 | fn weight(&self, i: TransitionTableIndex) -> Option; 107 | 108 | /// check if the state is a final state. 109 | #[inline(always)] 110 | fn is_final(&self, i: TransitionTableIndex) -> bool { 111 | self.input_symbol(i) == None 112 | && self.output_symbol(i) == None 113 | && self.target(i) == Some(TransitionTableIndex(1)) 114 | } 115 | 116 | /// ??? 117 | #[inline(always)] 118 | fn symbol_transition(&self, i: TransitionTableIndex) -> SymbolTransition { 119 | SymbolTransition::new(self.target(i), self.output_symbol(i), self.weight(i)) 120 | } 121 | } 122 | 123 | /// Index table contains something. 124 | pub trait IndexTable: Sized { 125 | fn from_path(fs: &FS, path: P) -> Result 126 | where 127 | P: AsRef, 128 | FS: Filesystem; 129 | fn input_symbol(&self, i: TransitionTableIndex) -> Option; 130 | fn target(&self, i: TransitionTableIndex) -> Option; 131 | fn final_weight(&self, i: TransitionTableIndex) -> Option; 132 | 133 | #[inline(always)] 134 | fn is_final(&self, i: TransitionTableIndex) -> bool { 135 | self.input_symbol(i) == None && self.target(i) != None 136 | } 137 | } 138 | 139 | #[cfg(feature = "internal_convert")] 140 | pub mod convert; 141 | -------------------------------------------------------------------------------- /src/vfs.rs: -------------------------------------------------------------------------------- 1 | //! Some stuff for filesystems and different OSes. 
2 | use fs_extra::dir::CopyOptions; 3 | use memmap2::{Mmap, MmapOptions}; 4 | use std::fmt::Debug; 5 | use std::io::{Read, Result}; 6 | use std::path::Path; 7 | use tempfile::TempDir; 8 | 9 | #[cfg(unix)] 10 | use std::os::unix::fs::FileExt; 11 | 12 | pub trait Filesystem { 13 | type File: File; 14 | 15 | fn open_file>(&self, path: P) -> Result; 16 | fn copy_to_temp_dir>(&self, path: P) -> Result; 17 | } 18 | 19 | pub trait File: Read + Debug { 20 | fn len(&self) -> Result; 21 | fn is_empty(&self) -> Result; 22 | #[cfg(unix)] 23 | fn read_at(&self, buf: &mut [u8], offset: u64) -> Result; 24 | #[cfg(unix)] 25 | fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()>; 26 | unsafe fn memory_map(&self) -> Result; 27 | unsafe fn partial_memory_map(&self, offset: u64, len: usize) -> Result; 28 | } 29 | 30 | impl File for std::fs::File { 31 | fn len(&self) -> Result { 32 | self.metadata().map(|m| m.len()) 33 | } 34 | 35 | fn is_empty(&self) -> Result { 36 | self.len().map(|x| x == 0) 37 | } 38 | 39 | #[cfg(unix)] 40 | #[inline(always)] 41 | fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { 42 | FileExt::read_at(self, buf, offset) 43 | } 44 | 45 | #[cfg(unix)] 46 | #[inline(always)] 47 | fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> { 48 | FileExt::read_exact_at(self, buf, offset) 49 | } 50 | 51 | unsafe fn memory_map(&self) -> Result { 52 | unsafe { MmapOptions::new().map(self) } 53 | } 54 | 55 | unsafe fn partial_memory_map(&self, offset: u64, len: usize) -> Result { 56 | unsafe { MmapOptions::new().offset(offset).len(len).map(self) } 57 | } 58 | } 59 | 60 | /// File system. 
61 | pub struct Fs; 62 | 63 | impl Filesystem for Fs { 64 | type File = std::fs::File; 65 | 66 | #[inline(always)] 67 | fn open_file>(&self, path: P) -> Result { 68 | std::fs::File::open(&path) 69 | } 70 | 71 | fn copy_to_temp_dir>(&self, path: P) -> Result { 72 | let dir = tempfile::tempdir()?; 73 | fs_extra::copy_items(&[path.as_ref()], &dir, &CopyOptions::new()) 74 | .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; 75 | Ok(dir) 76 | } 77 | } 78 | 79 | /// Box file. 80 | pub mod boxf { 81 | use box_format::{BoxFileReader, BoxPath}; 82 | use std::io::{Read, Result}; 83 | use std::path::Path; 84 | use tempfile::TempDir; 85 | 86 | #[derive(Debug)] 87 | pub struct File { 88 | offset: u64, 89 | len: usize, 90 | file: std::fs::File, 91 | reader: std::io::Take, 92 | } 93 | 94 | impl Read for File { 95 | fn read(&mut self, buf: &mut [u8]) -> Result { 96 | self.reader.read(buf) 97 | } 98 | } 99 | 100 | impl<'a> super::File for File { 101 | fn len(&self) -> Result { 102 | Ok(self.len as u64) 103 | } 104 | 105 | fn is_empty(&self) -> Result { 106 | Ok(self.len == 0) 107 | } 108 | 109 | #[cfg(unix)] 110 | #[inline(always)] 111 | fn read_at(&self, buf: &mut [u8], offset: u64) -> Result { 112 | self.file.read_at(buf, self.offset + offset) 113 | } 114 | 115 | #[cfg(unix)] 116 | #[inline(always)] 117 | fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> { 118 | self.file.read_exact_at(buf, self.offset + offset) 119 | } 120 | 121 | unsafe fn memory_map(&self) -> Result { 122 | unsafe { 123 | memmap2::MmapOptions::new() 124 | .offset(self.offset) 125 | .len(self.len) 126 | .map(&self.file) 127 | } 128 | } 129 | 130 | unsafe fn partial_memory_map(&self, offset: u64, len: usize) -> Result { 131 | unsafe { 132 | memmap2::MmapOptions::new() 133 | .offset(self.offset + offset) 134 | .len(std::cmp::min(self.len - offset as usize, len)) 135 | .map(&self.file) 136 | } 137 | } 138 | } 139 | 140 | pub struct Filesystem<'a>(&'a BoxFileReader); 141 | 142 | 
impl<'a> Filesystem<'a> {
        /// Wrap a borrowed `BoxFileReader` as a [`super::Filesystem`].
        pub fn new(reader: &'a BoxFileReader) -> Filesystem<'a> {
            Filesystem(reader)
        }
    }

    impl<'a> super::Filesystem for Filesystem<'a> {
        type File = File;

        #[inline(always)]
        fn open_file<P: AsRef<Path>>(&self, path: P) -> Result<Self::File> {
            let boxpath = BoxPath::new(path).map_err(|e| e.as_io_error())?;
            let meta = self.0.metadata();
            let record = meta
                .inode(&boxpath)
                .and_then(|x| meta.record(x))
                .and_then(|r| r.as_file());

            // Resolve the record first so a missing entry does not cost a
            // redundant open of the archive file.
            let record = match record {
                Some(v) => v,
                None => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::NotFound,
                        "not found",
                    ));
                }
            };

            let file = std::fs::File::open(self.0.path())?;
            self.0.read_bytes(record).map(|reader| File {
                offset: record.data.get(),
                len: record.length as usize,
                file,
                reader,
            })
        }

        fn copy_to_temp_dir<P: AsRef<Path>>(&self, path: P) -> Result<TempDir> {
            let dir = tempfile::tempdir()?;
            let box_path =
                BoxPath::new(path).map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
            self.0
                .extract_recursive(&box_path, &dir)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
            Ok(dir)
        }
    }
}
--------------------------------------------------------------------------------
/crates/thfst-tools/src/main.rs:
--------------------------------------------------------------------------------
use clap::Parser;
use std::path::{Path, PathBuf};

use divvunspell::archive::{
    BoxSpellerArchive, SpellerArchive, ZipSpellerArchive, boxf::ThfstBoxSpellerArchive,
};
use divvunspell::transducer::{
    Transducer,
    convert::ConvertFile,
    hfst::HfstTransducer,
    thfst::{self, MemmapThfstTransducer},
};

use box_format::{BoxFileWriter, BoxPath, Compression};

#[derive(Debug, Parser)]
#[command(
    name = "thfst-tools",
    about = "Tromsø-Helsinki Finite State Transducer 
toolkit."
)]
enum Opts {
    /// Convert an HFST file to THFST
    HfstToThfst { from: PathBuf },

    /// Convert a ZHFST file to BHFST
    ZhfstToBhfst { from: PathBuf },

    /// Convert a THFST acceptor/errmodel pair to BHFST
    ThfstsToBhfst {
        acceptor: PathBuf,
        errmodel: PathBuf,
        output: PathBuf,
    },

    /// Print metadata for BHFST
    BhfstInfo { path: PathBuf },
}

// Byte alignment for entries written into `.box` archives (suits mmap reads).
const ALIGNMENT: u64 = 8;

/// Build the archive path `<dir-name>/<filename>` for an inserted file.
#[inline(always)]
fn boxpath(path: &Path, filename: &str) -> BoxPath {
    let path = Path::new(path.file_name().unwrap()).join(filename);
    BoxPath::new(path).unwrap()
}

/// Insert the on-disk file `path/name` into the box archive as `<dir-name>/<name>`.
#[inline(always)]
fn insert(
    boxfile: &mut BoxFileWriter,
    compression: Compression,
    path: &Path,
    name: &str,
) -> Result<(), std::io::Error> {
    use std::collections::HashMap;
    use std::io::BufReader;
    let file = std::fs::File::open(path.join(name))?;
    boxfile
        .insert(
            compression,
            boxpath(path, name),
            &mut BufReader::new(file),
            HashMap::new(),
        )
        .map(|_| ())
}

/// Insert the three files of a `.thfst` bundle (alphabet/index/transition)
/// into the archive under a directory named after the bundle.
#[inline(always)]
fn insert_thfst_files(boxfile: &mut BoxFileWriter, path: &Path) -> Result<(), std::io::Error> {
    let boxpath = BoxPath::new(path.file_name().unwrap()).unwrap();
    println!("Inserting \"{}\"...", &boxpath);

    boxfile.mkdir(boxpath, std::collections::HashMap::new())?;
    insert(boxfile, Compression::Stored, path, "alphabet")?;
    insert(boxfile, Compression::Stored, path, "index")?;
    insert(boxfile, Compression::Stored, path, "transition")
}

/// Convert a single HFST transducer to a `.thfst` file next to it.
fn convert_hfst_to_thfst(hfst_path: &Path) -> Result<(), std::io::Error> {
    let fs = divvunspell::vfs::Fs;
    let transducer = HfstTransducer::from_path(&fs, hfst_path).map_err(|e| e.into_io_error())?;
    println!(
        "Converting {:?} to {:?}...",
        &hfst_path.file_name().unwrap(),
        &hfst_path.with_extension("thfst").file_name().unwrap()
);

    thfst::ThfstTransducer::convert_file(&transducer, hfst_path)?;
    Ok(())
}

/// Package a validated THFST acceptor/errmodel pair into a `.bhfst` archive.
fn convert_thfsts_to_bhfst(
    acceptor_path: &Path,
    errmodel_path: &Path,
    output_path: &Path,
) -> Result<(), std::io::Error> {
    let fs = divvunspell::vfs::Fs;
    // Load both transducers up front purely to validate them: a malformed
    // input fails fast before any archive is written.
    let _acceptor_transducer =
        MemmapThfstTransducer::from_path(&fs, acceptor_path).map_err(|e| e.into_io_error())?;
    let _errmodel_transducer =
        MemmapThfstTransducer::from_path(&fs, errmodel_path).map_err(|e| e.into_io_error())?;

    let mut boxfile: BoxFileWriter = BoxFileWriter::create_with_alignment(output_path, ALIGNMENT)?;

    insert_thfst_files(&mut boxfile, acceptor_path)?;
    insert_thfst_files(&mut boxfile, errmodel_path)?;

    Ok(())
}

/// Convert a `.zhfst` zip speller archive into a `.bhfst` box archive.
fn convert_zhfst_to_bhfst(zhfst_path: &Path) -> Result<(), std::io::Error> {
    let zhfst_path = std::fs::canonicalize(zhfst_path)?;
    // Surface open failures as an error instead of panicking on a bad archive.
    let zhfst = ZipSpellerArchive::open(&zhfst_path)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;

    let dir = tempfile::tempdir()?;
    println!(
        "Unzipping {:?} to temporary directory...",
        zhfst_path.file_name().unwrap()
    );
    let unzip_output = std::process::Command::new("unzip")
        .current_dir(&dir)
        .args(&[&zhfst_path])
        .output()?;
    // `output()` only errors if `unzip` cannot be spawned; also check the
    // exit status so we do not continue with missing extracted files.
    if !unzip_output.status.success() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            format!("unzip failed with status {}", unzip_output.status),
        ));
    }

    let bhfst_path = zhfst_path.with_extension("bhfst");
    let mut boxfile: BoxFileWriter = BoxFileWriter::create_with_alignment(&bhfst_path, ALIGNMENT)?;

    let meta_json = match zhfst.metadata() {
        Some(metadata) => {
            println!("Converting \"index.xml\" to \"meta.json\"...");
            // Rewrite the transducer ids to their converted file names.
            let mut m = metadata.to_owned();
            m.acceptor_mut()
                .set_id(metadata.acceptor().id().replace(".hfst", ".thfst"));
            m.errmodel_mut()
                .set_id(metadata.errmodel().id().replace(".hfst", ".thfst"));
            Some(serde_json::to_string_pretty(&m)?)
}
        None => None,
    };

    let acceptor_path = dir.as_ref().join("acceptor.default.hfst");
    convert_hfst_to_thfst(&acceptor_path)?;
    insert_thfst_files(&mut boxfile, &acceptor_path.with_extension("thfst"))?;

    let errmodel_path = dir.as_ref().join("errmodel.default.hfst");
    convert_hfst_to_thfst(&errmodel_path)?;
    insert_thfst_files(&mut boxfile, &errmodel_path.with_extension("thfst"))?;

    if let Some(v) = meta_json {
        println!("Inserting \"meta.json\"...");
        boxfile.insert(
            Compression::Stored,
            BoxPath::new("meta.json").unwrap(),
            &mut std::io::Cursor::new(v),
            std::collections::HashMap::new(),
        )?;
    }

    println!("Wrote to {:?}.", bhfst_path);

    Ok(())
}

fn main() -> Result<(), std::io::Error> {
    let opts = Opts::parse();

    match opts {
        Opts::HfstToThfst { from } => convert_hfst_to_thfst(&from),
        Opts::ThfstsToBhfst {
            acceptor,
            errmodel,
            output,
        } => convert_thfsts_to_bhfst(&acceptor, &errmodel, &output),
        Opts::ZhfstToBhfst { from } => convert_zhfst_to_bhfst(&from),
        Opts::BhfstInfo { path } => {
            // Report a bad archive as an error instead of panicking.
            let ar: ThfstBoxSpellerArchive = BoxSpellerArchive::open(&path)
                .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
            println!("{:#?}", ar.metadata());
            Ok(())
        }
    }
}
--------------------------------------------------------------------------------
/src/transducer/thfst/index_table.rs:
--------------------------------------------------------------------------------
use std::ptr;

use memmap2::Mmap;

use crate::transducer::TransducerError;
use crate::types::{SymbolNumber, TransitionTableIndex, Weight};
use crate::vfs::{self, Filesystem};

/// Memory-mapped THFST index table, generic over the source file type.
#[derive(Debug)]
pub struct MemmapIndexTable<F: vfs::File> {
    buf: Mmap,
    pub(crate) size: TransitionTableIndex,
    _file: std::marker::PhantomData<F>,
}

// Each index entry is 8 bytes: the input symbol at offset 0 and the
// target / final-weight payload at offset 4.
const INDEX_TABLE_SIZE: usize = 8;

impl MemmapIndexTable { 19 | pub fn from_path_partial( 20 | fs: &FS, 21 | path: P, 22 | chunk: u64, 23 | total: u64, 24 | ) -> Result 25 | where 26 | P: AsRef, 27 | FS: Filesystem, 28 | { 29 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 30 | let len = file.len().map_err(TransducerError::Io)? / total; 31 | let buf = unsafe { 32 | file.partial_memory_map(chunk * len, len as usize) 33 | .map_err(TransducerError::Memmap)? 34 | }; 35 | let size = TransitionTableIndex((buf.len() / INDEX_TABLE_SIZE) as u32); 36 | Ok(MemmapIndexTable { 37 | buf, 38 | size, 39 | _file: std::marker::PhantomData::, 40 | }) 41 | } 42 | } 43 | 44 | impl crate::transducer::IndexTable for MemmapIndexTable { 45 | fn from_path(fs: &FS, path: P) -> Result 46 | where 47 | P: AsRef, 48 | FS: Filesystem, 49 | { 50 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 51 | let buf = unsafe { file.memory_map().map_err(TransducerError::Memmap)? }; 52 | let size = TransitionTableIndex((buf.len() / INDEX_TABLE_SIZE) as u32); 53 | Ok(MemmapIndexTable { 54 | buf, 55 | size, 56 | _file: std::marker::PhantomData::, 57 | }) 58 | } 59 | 60 | fn input_symbol(&self, i: TransitionTableIndex) -> Option { 61 | if i >= self.size { 62 | return None; 63 | } 64 | 65 | let index = INDEX_TABLE_SIZE * i.0 as usize; 66 | 67 | let input_symbol: SymbolNumber = 68 | unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 69 | 70 | if input_symbol == SymbolNumber::MAX { 71 | None 72 | } else { 73 | Some(input_symbol) 74 | } 75 | } 76 | 77 | fn target(&self, i: TransitionTableIndex) -> Option { 78 | if i >= self.size { 79 | return None; 80 | } 81 | 82 | let index = (INDEX_TABLE_SIZE * i.0 as usize) + 4; 83 | let target: TransitionTableIndex = 84 | unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 85 | 86 | if target == TransitionTableIndex::MAX { 87 | None 88 | } else { 89 | Some(target) 90 | } 91 | } 92 | 93 | fn final_weight(&self, i: TransitionTableIndex) -> Option { 94 | if i >= 
self.size { 95 | return None; 96 | } 97 | 98 | let index = (INDEX_TABLE_SIZE * i.0 as usize) + 4; 99 | let weight: Weight = unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 100 | 101 | Some(weight) 102 | } 103 | } 104 | 105 | #[cfg(unix)] 106 | mod unix { 107 | use super::*; 108 | 109 | use crate::transducer::IndexTable; 110 | use crate::transducer::TransducerError; 111 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 112 | use crate::vfs::{self, Filesystem}; 113 | 114 | pub struct FileIndexTable { 115 | file: F, 116 | size: TransitionTableIndex, 117 | } 118 | 119 | impl FileIndexTable { 120 | #[inline(always)] 121 | fn read_u16_at(&self, index: u64) -> u16 { 122 | let mut buf = [0u8; 2]; 123 | self.file 124 | .read_exact_at(&mut buf, index) 125 | .expect("failed to read u16"); 126 | u16::from_le_bytes(buf) 127 | } 128 | 129 | #[inline(always)] 130 | fn read_u32_at(&self, index: u64) -> u32 { 131 | let mut buf = [0u8; 4]; 132 | self.file 133 | .read_exact_at(&mut buf, index) 134 | .expect("failed to read u32"); 135 | u32::from_le_bytes(buf) 136 | } 137 | } 138 | 139 | impl IndexTable for FileIndexTable { 140 | fn from_path(fs: &FS, path: P) -> Result 141 | where 142 | P: AsRef, 143 | FS: Filesystem, 144 | { 145 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 146 | Ok(FileIndexTable { 147 | size: TransitionTableIndex(file.len().map_err(TransducerError::Io)? 
as u32), 148 | file, 149 | }) 150 | } 151 | 152 | fn input_symbol(&self, i: TransitionTableIndex) -> Option { 153 | if i >= self.size { 154 | return None; 155 | } 156 | 157 | let index = INDEX_TABLE_SIZE * i.0 as usize; 158 | 159 | let input_symbol = SymbolNumber(self.read_u16_at(index as u64)); 160 | 161 | if input_symbol == SymbolNumber::MAX { 162 | None 163 | } else { 164 | Some(input_symbol) 165 | } 166 | } 167 | 168 | fn target(&self, i: TransitionTableIndex) -> Option { 169 | if i >= self.size { 170 | return None; 171 | } 172 | 173 | let index = (INDEX_TABLE_SIZE * i.0 as usize) + 4; 174 | let target = TransitionTableIndex(self.read_u32_at(index as u64)); 175 | 176 | if target == TransitionTableIndex::MAX { 177 | None 178 | } else { 179 | Some(target) 180 | } 181 | } 182 | 183 | fn final_weight(&self, i: TransitionTableIndex) -> Option { 184 | if i >= self.size { 185 | return None; 186 | } 187 | 188 | let index = (INDEX_TABLE_SIZE * i.0 as usize) + 4; 189 | let x = self.read_u32_at(index as u64); 190 | let weight = Weight(f32::from_bits(x)); 191 | 192 | Some(weight) 193 | } 194 | } 195 | } 196 | 197 | #[cfg(unix)] 198 | pub use self::unix::FileIndexTable; 199 | -------------------------------------------------------------------------------- /src/tokenizer/case_handling.rs: -------------------------------------------------------------------------------- 1 | use itertools::Itertools; 2 | use smol_str::SmolStr; 3 | 4 | #[inline(always)] 5 | pub fn lower_case(s: &str) -> SmolStr { 6 | s.chars() 7 | .map(|c| c.to_lowercase().collect::()) 8 | .collect::() 9 | } 10 | 11 | #[inline(always)] 12 | pub fn upper_case(s: &str) -> SmolStr { 13 | s.chars() 14 | .map(|c| c.to_uppercase().collect::()) 15 | .collect::() 16 | } 17 | 18 | #[inline(always)] 19 | pub fn upper_first(s: &str) -> SmolStr { 20 | let mut c = s.chars(); 21 | match c.next() { 22 | None => SmolStr::new(""), 23 | Some(f) => SmolStr::from(f.to_uppercase().collect::() + c.as_str()), 24 | } 25 | } 26 | 27 | 
/// Lowercase only the first character of `s`, leaving the rest untouched.
/// Returns the empty string for empty input.
#[inline(always)]
pub fn lower_first(s: &str) -> SmolStr {
    let mut c = s.chars();
    match c.next() {
        None => SmolStr::new(""),
        Some(f) => SmolStr::from(f.to_lowercase().collect::<String>() + c.as_str()),
    }
}

/// Case category of a single character.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Case {
    Upper,
    Lower,
    /// Not a cased letter (digits, punctuation, caseless scripts).
    Neither,
}

impl Case {
    #[inline(always)]
    fn new(ch: char) -> Case {
        if ch.is_lowercase() {
            Case::Lower
        } else if ch.is_uppercase() {
            Case::Upper
        } else {
            Case::Neither
        }
    }
}

/// Detects "mixed case" words such as `McDonald` or `aB`.
///
/// Scans adjacent character pairs and accumulates a score: a lower→upper
/// transition scores 2, an upper→lower transition scores 1. Any non-cased
/// character anywhere (or as the first character) makes the word not mixed
/// case. The word counts as mixed case when the score exceeds 1, so a single
/// leading capital (`Ab`), all-caps (`AB`), or all-lower (`ab`) are excluded.
pub fn is_mixed_case(word: &str) -> bool {
    let mut chars = word.chars();
    let mut last_case = match chars.next() {
        Some(ch) => Case::new(ch),
        None => return false,
    };

    if last_case == Case::Neither {
        return false;
    }

    let mut case_changes = 0;

    for ch in chars {
        let next_case = Case::new(ch);

        match (last_case, next_case) {
            (_, Case::Neither) => return false,
            // BUG FIX: this arm was `(_, Case::Upper)`, which made any
            // upper-after-first-position score 2 and classified "AB",
            // "MCDONALD" and "SGPai" as mixed case, contradicting the tests
            // below. Only a lower→upper transition scores 2.
            (Case::Lower, Case::Upper) => case_changes += 2,
            (Case::Upper, Case::Lower) => case_changes += 1,
            _ => {}
        }

        last_case = next_case;
    }

    case_changes > 1
}

/// True when uppercasing the word leaves it unchanged (also true for words
/// with no cased letters at all).
pub fn is_all_caps(word: &str) -> bool {
    upper_case(word) == word
}

/// True when the word's first character is already uppercase (or caseless).
pub fn is_first_caps(word: &str) -> bool {
    upper_first(word) == word
}

/// How the original input was cased, so suggestions can be re-cased to match.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CaseMutation {
    FirstCaps,
    AllCaps,
    None,
}

/// How results from the generated variants should be combined.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CaseMode {
    FirstResults,
    MergeAll,
}

/// The case-handling plan for one input word: the original input, its
/// detected casing, the merge mode, and the candidate variants to look up.
#[derive(Debug, Clone)]
pub struct CaseHandler {
    pub original_input: SmolStr,
    pub mutation: CaseMutation,
    pub mode: CaseMode,
    pub words: Vec<SmolStr>,
}

fn mixed_case_word_variants(word: &str) -> CaseHandler {
    // The input string should be accepted IFF it is accepted exactly as given,
    // or with the initial letter downcased, or all upper.
    //
    // Crucially, it should not be accepted if it is only accepted when all lowercased.

    let mut words = vec![];
    if is_first_caps(word) {
        words.push(lower_first(word));
    } else {
        let upper = upper_first(word);
        // Edge case of "sOMETHING"
        if !is_all_caps(&upper) {
            words.push(upper);
        }
    }

    CaseHandler {
        original_input: word.into(),
        mutation: if is_first_caps(word) {
            CaseMutation::FirstCaps
        } else {
            CaseMutation::None
        },
        mode: CaseMode::FirstResults,
        words,
    }
}

/// Builds the [`CaseHandler`] for `word`: mixed-case words get the special
/// treatment above; otherwise the variant list is the word itself, a
/// first-capitalised form of any all-caps word, and the lowercased forms,
/// deduplicated in that order.
pub fn word_variants(word: &str) -> CaseHandler {
    if is_mixed_case(word) {
        return mixed_case_word_variants(word);
    }

    let word = SmolStr::new(word);
    let mut base: Vec<SmolStr> = vec![];

    // For all-caps input, add "Firstcaps" of the lowercased word.
    base.append(
        &mut std::iter::once(&word)
            .chain(base.iter())
            .filter(|x| is_all_caps(x))
            .map(|x| upper_first(&lower_case(x)))
            .collect(),
    );

    // Add fully lowercased forms of the word and everything collected so far.
    base.append(
        &mut std::iter::once(&word)
            .chain(base.iter())
            .map(|x| lower_case(x))
            .collect(),
    );

    let words = base.into_iter().unique().collect();

    let (mutation, mode) = if is_all_caps(&word) {
        (CaseMutation::AllCaps, CaseMode::MergeAll)
    } else if is_first_caps(&word) {
        (CaseMutation::FirstCaps, CaseMode::MergeAll)
    } else {
        (CaseMutation::None, CaseMode::MergeAll)
    };

    CaseHandler {
        original_input: word.into(),
        mode,
        mutation,
        words,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test() {
        let _a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
            .chars()
            .map(|c| SmolStr::from(c.to_string()))
            .collect::<Vec<SmolStr>>();
        // println!("{:?}", word_variants(&a, "FOO"));
        // println!("{:?}", word_variants(&a, "Giella"));
        // println!("{:?}", word_variants(&a, "abc"));
        // println!("{:?}", word_variants(&a, "$GIELLA$"));
    }

    #[test]
    fn variants() {
        assert_eq!(word_variants("IDENTITETE").mutation, CaseMutation::AllCaps);
        assert_eq!(
            word_variants("Identitete").mutation,
            CaseMutation::FirstCaps
        );
    }

    #[test]
    fn mixed_case() {
        assert_eq!(is_mixed_case("McDonald"), true);
        assert_eq!(is_mixed_case("Mcdonald"), false);
        assert_eq!(is_mixed_case("McDoNaLd"), true);
        assert_eq!(is_mixed_case("MCDONALD"), false);
        assert_eq!(is_mixed_case("mcDonald"), true);
        assert_eq!(is_mixed_case("mcdonald"), false);

        assert_eq!(is_mixed_case("ab"), false);
        assert_eq!(is_mixed_case("aB"), true);
        assert_eq!(is_mixed_case("Ab"), false);
        assert_eq!(is_mixed_case("AB"), false);

        assert_eq!(is_mixed_case("A"), false);
        assert_eq!(is_mixed_case("a"), false);
        assert_eq!(is_mixed_case("aS:"), false);
        assert_eq!(is_mixed_case(":"), false);

        assert_eq!(is_mixed_case("DavveVássján"), true);
        assert_eq!(is_mixed_case("davveVássján"), true);
        assert_eq!(is_mixed_case("Davvevássján"), false);

        assert_eq!(is_mixed_case("SGPai"), false);
        assert_eq!(is_mixed_case("SgPaI"), true);
        assert_eq!(is_mixed_case("SGPaiSGP"), true);
        assert_eq!(is_mixed_case("sgpAI"), true);
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# divvunspell

[![CI](https://builds.giellalt.org/api/badge/divvunspell)](https://builds.giellalt.org/pipelines/divvunspell)
[![Crates.io](https://img.shields.io/crates/v/divvunspell.svg)](https://crates.io/crates/divvunspell)
[![Documentation](https://docs.rs/divvunspell/badge.svg)](https://docs.rs/divvunspell) 6 | 7 | A fast, feature-rich spell checking library and toolset for HFST-based spell checkers. Written in Rust, divvunspell is a modern reimplementation and extension of [hfst-ospell](https://github.com/hfst/hfst-ospell) with additional features like parallel processing, comprehensive tokenization, case handling, and morphological analysis. 8 | 9 | ## Features 10 | 11 | - **High Performance**: Memory-mapped transducers and parallel suggestion generation 12 | - **ZHFST/BHFST Support**: Load standard HFST spell checker archives 13 | - **Smart Tokenization**: Unicode-aware word boundary detection with customizable alphabets 14 | - **Case Handling**: Intelligent case preservation and suggestion recasing 15 | - **Morphological Analysis**: Extract and filter suggestions based on morphological tags 16 | - **Cross-Platform**: Works on macOS, Linux, Windows, iOS and Android 17 | 18 | ## Quick Start 19 | 20 | ### As a Command-Line Tool 21 | 22 | ```sh 23 | # Install the CLI 24 | cargo install divvunspell-cli 25 | 26 | # Check spelling and get suggestions 27 | divvunspell suggest --archive speller.zhfst --json "sámi" 28 | ``` 29 | 30 | ### As a Rust Library 31 | 32 | Add to your `Cargo.toml`: 33 | 34 | ```toml 35 | [dependencies] 36 | divvunspell = "1.0.0-beta.7" 37 | ``` 38 | 39 | Basic usage: 40 | 41 | ```rust 42 | use divvunspell::archive::{SpellerArchive, ZipSpellerArchive}; 43 | use divvunspell::speller::{Speller, SpellerConfig, OutputMode}; 44 | 45 | // Load a spell checker archive 46 | let archive = ZipSpellerArchive::open("language.zhfst")?; 47 | let speller = archive.speller(); 48 | 49 | // Check if a word is correct 50 | if !speller.clone().is_correct("wordd") { 51 | // Get spelling suggestions 52 | let config = SpellerConfig::default(); 53 | let suggestions = speller.clone().suggest("wordd"); 54 | 55 | for suggestion in suggestions { 56 | println!("{} (weight: {})", 
suggestion.value, suggestion.weight); 57 | } 58 | } 59 | 60 | // Morphological analysis 61 | let analyses = speller.analyze_input("running"); 62 | for analysis in analyses { 63 | println!("{}", analysis.value); // e.g., "run+V+PresPartc" 64 | } 65 | ``` 66 | 67 | ## Command-Line Tools 68 | 69 | ### divvunspell 70 | 71 | The main spell checking tool with support for suggestions, analysis, and tokenization. 72 | 73 | ```sh 74 | # Get suggestions for a word 75 | divvunspell suggest --archive language.zhfst "wordd" 76 | 77 | # Always show suggestions even for correct words 78 | divvunspell suggest --archive language.zhfst --always-suggest "word" 79 | 80 | # Limit number and weight of suggestions 81 | divvunspell suggest --archive language.zhfst --nbest 5 --weight 20.0 "wordd" 82 | 83 | # JSON output 84 | divvunspell suggest --archive language.zhfst --json "wordd" 85 | 86 | # Tokenize text 87 | divvunspell tokenize --archive language.zhfst "This is some text." 88 | 89 | # Analyze word forms morphologically 90 | divvunspell analyze-input --archive language.zhfst "running" 91 | divvunspell analyze-output --archive language.zhfst "runing" 92 | ``` 93 | 94 | **Options:** 95 | - `-a, --archive ` - BHFST or ZHFST archive to use 96 | - `-S, --always-suggest` - Show suggestions even if word is correct 97 | - `-w, --weight ` - Maximum weight limit for suggestions 98 | - `-n, --nbest ` - Maximum number of suggestions to return 99 | - `--no-reweighting` - Disable suggestion reweighting (closer to hfst-ospell behavior) 100 | - `--no-recase` - Disable case-aware suggestion handling 101 | - `--json` - Output results as JSON 102 | 103 | **Debugging:** 104 | 105 | Set `RUST_LOG=trace` to enable detailed logging: 106 | 107 | ```sh 108 | RUST_LOG=trace divvunspell suggest --archive language.zhfst "wordd" 109 | ``` 110 | 111 | ### thfst-tools 112 | 113 | Convert HFST and ZHFST files to optimized THFST and BHFST formats. 
**THFST** (Tromsø-Helsinki FST): A byte-aligned HFST format optimized for fast loading and memory mapping, required for ARM processors.

**BHFST** (Box HFST): THFST files packaged in a [box](https://github.com/bbqsrc/box) container with JSON metadata for efficient processing.

```sh
# Convert HFST to THFST
thfst-tools hfst-to-thfst acceptor.hfst acceptor.thfst

# Convert ZHFST to BHFST (recommended for distribution)
thfst-tools zhfst-to-bhfst language.zhfst language.bhfst

# Convert THFST pair to BHFST
thfst-tools thfsts-to-bhfst --acceptor acceptor.thfst --errmodel errmodel.thfst output.bhfst

# View BHFST metadata
thfst-tools bhfst-info language.bhfst
```

### accuracy

Test spell checker accuracy against known typo/correction pairs.

```sh
# Install
cd crates/accuracy
cargo install --path .

# Run accuracy test
accuracy typos.tsv language.zhfst

# Save detailed JSON report
accuracy -o report.json typos.tsv language.zhfst

# Limit test size and save TSV summary
accuracy -w 1000 -t results.tsv typos.tsv language.zhfst

# Use custom config
accuracy -c config.json typos.tsv language.zhfst
```

**Input format** (`typos.tsv`): Tab-separated values with typo in first column, expected correction in second:

```
wordd	word
recieve	receive
teh	the
```

**Accuracy viewer** (prototype web UI):

```sh
accuracy -o support/accuracy-viewer/public/report.json typos.tsv language.zhfst
cd support/accuracy-viewer
npm i && npm run dev
# Open http://localhost:5000
```

## Building from Source

### Install Rust

```sh
curl https://sh.rustup.rs -sSf | sh
source $HOME/.cargo/env
rustup default stable
```

182 | ### Build Everything 183 | 184 | ```sh 185 | # Build all crates 186 | cargo build --release 187 | 188 | # Install specific tools 189 | cargo install --path ./cli # divvunspell CLI 190 | cargo install --path ./crates/thfst-tools 191 | cargo install --path ./crates/accuracy 192 | ``` 193 | 194 | ### Run Tests 195 | 196 | ```sh 197 | cargo test 198 | ``` 199 | 200 | ## Documentation 201 | 202 | - **API Documentation**: [docs.rs/divvunspell](https://docs.rs/divvunspell) 203 | - **GitHub Pages**: [divvun.github.io/divvunspell](https://divvun.github.io/divvunspell/) 204 | 205 | ## License 206 | 207 | The **divvunspell library** is dual-licensed under: 208 | 209 | - Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 210 | - MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 211 | 212 | You may choose either license for library use. 213 | 214 | The **command-line tools** (`divvunspell`, `thfst-tools`, `accuracy`) are licensed under **GPL-3.0** ([LICENSE-GPL](LICENSE-GPL)). 215 | -------------------------------------------------------------------------------- /support/accuracy-viewer/src/App.svelte: -------------------------------------------------------------------------------- 1 | 135 | 136 | 205 | 206 | {#if report != null} 207 |

Speller configuration:

208 |
209 | {JSON.stringify(report.config, null, 2)}
210 | 
211 | 212 | 213 | 214 | 215 | 218 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 |
Words: {report.results.length} 216 | Words per second 217 | 219 | Total runtime 220 |
Real{wordsPerSecond(report.total_time)}{humanTime(report.total_time)}
CPU
(linear "user" time)
{wordsPerSecond(totalRuntime)}{humanTime(totalRuntime)}
233 | 234 |
    235 |
  • 236 | % in 1st position: {firstPosition()}% 237 |
  • 238 |
  • 239 | % in top 5: {topFive()}% 240 |
  • 241 |
  • 242 | % anywhere: {anywhere()}% 243 |
  • 244 |
  • 245 | % no suggestions: {noSuggestions()}% 246 |
  • 247 |
  • 248 | % only wrong: {onlyWrong()}% 249 |
  • 250 |
251 | 252 | {#if sortMode == null} 253 |

Sorted by input order

254 | {:else if sortMode === "time:asc"} 255 |

Sorted by time, ascending

256 | {:else if sortMode === "time:desc"} 257 |

Sorted by time, descending

258 | {:else} 259 |

Sorted in some unknown way (this is a bug)

260 | {/if} 261 | 262 | {/if} 263 | 264 | {#if results == null} 265 | Loading 266 | {:else} 267 | Sort by Time 268 | 269 | 270 | {#each results as result} 271 | 272 | 292 | 308 | 309 | {/each} 310 | 311 |
273 |

274 | {result.input} 275 | → 276 | {result.expected} 277 |

278 |

279 | Result: 280 | {#if result.position === null} 281 | Not in suggestions 282 | {:else if result.position === 0} 283 | Top suggestion 284 | {:else} 285 | Suggestion {result.position + 1} 286 | {/if} 287 |

288 |

289 | Time: {humanTimeMillis(result.time)} 290 |

291 |
293 | {#if result.suggestions.length > 0} 294 |
    295 | {#each result.suggestions as suggestion, i} 296 |
  1. 297 | 298 | {suggestion.value} 299 | 300 | {suggestion.weight} 301 |
  2. 302 | {/each} 303 |
304 | {:else} 305 | No suggestions 306 | {/if} 307 |
312 | {/if} -------------------------------------------------------------------------------- /src/transducer/hfst/mod.rs: -------------------------------------------------------------------------------- 1 | //! Finite-state automaton in HFST format. 2 | pub mod alphabet; 3 | pub mod header; 4 | pub mod index_table; 5 | pub mod transition_table; 6 | 7 | use std::fmt; 8 | use std::path::Path; 9 | use std::sync::Arc; 10 | 11 | use memmap2::Mmap; 12 | 13 | use self::alphabet::TransducerAlphabetParser; 14 | use self::header::TransducerHeader; 15 | pub use self::index_table::MappedIndexTable; 16 | pub use self::transition_table::MappedTransitionTable; 17 | use super::alphabet::TransducerAlphabet; 18 | use super::symbol_transition::SymbolTransition; 19 | use super::{Transducer, TransducerError}; 20 | use crate::constants::{INDEX_TABLE_SIZE, TARGET_TABLE}; 21 | use crate::types::{HeaderFlag, SymbolNumber, TransitionTableIndex, Weight}; 22 | use crate::vfs::{self, Filesystem}; 23 | 24 | pub struct HfstTransducer 25 | where 26 | F: vfs::File, 27 | { 28 | buf: Arc, 29 | header: TransducerHeader, 30 | alphabet: TransducerAlphabet, 31 | pub(crate) index_table: MappedIndexTable, 32 | pub(crate) transition_table: MappedTransitionTable, 33 | _file: std::marker::PhantomData, 34 | } 35 | 36 | impl fmt::Debug for HfstTransducer { 37 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 38 | writeln!(f, "{:?}", self.header)?; 39 | writeln!(f, "{:?}", self.alphabet)?; 40 | writeln!(f, "{:?}", self.index_table)?; 41 | writeln!(f, "{:?}", self.transition_table)?; 42 | Ok(()) 43 | } 44 | } 45 | 46 | impl HfstTransducer { 47 | #[inline(always)] 48 | pub fn from_mapped_memory(buf: Arc) -> HfstTransducer { 49 | let header = TransducerHeader::new(&buf); 50 | let alphabet_offset = header.len(); 51 | let alphabet = TransducerAlphabetParser::parse( 52 | &buf[alphabet_offset..buf.len()], 53 | header.symbol_count(), 54 | ); 55 | 56 | let index_table_offset = alphabet_offset + alphabet.len(); 57 
| 58 | let index_table_end = 59 | index_table_offset + INDEX_TABLE_SIZE * header.index_table_size().0 as usize; 60 | let index_table = MappedIndexTable::new( 61 | buf.clone(), 62 | index_table_offset, 63 | index_table_end, 64 | header.index_table_size(), 65 | ); 66 | 67 | let trans_table = 68 | MappedTransitionTable::new(buf.clone(), index_table_end, header.target_table_size()); 69 | 70 | HfstTransducer { 71 | buf, 72 | header, 73 | alphabet, 74 | index_table, 75 | transition_table: trans_table, 76 | _file: std::marker::PhantomData::, 77 | } 78 | } 79 | 80 | #[inline(always)] 81 | pub fn buffer(&self) -> &[u8] { 82 | &self.buf 83 | } 84 | 85 | #[inline(always)] 86 | pub fn is_weighted(&self) -> bool { 87 | self.header.has_flag(HeaderFlag::Weighted) 88 | } 89 | 90 | #[inline(always)] 91 | pub fn header(&self) -> &TransducerHeader { 92 | &self.header 93 | } 94 | } 95 | 96 | impl Transducer for HfstTransducer { 97 | const FILE_EXT: &'static str = "hfst"; 98 | 99 | fn from_path(fs: &FS, path: P) -> Result, TransducerError> 100 | where 101 | P: AsRef, 102 | FS: Filesystem, 103 | { 104 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 105 | let mmap = unsafe { file.memory_map() }.map_err(TransducerError::Memmap)?; 106 | Ok(HfstTransducer::from_mapped_memory(Arc::new(mmap))) 107 | } 108 | 109 | #[inline(always)] 110 | fn is_final(&self, i: TransitionTableIndex) -> bool { 111 | if i >= TARGET_TABLE { 112 | self.transition_table.is_final(i - TARGET_TABLE) 113 | } else { 114 | self.index_table.is_final(i) 115 | } 116 | } 117 | 118 | #[inline(always)] 119 | fn final_weight(&self, i: TransitionTableIndex) -> Option { 120 | if i >= TARGET_TABLE { 121 | self.transition_table.weight(i - TARGET_TABLE) 122 | } else { 123 | self.index_table.final_weight(i) 124 | } 125 | } 126 | 127 | #[inline(always)] 128 | fn has_transitions(&self, i: TransitionTableIndex, s: Option) -> bool { 129 | let sym = match s { 130 | Some(v) => v, 131 | None => return false, 132 | }; 133 | 134 
| if i >= TARGET_TABLE { 135 | match self.transition_table.input_symbol(i - TARGET_TABLE) { 136 | Some(res) => sym == res, 137 | None => false, 138 | } 139 | } else { 140 | match self 141 | .index_table 142 | .input_symbol(i + TransitionTableIndex(sym.0 as u32)) 143 | { 144 | Some(res) => sym == res, 145 | None => false, 146 | } 147 | } 148 | } 149 | 150 | #[inline(always)] 151 | fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool { 152 | if i >= TARGET_TABLE { 153 | match self.transition_table.input_symbol(i - TARGET_TABLE) { 154 | Some(sym) => sym == SymbolNumber::ZERO || self.alphabet.is_flag(sym), 155 | None => false, 156 | } 157 | } else if let Some(SymbolNumber::ZERO) = self.index_table.input_symbol(i) { 158 | true 159 | } else { 160 | false 161 | } 162 | } 163 | 164 | #[inline(always)] 165 | fn take_epsilons(&self, i: TransitionTableIndex) -> Option { 166 | if let Some(SymbolNumber::ZERO) = self.transition_table.input_symbol(i) { 167 | Some(self.transition_table.symbol_transition(i)) 168 | } else { 169 | None 170 | } 171 | } 172 | 173 | #[inline(always)] 174 | fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option { 175 | if let Some(sym) = self.transition_table.input_symbol(i) { 176 | if sym != SymbolNumber::ZERO && !self.alphabet.is_flag(sym) { 177 | None 178 | } else { 179 | Some(self.transition_table.symbol_transition(i)) 180 | } 181 | } else { 182 | None 183 | } 184 | } 185 | 186 | #[inline(always)] 187 | fn take_non_epsilons( 188 | &self, 189 | i: TransitionTableIndex, 190 | symbol: SymbolNumber, 191 | ) -> Option { 192 | if let Some(input_sym) = self.transition_table.input_symbol(i) { 193 | if input_sym != symbol { 194 | None 195 | } else { 196 | Some(self.transition_table.symbol_transition(i)) 197 | } 198 | } else { 199 | None 200 | } 201 | } 202 | 203 | #[inline(always)] 204 | fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option { 205 | if i >= TARGET_TABLE { 206 | Some(i - TARGET_TABLE + 
TransitionTableIndex::ONE) 207 | } else if let Some(v) = self 208 | .index_table 209 | .target(i + TransitionTableIndex(symbol.0 as u32 + 1)) 210 | { 211 | Some(v - TARGET_TABLE) 212 | } else { 213 | None 214 | } 215 | } 216 | 217 | #[inline(always)] 218 | fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option { 219 | self.transition_table.input_symbol(i) 220 | } 221 | 222 | #[inline(always)] 223 | fn alphabet(&self) -> &TransducerAlphabet { 224 | &self.alphabet 225 | } 226 | 227 | #[inline(always)] 228 | fn mut_alphabet(&mut self) -> &mut TransducerAlphabet { 229 | &mut self.alphabet 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /src/transducer/thfst/transition_table.rs: -------------------------------------------------------------------------------- 1 | use std::{mem, ptr}; 2 | 3 | use crate::transducer::TransducerError; 4 | use crate::transducer::TransitionTable; 5 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 6 | use crate::vfs::{self, Filesystem}; 7 | use memmap2::Mmap; 8 | 9 | #[derive(Debug)] 10 | pub struct MemmapTransitionTable { 11 | buf: Mmap, 12 | pub(crate) size: TransitionTableIndex, 13 | _file: std::marker::PhantomData, 14 | } 15 | 16 | const TRANS_TABLE_SIZE: usize = 12; 17 | 18 | impl MemmapTransitionTable { 19 | pub fn from_path_partial( 20 | fs: &FS, 21 | path: P, 22 | chunk: u64, 23 | total: u64, 24 | ) -> Result 25 | where 26 | P: AsRef, 27 | FS: Filesystem, 28 | { 29 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 30 | let len = file.len().map_err(TransducerError::Io)? / total; 31 | let buf = unsafe { 32 | file.partial_memory_map(chunk * len, len as usize) 33 | .map_err(TransducerError::Memmap)? 
34 | }; 35 | let size = TransitionTableIndex((buf.len() / TRANS_TABLE_SIZE) as u32); 36 | Ok(MemmapTransitionTable { 37 | buf, 38 | size, 39 | _file: std::marker::PhantomData::, 40 | }) 41 | } 42 | 43 | #[inline] 44 | fn read_symbol_from_cursor(&self, index: usize) -> Option { 45 | let x = unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 46 | if x == SymbolNumber::MAX { 47 | None 48 | } else { 49 | Some(x) 50 | } 51 | } 52 | } 53 | 54 | impl TransitionTable for MemmapTransitionTable { 55 | fn from_path(fs: &FS, path: P) -> Result 56 | where 57 | P: AsRef, 58 | FS: Filesystem, 59 | { 60 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 61 | let buf = unsafe { file.memory_map() }.map_err(TransducerError::Memmap)?; 62 | let size = (buf.len() / TRANS_TABLE_SIZE) as u32; 63 | Ok(MemmapTransitionTable { 64 | buf, 65 | size: TransitionTableIndex(size), 66 | _file: std::marker::PhantomData::, 67 | }) 68 | } 69 | 70 | fn input_symbol(&self, i: TransitionTableIndex) -> Option { 71 | if i >= self.size { 72 | return None; 73 | } 74 | 75 | let index = TRANS_TABLE_SIZE as usize * i.0 as usize; 76 | self.read_symbol_from_cursor(index) 77 | } 78 | 79 | fn output_symbol(&self, i: TransitionTableIndex) -> Option { 80 | if i >= self.size { 81 | return None; 82 | } 83 | 84 | let index = ((TRANS_TABLE_SIZE * i.0 as usize) + mem::size_of::()) as usize; 85 | self.read_symbol_from_cursor(index) 86 | } 87 | 88 | fn target(&self, i: TransitionTableIndex) -> Option { 89 | if i >= self.size { 90 | return None; 91 | } 92 | 93 | let index = (TRANS_TABLE_SIZE * i.0 as usize) + (2 * mem::size_of::()); 94 | 95 | let x: TransitionTableIndex = 96 | unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 97 | if x == TransitionTableIndex::MAX { 98 | None 99 | } else { 100 | Some(x) 101 | } 102 | } 103 | 104 | fn weight(&self, i: TransitionTableIndex) -> Option { 105 | if i >= self.size { 106 | return None; 107 | } 108 | 109 | let index = (TRANS_TABLE_SIZE * i.0 as 
usize) 110 | + (2 * mem::size_of::()) 111 | + mem::size_of::(); 112 | 113 | let x: Weight = unsafe { ptr::read(self.buf.as_ptr().add(index) as *const _) }; 114 | 115 | Some(x) 116 | } 117 | } 118 | 119 | #[cfg(unix)] 120 | mod unix { 121 | use super::*; 122 | 123 | use crate::transducer::TransducerError; 124 | use crate::transducer::TransitionTable; 125 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 126 | use crate::vfs::{self, Filesystem}; 127 | 128 | pub struct FileTransitionTable { 129 | file: F, 130 | size: TransitionTableIndex, 131 | } 132 | 133 | impl FileTransitionTable { 134 | #[inline(always)] 135 | fn read_u16_at(&self, index: u64) -> u16 { 136 | let mut buf = [0u8; 2]; 137 | self.file 138 | .read_exact_at(&mut buf, index) 139 | .expect("failed to read u16"); 140 | u16::from_le_bytes(buf) 141 | } 142 | 143 | #[inline(always)] 144 | fn read_u32_at(&self, index: u64) -> u32 { 145 | let mut buf = [0u8; 4]; 146 | self.file 147 | .read_exact_at(&mut buf, index) 148 | .expect("failed to read u32"); 149 | u32::from_le_bytes(buf) 150 | } 151 | } 152 | 153 | impl TransitionTable for FileTransitionTable { 154 | fn from_path(fs: &FS, path: P) -> Result 155 | where 156 | P: AsRef, 157 | FS: Filesystem, 158 | { 159 | let file = fs.open_file(path).map_err(TransducerError::Io)?; 160 | Ok(FileTransitionTable { 161 | size: TransitionTableIndex(file.len().map_err(TransducerError::Io)? 
as u32), 162 | file, 163 | }) 164 | } 165 | 166 | #[inline(always)] 167 | fn input_symbol(&self, i: TransitionTableIndex) -> Option { 168 | if i >= self.size { 169 | return None; 170 | } 171 | 172 | let index = TRANS_TABLE_SIZE as usize * i.0 as usize; 173 | let x = SymbolNumber(self.read_u16_at(index as u64)); 174 | if x == SymbolNumber::MAX { 175 | None 176 | } else { 177 | Some(x) 178 | } 179 | } 180 | 181 | #[inline(always)] 182 | fn output_symbol(&self, i: TransitionTableIndex) -> Option { 183 | if i >= self.size { 184 | return None; 185 | } 186 | 187 | let index = 188 | ((TRANS_TABLE_SIZE * i.0 as usize) + mem::size_of::()) as usize; 189 | let x = SymbolNumber(self.read_u16_at(index as u64)); 190 | if x == SymbolNumber::MAX { 191 | None 192 | } else { 193 | Some(x) 194 | } 195 | } 196 | 197 | #[inline(always)] 198 | fn target(&self, i: TransitionTableIndex) -> Option { 199 | if i >= self.size { 200 | return None; 201 | } 202 | 203 | let index = (TRANS_TABLE_SIZE * i.0 as usize) + (2 * mem::size_of::()); 204 | 205 | let x = TransitionTableIndex(self.read_u32_at(index as u64)); 206 | if x == TransitionTableIndex::MAX { 207 | None 208 | } else { 209 | Some(x) 210 | } 211 | } 212 | 213 | #[inline(always)] 214 | fn weight(&self, i: TransitionTableIndex) -> Option { 215 | if i >= self.size { 216 | return None; 217 | } 218 | 219 | let index = (TRANS_TABLE_SIZE * i.0 as usize) 220 | + (2 * mem::size_of::()) 221 | + mem::size_of::(); 222 | let x = self.read_u32_at(index as u64); 223 | let x = Weight(f32::from_bits(x)); 224 | Some(x) 225 | } 226 | } 227 | } 228 | 229 | #[cfg(unix)] 230 | pub use self::unix::FileTransitionTable; 231 | -------------------------------------------------------------------------------- /src/transducer/thfst/mod.rs: -------------------------------------------------------------------------------- 1 | //! Finite-state automaton in optimised mmapped format. 2 | // We manually ensure alignment of reads in this file. 
3 | #![allow(clippy::cast_ptr_alignment)] 4 | 5 | use std::path::Path; 6 | use std::{u16, u32}; 7 | 8 | use crate::constants::TARGET_TABLE; 9 | use crate::transducer::{TransducerError, symbol_transition::SymbolTransition}; 10 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 11 | use serde::{Deserialize, Serialize}; 12 | 13 | mod chunked; 14 | mod index_table; 15 | mod transition_table; 16 | 17 | pub use self::chunked::{MemmapThfstChunkedTransducer, ThfstChunkedTransducer}; 18 | pub use self::index_table::MemmapIndexTable; 19 | pub use self::transition_table::MemmapTransitionTable; 20 | 21 | pub type MemmapThfstTransducer = 22 | ThfstTransducer, MemmapTransitionTable, F>; 23 | 24 | #[cfg(unix)] 25 | pub type FileThfstTransducer = ThfstTransducer< 26 | self::index_table::FileIndexTable, 27 | self::transition_table::FileTransitionTable, 28 | F, 29 | >; 30 | 31 | use crate::transducer::{Transducer, TransducerAlphabet}; 32 | use crate::vfs::{self, Filesystem}; 33 | 34 | #[repr(C)] 35 | pub(crate) union WeightOrTarget { 36 | target: u32, 37 | weight: f32, 38 | } 39 | 40 | #[repr(C)] 41 | pub struct IndexTableRecord { 42 | input_symbol: u16, 43 | #[doc(hidden)] 44 | __padding: u16, 45 | weight_or_target: WeightOrTarget, 46 | } 47 | 48 | #[repr(C)] 49 | pub struct TransitionTableRecord { 50 | input_symbol: u16, 51 | output_symbol: u16, 52 | weight_or_target: WeightOrTarget, 53 | } 54 | 55 | #[derive(Serialize, Deserialize)] 56 | pub struct MetaRecord { 57 | pub index_table_count: usize, 58 | pub transition_table_count: usize, 59 | pub chunk_size: usize, 60 | pub alphabet: TransducerAlphabet, 61 | } 62 | 63 | pub struct ThfstTransducer 64 | where 65 | I: crate::transducer::IndexTable, 66 | T: crate::transducer::TransitionTable, 67 | F: vfs::File, 68 | { 69 | index_table: I, 70 | transition_table: T, 71 | alphabet: TransducerAlphabet, 72 | _file: std::marker::PhantomData, 73 | } 74 | 75 | macro_rules! 
error { 76 | ($path:path, $name:expr_2021) => { 77 | TransducerError::Io(std::io::Error::new( 78 | std::io::ErrorKind::NotFound, 79 | format!( 80 | "`{}` not found in transducer path, looked for {}", 81 | $name, 82 | $path.join($name).display() 83 | ), 84 | )) 85 | }; 86 | } 87 | 88 | impl Transducer for ThfstTransducer 89 | where 90 | I: crate::transducer::IndexTable, 91 | T: crate::transducer::TransitionTable, 92 | F: vfs::File, 93 | { 94 | const FILE_EXT: &'static str = "thfst"; 95 | 96 | fn from_path(fs: &FS, path: P) -> Result 97 | where 98 | P: AsRef, 99 | FS: Filesystem, 100 | { 101 | let path = path.as_ref(); 102 | let alphabet_file = fs 103 | .open_file(&path.join("alphabet")) 104 | .map_err(|_| error!(path, "alphabet"))?; 105 | 106 | let alphabet: TransducerAlphabet = serde_json::from_reader(alphabet_file) 107 | .map_err(|e| TransducerError::Alphabet(Box::new(e)))?; 108 | 109 | let index_table = 110 | I::from_path(fs, path.join("index")).map_err(|_| error!(path, "index"))?; 111 | let transition_table = 112 | T::from_path(fs, path.join("transition")).map_err(|_| error!(path, "transition"))?; 113 | 114 | Ok(ThfstTransducer { 115 | index_table, 116 | transition_table, 117 | alphabet, 118 | _file: std::marker::PhantomData::, 119 | }) 120 | } 121 | 122 | #[inline(always)] 123 | fn is_final(&self, i: TransitionTableIndex) -> bool { 124 | if i >= TARGET_TABLE { 125 | self.transition_table.is_final(i - TARGET_TABLE) 126 | } else { 127 | self.index_table.is_final(i) 128 | } 129 | } 130 | 131 | #[inline(always)] 132 | fn final_weight(&self, i: TransitionTableIndex) -> Option { 133 | if i >= TARGET_TABLE { 134 | self.transition_table.weight(i - TARGET_TABLE) 135 | } else { 136 | self.index_table.final_weight(i) 137 | } 138 | } 139 | 140 | #[inline(always)] 141 | fn has_transitions(&self, i: TransitionTableIndex, s: Option) -> bool { 142 | let sym = match s { 143 | Some(v) => v, 144 | None => return false, 145 | }; 146 | 147 | if i >= TARGET_TABLE { 148 | match 
self.transition_table.input_symbol(i - TARGET_TABLE) { 149 | Some(res) => sym == res, 150 | None => false, 151 | } 152 | } else { 153 | match self 154 | .index_table 155 | .input_symbol(i + TransitionTableIndex(sym.0 as u32)) 156 | { 157 | Some(res) => sym == res, 158 | None => false, 159 | } 160 | } 161 | } 162 | 163 | #[inline(always)] 164 | fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool { 165 | if i >= TARGET_TABLE { 166 | match self.transition_table.input_symbol(i - TARGET_TABLE) { 167 | Some(sym) => sym == SymbolNumber::ZERO || self.alphabet.is_flag(sym), 168 | None => false, 169 | } 170 | } else if let Some(SymbolNumber::ZERO) = self.index_table.input_symbol(i) { 171 | true 172 | } else { 173 | false 174 | } 175 | } 176 | 177 | #[inline(always)] 178 | fn take_epsilons(&self, i: TransitionTableIndex) -> Option { 179 | if let Some(SymbolNumber::ZERO) = self.transition_table.input_symbol(i) { 180 | Some(self.transition_table.symbol_transition(i)) 181 | } else { 182 | None 183 | } 184 | } 185 | 186 | #[inline(always)] 187 | fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option { 188 | if let Some(sym) = self.transition_table.input_symbol(i) { 189 | if sym != SymbolNumber::ZERO && !self.alphabet.is_flag(sym) { 190 | None 191 | } else { 192 | Some(self.transition_table.symbol_transition(i)) 193 | } 194 | } else { 195 | None 196 | } 197 | } 198 | 199 | #[inline(always)] 200 | fn take_non_epsilons( 201 | &self, 202 | i: TransitionTableIndex, 203 | symbol: SymbolNumber, 204 | ) -> Option { 205 | if let Some(input_sym) = self.transition_table.input_symbol(i) { 206 | if input_sym != symbol { 207 | None 208 | } else { 209 | Some(self.transition_table.symbol_transition(i)) 210 | } 211 | } else { 212 | None 213 | } 214 | } 215 | 216 | #[inline(always)] 217 | fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option { 218 | if i >= TARGET_TABLE { 219 | Some(i - TARGET_TABLE + TransitionTableIndex(1)) 220 | } else if let Some(v) 
= self 221 | .index_table 222 | .target(i + TransitionTableIndex(symbol.0 as u32 + 1)) 223 | { 224 | Some(v - TARGET_TABLE) 225 | } else { 226 | None 227 | } 228 | } 229 | 230 | #[inline(always)] 231 | fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option { 232 | self.transition_table.input_symbol(i) 233 | } 234 | 235 | #[inline(always)] 236 | fn alphabet(&self) -> &TransducerAlphabet { 237 | &self.alphabet 238 | } 239 | 240 | #[inline(always)] 241 | fn mut_alphabet(&mut self) -> &mut TransducerAlphabet { 242 | &mut self.alphabet 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /src/types.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | fmt::Display, 3 | ops::{Add, Div, Mul, Sub}, 4 | }; 5 | 6 | use serde::{Deserialize, Serialize}; 7 | 8 | /// Flag diacritic operator for morphological constraints. 9 | /// 10 | /// Flag diacritics are used in finite-state morphology to enforce complex 11 | /// constraints during analysis and generation. 
12 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] 13 | pub enum FlagDiacriticOperator { 14 | /// Positive set - sets a feature to a value 15 | PositiveSet, 16 | /// Negative set - sets a feature to disallowed 17 | NegativeSet, 18 | /// Require - requires a feature to have a value 19 | Require, 20 | /// Disallow - requires a feature to not have a value 21 | Disallow, 22 | /// Clear - clears a feature value 23 | Clear, 24 | /// Unification - unifies feature values 25 | Unification, 26 | } 27 | 28 | impl std::str::FromStr for FlagDiacriticOperator { 29 | type Err = (); 30 | 31 | fn from_str(s: &str) -> Result { 32 | match s { 33 | "P" => Ok(FlagDiacriticOperator::PositiveSet), 34 | "N" => Ok(FlagDiacriticOperator::NegativeSet), 35 | "R" => Ok(FlagDiacriticOperator::Require), 36 | "D" => Ok(FlagDiacriticOperator::Disallow), 37 | "C" => Ok(FlagDiacriticOperator::Clear), 38 | "U" => Ok(FlagDiacriticOperator::Unification), 39 | _ => Err(()), 40 | } 41 | } 42 | } 43 | 44 | /// Transducer header property flags. 45 | /// 46 | /// These flags describe properties of the finite-state transducer. 47 | #[derive(Debug)] 48 | pub enum HeaderFlag { 49 | /// Transducer has weighted transitions 50 | Weighted, 51 | /// Transducer is deterministic 52 | Deterministic, 53 | /// Input side is deterministic 54 | InputDeterministic, 55 | /// Transducer is minimized 56 | Minimized, 57 | /// Transducer contains cycles 58 | Cyclic, 59 | /// Has epsilon-epsilon transitions 60 | HasEpsilonEpsilonTransitions, 61 | /// Has input epsilon transitions 62 | HasInputEpsilonTransitions, 63 | /// Has input epsilon cycles 64 | HasInputEpsilonCycles, 65 | /// Has unweighted input epsilon cycles 66 | HasUnweightedInputEpsilonCycles, 67 | } 68 | 69 | /// A flag diacritic operation in a finite-state transducer. 70 | /// 71 | /// Combines an operation, feature, and value to enforce morphological constraints. 
72 | #[derive(Debug, Serialize, Deserialize)] 73 | pub struct FlagDiacriticOperation { 74 | /// The operation to perform 75 | pub operation: FlagDiacriticOperator, 76 | /// The feature being operated on 77 | pub feature: SymbolNumber, 78 | /// The value for the feature 79 | pub value: ValueNumber, 80 | } 81 | 82 | /// Symbol number in a transducer alphabet. 83 | /// 84 | /// Represents an index into the symbol table of a finite-state transducer. 85 | /// Symbol 0 is typically epsilon (empty string). 86 | #[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] 87 | #[repr(transparent)] 88 | #[serde(transparent)] 89 | pub struct SymbolNumber(pub u16); 90 | 91 | impl SymbolNumber { 92 | pub(crate) const ZERO: Self = SymbolNumber(0); 93 | pub(crate) const MAX: Self = SymbolNumber(u16::MAX); 94 | 95 | #[inline(always)] 96 | pub(crate) fn incr(&self) -> Self { 97 | Self(self.0 + 1) 98 | } 99 | } 100 | 101 | /// Value number for flag diacritics. 102 | /// 103 | /// Represents the value assigned to a feature in flag diacritic operations. 104 | #[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] 105 | #[repr(transparent)] 106 | #[serde(transparent)] 107 | pub struct ValueNumber(pub i16); 108 | 109 | impl ValueNumber { 110 | /// Zero value constant 111 | pub const ZERO: Self = ValueNumber(0); 112 | 113 | #[inline(always)] 114 | pub(crate) fn invert(&self) -> Self { 115 | ValueNumber(-self.0) 116 | } 117 | 118 | #[inline(always)] 119 | pub(crate) fn incr(&self) -> Self { 120 | ValueNumber(self.0 + 1) 121 | } 122 | } 123 | 124 | /// Index into the input string being processed. 
125 | #[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] 126 | #[repr(transparent)] 127 | #[serde(transparent)] 128 | pub struct InputIndex(pub u32); 129 | 130 | impl InputIndex { 131 | #[inline(always)] 132 | pub(crate) fn incr(&self, val: u32) -> Self { 133 | Self(self.0 + val) 134 | } 135 | } 136 | 137 | /// Index into a transducer's transition table. 138 | /// 139 | /// Identifies a specific state or transition in the finite-state transducer. 140 | #[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord, Hash)] 141 | #[repr(transparent)] 142 | #[serde(transparent)] 143 | pub struct TransitionTableIndex(pub u32); 144 | 145 | impl Display for TransitionTableIndex { 146 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 147 | write!(f, "{}", self.0) 148 | } 149 | } 150 | 151 | impl Add for TransitionTableIndex { 152 | type Output = Self; 153 | 154 | fn add(self, rhs: Self) -> Self::Output { 155 | TransitionTableIndex(self.0 + rhs.0) 156 | } 157 | } 158 | 159 | impl Sub for TransitionTableIndex { 160 | type Output = Self; 161 | 162 | fn sub(self, rhs: Self) -> Self::Output { 163 | TransitionTableIndex(self.0 - rhs.0) 164 | } 165 | } 166 | 167 | impl Mul for TransitionTableIndex { 168 | type Output = Self; 169 | 170 | fn mul(self, rhs: Self) -> Self::Output { 171 | TransitionTableIndex(self.0 * rhs.0) 172 | } 173 | } 174 | 175 | impl Div for TransitionTableIndex { 176 | type Output = Self; 177 | 178 | fn div(self, rhs: Self) -> Self::Output { 179 | TransitionTableIndex(self.0 / rhs.0) 180 | } 181 | } 182 | 183 | impl TransitionTableIndex { 184 | pub(crate) const MAX: Self = TransitionTableIndex(u32::MAX); 185 | pub(crate) const ONE: Self = TransitionTableIndex(1); 186 | 187 | #[inline(always)] 188 | pub(crate) fn incr(&self) -> Self { 189 | Self(self.0 + 1) 190 | } 191 | } 192 | 193 | /// Weight (cost) of a transducer transition. 
194 | /// 195 | /// Lower weights represent more preferred paths through the FST. 196 | /// Used for ranking spelling suggestions and morphological analyses. 197 | #[derive(Clone, Copy, Debug, Serialize, Deserialize)] 198 | #[repr(transparent)] 199 | #[serde(transparent)] 200 | pub struct Weight(pub f32); 201 | 202 | impl PartialEq for Weight { 203 | fn eq(&self, other: &Self) -> bool { 204 | self.0.eq(&other.0) 205 | } 206 | } 207 | 208 | impl Eq for Weight {} 209 | 210 | impl PartialOrd for Weight { 211 | fn partial_cmp(&self, other: &Self) -> Option { 212 | Some(self.0.total_cmp(&other.0)) 213 | } 214 | } 215 | 216 | impl Ord for Weight { 217 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 218 | self.0.total_cmp(&other.0) 219 | } 220 | } 221 | 222 | impl Weight { 223 | /// Zero weight (no cost) 224 | pub const ZERO: Self = Weight(0.0); 225 | /// Maximum finite weight 226 | pub const MAX: Self = Weight(f32::MAX); 227 | /// Infinite weight (blocked path) 228 | pub const INFINITE: Self = Weight(f32::INFINITY); 229 | } 230 | 231 | impl Display for Weight { 232 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 233 | write!(f, "{}", self.0) 234 | } 235 | } 236 | 237 | impl Add for Weight { 238 | type Output = Self; 239 | 240 | fn add(self, rhs: Self) -> Self::Output { 241 | Weight(self.0 + rhs.0) 242 | } 243 | } 244 | 245 | impl Sub for Weight { 246 | type Output = Self; 247 | 248 | fn sub(self, rhs: Self) -> Self::Output { 249 | Weight(self.0 - rhs.0) 250 | } 251 | } 252 | 253 | impl Mul for Weight { 254 | type Output = Self; 255 | 256 | fn mul(self, rhs: Self) -> Self::Output { 257 | Weight(self.0 * rhs.0) 258 | } 259 | } 260 | 261 | impl Div for Weight { 262 | type Output = Self; 263 | 264 | fn div(self, rhs: Self) -> Self::Output { 265 | Weight(self.0 / rhs.0) 266 | } 267 | } 268 | 269 | /// State vector for flag diacritics during FST traversal. 
270 | pub type FlagDiacriticState = Vec; 271 | 272 | /// Map from symbol numbers to their flag diacritic operations. 273 | pub type OperationsMap = hashbrown::HashMap; 274 | -------------------------------------------------------------------------------- /src/archive/meta.rs: -------------------------------------------------------------------------------- 1 | //! Data structures of speller metadata. 2 | //! 3 | //! These are usually read from the speller archives, in XML or JSON files or 4 | //! such. 5 | use serde::{Deserialize, Serialize}; 6 | use serde_xml_rs::{Error, ParserConfig, from_reader}; 7 | 8 | /// Speller metadata 9 | #[derive(Serialize, Deserialize, Debug, Clone)] 10 | pub struct SpellerMetadata { 11 | /// speller info 12 | info: SpellerMetadataInfo, 13 | /// acceptor metadata 14 | acceptor: SpellerMetadataAcceptor, 15 | /// error model metadata 16 | errmodel: SpellerMetadataErrmodel, 17 | } 18 | 19 | impl SpellerMetadata { 20 | /// Get the speller information 21 | pub fn info(&self) -> &SpellerMetadataInfo { 22 | &self.info 23 | } 24 | 25 | /// Get the acceptor metadata 26 | pub fn acceptor(&self) -> &SpellerMetadataAcceptor { 27 | &self.acceptor 28 | } 29 | 30 | /// Get the error model metadata 31 | pub fn errmodel(&self) -> &SpellerMetadataErrmodel { 32 | &self.errmodel 33 | } 34 | 35 | /// Get mutable reference to acceptor metadata 36 | /// 37 | /// # Warning 38 | /// This method is only for internal tooling use and should not be used in normal applications. 39 | /// It may be removed in a future version. 40 | #[doc(hidden)] 41 | pub fn acceptor_mut(&mut self) -> &mut SpellerMetadataAcceptor { 42 | &mut self.acceptor 43 | } 44 | 45 | /// Get mutable reference to error model metadata 46 | /// 47 | /// # Warning 48 | /// This method is only for internal tooling use and should not be used in normal applications. 49 | /// It may be removed in a future version. 
50 | #[doc(hidden)] 51 | pub fn errmodel_mut(&mut self) -> &mut SpellerMetadataErrmodel { 52 | &mut self.errmodel 53 | } 54 | } 55 | 56 | /// localised speller title 57 | #[derive(Serialize, Deserialize, Debug, Clone)] 58 | pub struct SpellerTitle { 59 | /// ISO 639 code of the title's content language 60 | pub lang: Option, 61 | /// translated title 62 | #[serde(rename = "$value")] 63 | pub value: String, 64 | } 65 | 66 | /// Speller metadata 67 | #[derive(Serialize, Deserialize, Debug, Clone)] 68 | pub struct SpellerMetadataInfo { 69 | /// ISO-639 code of speller language 70 | locale: String, 71 | /// localised, human readable titles of speller 72 | title: Vec, 73 | /// human readable description of speller 74 | description: String, 75 | /// creator and copyright owner of the speller 76 | producer: String, 77 | } 78 | 79 | impl SpellerMetadataInfo { 80 | /// Get the ISO-639 locale code 81 | pub fn locale(&self) -> &str { 82 | &self.locale 83 | } 84 | 85 | /// Get the localized titles 86 | pub fn title(&self) -> &[SpellerTitle] { 87 | &self.title 88 | } 89 | 90 | /// Get the description 91 | pub fn description(&self) -> &str { 92 | &self.description 93 | } 94 | 95 | /// Get the producer/creator 96 | pub fn producer(&self) -> &str { 97 | &self.producer 98 | } 99 | } 100 | 101 | /// Acceptor metadata 102 | #[derive(Serialize, Deserialize, Debug, Clone)] 103 | pub struct SpellerMetadataAcceptor { 104 | /// acceptor type: 105 | /// - `blah` if normal dictionary automaton 106 | /// - `foo` if analyzer 107 | #[serde(rename = "type", default)] 108 | type_: String, 109 | /// locally unique id for this acceptor 110 | id: String, 111 | /// localised human readable titles of speller 112 | title: Vec, 113 | /// human readable description of the acceptor 114 | description: String, 115 | /// marker for incomplete strings 116 | continuation: Option, 117 | } 118 | 119 | impl SpellerMetadataAcceptor { 120 | /// Get the acceptor type 121 | pub fn type_(&self) -> &str { 122 | 
&self.type_ 123 | } 124 | 125 | /// Get the acceptor ID 126 | pub fn id(&self) -> &str { 127 | &self.id 128 | } 129 | 130 | /// Get the localized titles 131 | pub fn title(&self) -> &[SpellerTitle] { 132 | &self.title 133 | } 134 | 135 | /// Get the description 136 | pub fn description(&self) -> &str { 137 | &self.description 138 | } 139 | 140 | /// Get the continuation marker for incomplete strings 141 | pub fn continuation(&self) -> Option<&str> { 142 | self.continuation.as_deref() 143 | } 144 | 145 | /// Set the acceptor ID 146 | /// 147 | /// # Warning 148 | /// This method is only for internal tooling use and should not be used in normal applications. 149 | /// It may be removed in a future version. 150 | #[doc(hidden)] 151 | pub fn set_id(&mut self, id: String) { 152 | self.id = id; 153 | } 154 | } 155 | 156 | /// Error model metadata 157 | #[derive(Serialize, Deserialize, Debug, Clone)] 158 | pub struct SpellerMetadataErrmodel { 159 | /// locally unique id for the error model 160 | id: String, 161 | /// localised human readable titles for the error model 162 | title: Vec, 163 | /// human readable description of the error model 164 | description: String, 165 | } 166 | 167 | impl SpellerMetadataErrmodel { 168 | /// Get the error model ID 169 | pub fn id(&self) -> &str { 170 | &self.id 171 | } 172 | 173 | /// Get the localized titles 174 | pub fn title(&self) -> &[SpellerTitle] { 175 | &self.title 176 | } 177 | 178 | /// Get the description 179 | pub fn description(&self) -> &str { 180 | &self.description 181 | } 182 | 183 | /// Set the error model ID 184 | /// 185 | /// # Warning 186 | /// This method is only for internal tooling use and should not be used in normal applications. 187 | /// It may be removed in a future version. 
188 | #[doc(hidden)] 189 | pub fn set_id(&mut self, id: String) { 190 | self.id = id; 191 | } 192 | } 193 | 194 | impl std::str::FromStr for SpellerMetadata { 195 | type Err = Error; 196 | 197 | fn from_str(string: &str) -> Result { 198 | SpellerMetadata::from_bytes(string.as_bytes()) 199 | } 200 | } 201 | 202 | impl SpellerMetadata { 203 | /// Parse speller metadata from XML bytes. 204 | /// 205 | /// The XML format follows the HFST speller specification with ``, 206 | /// ``, and `` elements containing metadata about 207 | /// the spell-checker and its component transducers. 208 | /// 209 | /// # Errors 210 | /// 211 | /// Returns an error if the XML is malformed or doesn't match the expected schema. 212 | pub fn from_bytes(bytes: &[u8]) -> Result { 213 | let mut reader = ParserConfig::new() 214 | .trim_whitespace(true) 215 | .ignore_comments(true) 216 | .coalesce_characters(true) 217 | .create_reader(bytes) 218 | .into_inner(); 219 | 220 | from_reader(&mut reader) 221 | } 222 | } 223 | 224 | #[test] 225 | fn test_xml_parse() { 226 | use std::str::FromStr; 227 | 228 | let xml_data = r##" 229 | 230 | 231 | 232 | se 233 | Giellatekno/Divvun/UiT fst-based speller for Northern Sami 234 | This is an fst-based speller for Northern Sami. It is based 235 | on the normative subset of the morphological analyzer for Northern Sami. 236 | The source code can be found at: 237 | https://victorio.uit.no/langtech/trunk/langs/sme/ 238 | License: GPL3+. 239 | GT_VERSION 240 | DATE 241 | Giellatekno/Divvun/UiT contributors 242 | 243 | 244 | 245 | Giellatekno/Divvun/UiT dictionary Northern Sami 246 | Giellatekno/Divvun/UiT dictionary for 247 | Northern Sami compiled for HFST. 248 | 249 | 250 | Levenshtein edit distance transducer 251 | Correction model for keyboard misstrokes, at most 2 per 252 | word. 
253 | 254 | errormodel.default.hfst 255 | 256 | 257 | "##; 258 | 259 | let s = SpellerMetadata::from_str(&xml_data).unwrap(); 260 | println!("{:#?}", s); 261 | } 262 | -------------------------------------------------------------------------------- /src/tokenizer/mod.rs: -------------------------------------------------------------------------------- 1 | //! Tokenizer splits strings into words and punctuations. 2 | use unic_ucd_common::alphanumeric::is_alphanumeric; 3 | use word::{WordBoundIndices, Words}; 4 | 5 | pub(crate) mod case_handling; 6 | pub mod word; 7 | mod word_break; 8 | 9 | /// Iterator over word indices in a string, filtering out non-alphanumeric tokens. 10 | /// 11 | /// Returns tuples of (byte_offset, word_str) for each word containing at least 12 | /// one alphanumeric character. 13 | pub struct WordIndices<'a> { 14 | iter: WordBoundIndices<'a>, 15 | } 16 | 17 | impl<'a> Iterator for WordIndices<'a> { 18 | type Item = (usize, &'a str); 19 | 20 | fn next(&mut self) -> Option { 21 | while let Some(item) = self.iter.next() { 22 | if item.1.chars().any(is_alphanumeric) { 23 | return Some(item); 24 | } 25 | } 26 | 27 | None 28 | } 29 | } 30 | 31 | /// Trait for tokenizing strings into words. 32 | /// 33 | /// Provides methods to split text into words with various options for 34 | /// alphabet customization and boundary detection. 35 | pub trait Tokenize { 36 | /// Get an iterator over word boundaries with byte indices. 37 | fn word_bound_indices(&self) -> WordBoundIndices<'_>; 38 | 39 | /// Get an iterator over words with byte indices (alphanumeric words only). 40 | fn word_indices(&self) -> WordIndices<'_>; 41 | 42 | /// Get word boundaries using a custom alphabet. 43 | fn word_bound_indices_with_alphabet(&self, alphabet: Vec) -> WordBoundIndices<'_>; 44 | 45 | /// Get words using a custom alphabet. 
46 | fn words_with_alphabet(&self, alphabet: Vec) -> Words<'_>; 47 | } 48 | 49 | impl Tokenize for str { 50 | fn word_bound_indices(&self) -> WordBoundIndices<'_> { 51 | WordBoundIndices::new(self) 52 | } 53 | 54 | fn word_indices(&self) -> WordIndices<'_> { 55 | WordIndices { 56 | iter: WordBoundIndices::new(self), 57 | } 58 | } 59 | 60 | fn word_bound_indices_with_alphabet(&self, alphabet: Vec) -> WordBoundIndices<'_> { 61 | WordBoundIndices::new_with_alphabet(self, alphabet) 62 | } 63 | 64 | fn words_with_alphabet(&self, alphabet: Vec) -> Words<'_> { 65 | Words::new_with_alphabet(self, |s| s.chars().any(|ch| ch.is_alphanumeric()), alphabet) 66 | } 67 | } 68 | 69 | /// A word with its byte offset in the original string. 70 | pub struct IndexedWord { 71 | /// Byte offset of the word in the original string 72 | pub index: usize, 73 | /// The word text 74 | pub word: String, 75 | } 76 | 77 | /// Context information for a word, including surrounding words. 78 | /// 79 | /// Useful for context-sensitive spell-checking and analysis. 80 | #[derive(Debug, Clone)] 81 | pub struct WordContext { 82 | /// The current word (byte_offset, text) 83 | pub current: (usize, String), 84 | /// The word immediately before, if any 85 | pub first_before: Option<(usize, String)>, 86 | /// The second word before, if any 87 | pub second_before: Option<(usize, String)>, 88 | /// The word immediately after, if any 89 | pub first_after: Option<(usize, String)>, 90 | /// The second word after, if any 91 | pub second_after: Option<(usize, String)>, 92 | } 93 | 94 | #[cfg(feature = "internal_ffi")] 95 | impl crate::ffi::fbs::IntoFlatbuffer for WordContext { 96 | fn into_flatbuffer<'a>(self) -> Vec { 97 | use crate::ffi::fbs::tokenizer::*; 98 | 99 | macro_rules! 
add_indexed_word { 100 | ($fbb:expr, $data:expr) => {{ 101 | use $crate::ffi::fbs::tokenizer::*; 102 | 103 | if let Some((index, word)) = $data { 104 | let s = $fbb.create_string(&word); 105 | Some(IndexedWord::create( 106 | &mut $fbb, 107 | &IndexedWordArgs { 108 | index: index as u64, 109 | value: Some(s), 110 | }, 111 | )) 112 | } else { 113 | None 114 | } 115 | }}; 116 | } 117 | 118 | let mut builder = flatbuffers::FlatBufferBuilder::with_capacity(1024); 119 | let current = add_indexed_word!(builder, Some(self.current)); 120 | let first_before = add_indexed_word!(builder, self.first_before); 121 | let second_before = add_indexed_word!(builder, self.second_before); 122 | let first_after = add_indexed_word!(builder, self.first_after); 123 | let second_after = add_indexed_word!(builder, self.second_after); 124 | let word_context = WordContext::create( 125 | &mut builder, 126 | &WordContextArgs { 127 | current, 128 | first_before, 129 | second_before, 130 | first_after, 131 | second_after, 132 | }, 133 | ); 134 | builder.finish(word_context, None); 135 | builder.finished_data().to_vec() 136 | } 137 | } 138 | 139 | /// Extract word context around a cursor position. 140 | /// 141 | /// Given text split at a cursor position (first_half, second_half), 142 | /// returns the word at the cursor and up to 2 words before/after. 
143 | /// 144 | /// # Example 145 | /// ```ignore 146 | /// let context = cursor_context("hello wo", "rld goodbye"); 147 | /// // context.current would be ("hello ".len(), "world") 148 | /// ``` 149 | pub fn cursor_context(first_half: &str, second_half: &str) -> WordContext { 150 | // Find the point in the first half where the first "word" happens 151 | let mut first_half_iter = first_half.word_bound_indices().rev(); 152 | let mut second_half_iter = second_half.word_bound_indices(); 153 | 154 | let current = { 155 | let first_half_last_item = match first_half_iter.next() { 156 | Some(v) if v.1.chars().any(is_alphanumeric) => v, 157 | _ => (0, ""), 158 | }; 159 | 160 | let second_half_first_item = match second_half_iter.next() { 161 | Some(v) if v.1.chars().any(is_alphanumeric) => v, 162 | _ => (0, ""), 163 | }; 164 | 165 | let first_word = format!("{}{}", first_half_last_item.1, second_half_first_item.1); 166 | let first_index = if first_half_last_item.1 == "" { 167 | first_half.len() + second_half_first_item.0 168 | } else { 169 | first_half_last_item.0 170 | }; 171 | 172 | (first_index, first_word) 173 | }; 174 | 175 | let mut first_half_iter = first_half_iter 176 | .filter(|x| x.1.chars().any(is_alphanumeric)) 177 | .map(|x| (x.0, x.1.to_string())); 178 | let mut second_half_iter = second_half_iter 179 | .filter(|x| x.1.chars().any(is_alphanumeric)) 180 | .map(|x| (x.0, x.1.to_string())); 181 | 182 | WordContext { 183 | current, 184 | first_before: first_half_iter.next(), 185 | second_before: first_half_iter.next(), 186 | first_after: second_half_iter.next(), 187 | second_after: second_half_iter.next(), 188 | } 189 | } 190 | 191 | #[cfg(test)] 192 | mod tests { 193 | use super::*; 194 | 195 | #[test] 196 | fn rev() { 197 | let msg = "this is life"; 198 | assert_eq!( 199 | msg.word_bound_indices() 200 | .rev() 201 | .collect::>(), 202 | vec![(8, "life"), (7, " "), (5, "is"), (4, " "), (0, "this")] 203 | ); 204 | } 205 | 206 | #[test] 207 | fn basic() { 208 | let 
msg = "this is an ordinary sentence! \"This was quoted,\", an emoji: (😄), and\t a tab was there and a new line.\n Some extreme unicode; bismala: (﷽), in long form: بِسْمِ اللهِ الرَّحْمٰنِ الرَّحِيْمِ."; 209 | msg.word_bound_indices().for_each(|t| println!("{:?}", t)); 210 | println!("{}", &msg); 211 | } 212 | 213 | #[test] 214 | fn word_bounds_alphabet() { 215 | let msg = "this is an ordinary-sentence! \"This was quoted,\", an emoji: (😄), and\t a"; 216 | 217 | assert_eq!( 218 | msg.word_bound_indices_with_alphabet("abcdefghijklmnopqr-stuvwxyz".chars().collect()) 219 | .collect::>(), 220 | vec![ 221 | (0, "this"), 222 | (4, " "), 223 | (5, "is"), 224 | (7, " "), 225 | (8, "an"), 226 | (10, " "), 227 | (11, "ordinary-sentence"), 228 | (28, "!"), 229 | (29, " "), 230 | (30, "\""), 231 | (31, "This"), 232 | (35, " "), 233 | (36, "was"), 234 | (39, " "), 235 | (40, "quoted"), 236 | (46, ","), 237 | (47, "\""), 238 | (48, ","), 239 | (49, " "), 240 | (50, "an"), 241 | (52, " "), 242 | (53, "emoji"), 243 | (58, ":"), 244 | (59, " "), 245 | (60, "("), 246 | (61, "😄"), 247 | (65, ")"), 248 | (66, ","), 249 | (67, " "), 250 | (68, "and"), 251 | (71, "\t"), 252 | (72, " "), 253 | (73, "a") 254 | ] 255 | ); 256 | } 257 | } 258 | -------------------------------------------------------------------------------- /src/transducer/tree_node.rs: -------------------------------------------------------------------------------- 1 | use lifeguard::{Pool, Recycled}; 2 | use std::cmp::Ordering; 3 | use std::hash::{Hash, Hasher}; 4 | 5 | use super::symbol_transition::SymbolTransition; 6 | use crate::types::{ 7 | FlagDiacriticOperation, FlagDiacriticOperator, FlagDiacriticState, InputIndex, SymbolNumber, 8 | TransitionTableIndex, ValueNumber, Weight, 9 | }; 10 | 11 | #[derive(Debug, Clone)] 12 | pub(crate) struct TreeNode { 13 | pub(crate) lexicon_state: TransitionTableIndex, 14 | pub(crate) mutator_state: TransitionTableIndex, 15 | pub(crate) input_state: InputIndex, 16 | pub(crate) weight: 
Weight,
    pub(crate) flag_state: FlagDiacriticState,
    pub(crate) string: Vec<SymbolNumber>,
}

impl std::cmp::PartialEq for TreeNode {
    /// Full structural equality over all six fields.
    fn eq(&self, other: &TreeNode) -> bool {
        self.lexicon_state == other.lexicon_state
            && self.mutator_state == other.mutator_state
            && self.input_state == other.input_state
            && self.weight == other.weight
            && self.flag_state == other.flag_state
            && self.string == other.string
    }
}

impl std::cmp::Ord for TreeNode {
    /// Orders nodes by weight first (so search frontiers pop the cheapest
    /// node first), breaking ties on the output string for determinism.
    // `weight` is a float wrapper, so an explicit two-branch comparison is
    // used instead of `partial_cmp`; the clippy lint is knowingly silenced.
    #[allow(clippy::comparison_chain)]
    fn cmp(&self, other: &Self) -> Ordering {
        if self.weight < other.weight {
            Ordering::Less
        } else if self.weight > other.weight {
            Ordering::Greater
        } else {
            self.string.cmp(&other.string)
        }
    }
}

impl std::cmp::PartialOrd for TreeNode {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl std::cmp::Eq for TreeNode {}

impl Hash for TreeNode {
    // NOTE(review): only the three state indices feed the hash, while `eq`
    // also compares weight/flags/string. That is legal (equal values still
    // hash equally) and presumably intentional — nodes sharing the same
    // (input, mutator, lexicon) state bucket together — but confirm against
    // the call sites that use hashed collections of TreeNode.
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.input_state.hash(state);
        self.mutator_state.hash(state);
        self.lexicon_state.hash(state);
    }
}

impl lifeguard::Recycleable for TreeNode {
    /// Constructs the blank node stored in the pool before first use.
    fn new() -> Self {
        TreeNode {
            string: Vec::with_capacity(1),
            input_state: InputIndex(0),
            mutator_state: TransitionTableIndex(0),
            lexicon_state: TransitionTableIndex(0),
            flag_state: vec![],
            weight: Weight(0.0),
        }
    }

    /// Deliberately a no-op: recycled nodes keep their buffers so that
    /// `initialize_with`/`update_*` can reuse the allocations. Every field
    /// is overwritten by the initializer before the node is observable.
    fn reset(&mut self) {
        // There is nothing done to reset it.
        // Implementers must reset any fields where used!
    }
}

impl lifeguard::InitializeWith<&TreeNode> for TreeNode {
    /// Copies `source` into a recycled node, skipping buffer copies when the
    /// recycled node already holds identical contents.
    fn initialize_with(&mut self, source: &TreeNode) {
        if self.string != source.string {
            self.string.truncate(0);
            self.string.extend(&source.string);
        }

        self.input_state = source.input_state;
        self.mutator_state = source.mutator_state;
        self.lexicon_state = source.lexicon_state;

        if self.flag_state != source.flag_state {
            self.flag_state.truncate(0);
            // `&Vec<_>` deref-coerces to `&[_]`; the former `.as_slice()`
            // plus extra borrow was redundant.
            self.flag_state.extend_from_slice(&source.flag_state);
        }

        self.weight = source.weight;
    }
}

impl TreeNode {
    /// Returns a pooled root node: zeroed indices, empty output string, and
    /// the given initial flag-diacritic state.
    #[inline(always)]
    pub fn empty<'a>(
        pool: &'a Pool<TreeNode>,
        start_state: FlagDiacriticState,
    ) -> Recycled<'a, TreeNode> {
        pool.attach(TreeNode {
            string: vec![],
            input_state: InputIndex(0),
            mutator_state: TransitionTableIndex(0),
            lexicon_state: TransitionTableIndex(0),
            flag_state: start_state,
            weight: Weight(0.0),
        })
    }

    /// Accumulated weight of this node.
    #[inline(always)]
    pub fn weight(&self) -> Weight {
        self.weight
    }

    /// Advances the lexicon state along `transition`, appending its output
    /// symbol (if non-epsilon) and adding its weight.
    ///
    /// Panics if the transition has no target or weight (callers must only
    /// pass transitions with both present).
    #[inline(always)]
    pub fn update_lexicon<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        transition: SymbolTransition,
    ) -> Recycled<'a, TreeNode> {
        let mut node = pool.new();

        if node.string != self.string {
            node.string.truncate(0);
            node.string.extend(&self.string);
        }

        if let Some(value) = transition.symbol() {
            // Symbol 0 is epsilon and produces no output.
            if value.0 != 0 {
                node.string.push(value);
            }
        }

        node.input_state = self.input_state;
        node.mutator_state = self.mutator_state;
        node.lexicon_state = transition.target().unwrap();

        if node.flag_state != self.flag_state {
            node.flag_state.truncate(0);
            node.flag_state.extend_from_slice(&self.flag_state);
        }

        node.weight = self.weight + transition.weight().unwrap();

        node
    }

    /// Advances the mutator (error-model) state along `transition` without
    /// emitting output; adds the transition weight.
    #[inline(always)]
    pub fn update_mutator<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        transition: SymbolTransition,
    ) -> Recycled<'a, TreeNode> {
        let mut node = pool.new();
        if node.string != self.string {
            node.string.truncate(0);
            node.string.extend(&self.string);
        }
        node.input_state = self.input_state;
        node.mutator_state = transition.target().unwrap();
        node.lexicon_state = self.lexicon_state;

        if node.flag_state != self.flag_state {
            node.flag_state.truncate(0);
            node.flag_state.extend_from_slice(&self.flag_state);
        }

        node.weight = self.weight + transition.weight().unwrap();
        node
    }

    /// Advances both automata at once: emits `output_symbol` (if non-zero),
    /// moves to the given mutator/lexicon targets, adds `weight`, and
    /// optionally consumes input by advancing to `next_input`.
    #[inline(always)]
    pub fn update<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        output_symbol: SymbolNumber,
        next_input: Option<InputIndex>,
        next_mutator: TransitionTableIndex,
        next_lexicon: TransitionTableIndex,
        weight: Weight,
    ) -> Recycled<'a, TreeNode> {
        let mut node = pool.new();

        if node.string != self.string {
            node.string.truncate(0);
            node.string.extend(&self.string);
        }

        if output_symbol.0 != 0 {
            node.string.push(output_symbol);
        }

        node.mutator_state = next_mutator;
        node.lexicon_state = next_lexicon;

        if node.flag_state != self.flag_state {
            node.flag_state.truncate(0);
            node.flag_state.extend_from_slice(&self.flag_state);
        }

        node.weight = self.weight + weight;

        if let Some(input) = next_input {
            node.input_state = input;
        } else {
            node.input_state = self.input_state;
        }

        node
    }

    /// Applies `transition` and then sets `feature` to `value` in the new
    /// node's flag state. Indexes `flag_state` directly: `feature` must be a
    /// valid flag index for the current alphabet.
    #[inline(always)]
    fn update_flag<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        feature: SymbolNumber,
        value: ValueNumber,
        transition: &SymbolTransition,
    ) -> Recycled<'a, TreeNode> {
        let mut node = self.apply_transition(pool, transition);
        node.flag_state[feature.0 as usize] = value;
        node
    }

    /// Follows `transition` in the lexicon only, leaving input/mutator
    /// state and the output string unchanged; adds the transition weight.
    #[inline(always)]
    pub fn apply_transition<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        transition: &SymbolTransition,
    ) -> Recycled<'a, TreeNode> {
        let mut node = pool.new();

        if node.string != self.string {
            node.string.truncate(0);
            node.string.extend(&self.string);
        }

        node.input_state = self.input_state;
        node.mutator_state = self.mutator_state;
        node.lexicon_state = transition.target().unwrap();

        if node.flag_state != self.flag_state {
            node.flag_state.truncate(0);
            node.flag_state.extend_from_slice(&self.flag_state);
        }

        node.weight = self.weight + transition.weight().unwrap();
        node
    }

    /// Evaluates a flag-diacritic operation against this node's flag state.
    /// Returns the successor node when the operation permits the transition,
    /// or `None` when the flag constraint fails.
    #[inline(always)]
    pub fn apply_operation<'a>(
        &self,
        pool: &'a Pool<TreeNode>,
        op: &FlagDiacriticOperation,
        transition: &SymbolTransition,
    ) -> Option<Recycled<'a, TreeNode>> {
        match op.operation {
            FlagDiacriticOperator::PositiveSet => {
                Some(self.update_flag(pool, op.feature, op.value, transition))
            }
            FlagDiacriticOperator::NegativeSet => {
                Some(self.update_flag(pool, op.feature, op.value.invert(), transition))
            }
            FlagDiacriticOperator::Require => {
                // @R.X@ (value 0) requires the feature to be set at all;
                // @R.X.Y@ requires it to equal Y exactly.
                let res = if op.value.0 == 0 {
                    self.flag_state[op.feature.0 as usize] != ValueNumber(0)
                } else {
                    self.flag_state[op.feature.0 as usize] == op.value
                };

                if res {
                    Some(self.apply_transition(pool, transition))
                } else {
                    None
                }
            }
            FlagDiacriticOperator::Disallow => {
                // Mirror image of Require: passes when the feature is unset
                // (value 0 form) or differs from the given value.
                let res = if op.value.0 == 0 {
                    self.flag_state[op.feature.0 as usize] == ValueNumber(0)
                } else {
                    self.flag_state[op.feature.0 as usize] != op.value
                };

                if res {
                    Some(self.apply_transition(pool, transition))
                } else {
                    None
                }
300 | } 301 | FlagDiacriticOperator::Clear => { 302 | Some(self.update_flag(pool, op.feature, ValueNumber(0), transition)) 303 | } 304 | FlagDiacriticOperator::Unification => { 305 | // if the feature is unset OR the feature is to this value already OR 306 | // the feature is negatively set to something else than this value 307 | let f = self.flag_state[op.feature.0 as usize]; 308 | 309 | if f.0 == 0 || f == op.value || (f.0 < 0 && f.invert() != op.value) { 310 | Some(self.update_flag(pool, op.feature, op.value, transition)) 311 | } else { 312 | None 313 | } 314 | } 315 | } 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /src/transducer/thfst/chunked.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | 3 | use crate::constants::TARGET_TABLE; 4 | use crate::transducer::symbol_transition::SymbolTransition; 5 | use crate::types::{SymbolNumber, TransitionTableIndex, Weight}; 6 | 7 | use super::index_table::MemmapIndexTable; 8 | use super::transition_table::MemmapTransitionTable; 9 | use crate::transducer::{Transducer, TransducerAlphabet, TransducerError}; 10 | use crate::vfs::{self, Filesystem}; 11 | 12 | use crate::transducer::{IndexTable, TransitionTable}; 13 | 14 | /// Tromsø-Helsinki Finite State Transducer format 15 | #[derive(Debug)] 16 | pub struct ThfstChunkedTransducer 17 | where 18 | F: vfs::File, 19 | { 20 | // meta: MetaRecord, 21 | index_tables: Vec>, 22 | indexes_per_chunk: TransitionTableIndex, 23 | transition_tables: Vec>, 24 | transitions_per_chunk: TransitionTableIndex, 25 | alphabet: TransducerAlphabet, 26 | _file: std::marker::PhantomData, 27 | } 28 | 29 | pub type MemmapThfstChunkedTransducer = ThfstChunkedTransducer; 30 | 31 | macro_rules! 
transition_rel_index { 32 | ($self:expr_2021, $x:expr_2021) => {{ 33 | let index_page = $x / $self.transitions_per_chunk; 34 | let relative_index = $x - ($self.transitions_per_chunk * index_page); 35 | (index_page.0 as usize, relative_index) 36 | }}; 37 | } 38 | 39 | macro_rules! index_rel_index { 40 | ($self:expr_2021, $x:expr_2021) => {{ 41 | let index_page = $x / $self.indexes_per_chunk; 42 | let relative_index = $x - ($self.indexes_per_chunk * index_page); 43 | (index_page.0 as usize, relative_index) 44 | }}; 45 | } 46 | 47 | macro_rules! error { 48 | ($path:path, $name:expr_2021) => { 49 | TransducerError::Io(std::io::Error::new( 50 | std::io::ErrorKind::NotFound, 51 | format!( 52 | "`{}` not found in transducer path, looked for {}", 53 | $name, 54 | $path.join($name).display() 55 | ), 56 | )) 57 | }; 58 | } 59 | 60 | impl Transducer for ThfstChunkedTransducer { 61 | const FILE_EXT: &'static str = "thfst"; 62 | 63 | fn from_path(fs: &FS, path: P) -> Result 64 | where 65 | P: AsRef, 66 | FS: Filesystem, 67 | { 68 | let path = path.as_ref(); 69 | let alphabet_file = fs 70 | .open_file(&path.join("alphabet")) 71 | .map_err(|_| error!(path, "alphabet"))?; 72 | 73 | let alphabet: TransducerAlphabet = serde_json::from_reader(alphabet_file) 74 | .map_err(|e| TransducerError::Alphabet(Box::new(e)))?; 75 | 76 | let mut index_chunk_count = 1; 77 | let index_tables; 78 | 79 | loop { 80 | let index_path = path.join("index"); 81 | let indexes = (0..index_chunk_count) 82 | .map(|i| MemmapIndexTable::from_path_partial(fs, &index_path, i, index_chunk_count)) 83 | .collect::, _>>(); 84 | 85 | match indexes { 86 | Ok(v) => { 87 | index_tables = v; 88 | break; 89 | } 90 | Err(TransducerError::Memmap(_)) => { 91 | index_chunk_count *= 2; 92 | 93 | if index_chunk_count > 16 { 94 | return Err(TransducerError::Memmap(std::io::Error::new( 95 | std::io::ErrorKind::Other, 96 | "Could not memory map index table in 16 chunks", 97 | ))); 98 | } 99 | } 100 | Err(e) => return Err(e), 101 | 
} 102 | } 103 | 104 | let mut trans_chunk_count = 1; 105 | let transition_tables; 106 | 107 | loop { 108 | let trans_path = path.join("transition"); 109 | let tables = (0..trans_chunk_count) 110 | .map(|i| { 111 | MemmapTransitionTable::from_path_partial(fs, &trans_path, i, trans_chunk_count) 112 | }) 113 | .collect::, _>>(); 114 | 115 | match tables { 116 | Ok(v) => { 117 | transition_tables = v; 118 | break; 119 | } 120 | Err(TransducerError::Memmap(_)) => { 121 | trans_chunk_count *= 2; 122 | 123 | if trans_chunk_count > 16 { 124 | return Err(TransducerError::Memmap(std::io::Error::new( 125 | std::io::ErrorKind::Other, 126 | "Could not memory map transition table in 16 chunks", 127 | ))); 128 | } 129 | } 130 | Err(e) => return Err(e), 131 | } 132 | } 133 | 134 | let transducer = ThfstChunkedTransducer { 135 | indexes_per_chunk: index_tables[0].size, 136 | transitions_per_chunk: transition_tables[0].size, 137 | index_tables, 138 | transition_tables, 139 | alphabet, 140 | _file: std::marker::PhantomData::, 141 | }; 142 | 143 | tracing::debug!("{:#?}", transducer); 144 | 145 | Ok(transducer) 146 | } 147 | 148 | #[inline(always)] 149 | fn alphabet(&self) -> &TransducerAlphabet { 150 | &self.alphabet 151 | } 152 | 153 | #[inline(always)] 154 | fn mut_alphabet(&mut self) -> &mut TransducerAlphabet { 155 | &mut self.alphabet 156 | } 157 | 158 | #[inline(always)] 159 | fn transition_input_symbol(&self, i: TransitionTableIndex) -> Option { 160 | let (page, index) = transition_rel_index!(self, i); 161 | self.transition_tables[page].input_symbol(index) 162 | } 163 | 164 | #[inline(always)] 165 | fn is_final(&self, i: TransitionTableIndex) -> bool { 166 | if i >= TARGET_TABLE { 167 | let (page, index) = transition_rel_index!(self, i - TARGET_TABLE); 168 | self.transition_tables[page].is_final(index) 169 | } else { 170 | let (page, index) = index_rel_index!(self, i); 171 | self.index_tables[page].is_final(index) 172 | } 173 | } 174 | 175 | #[inline(always)] 176 | fn 
final_weight(&self, i: TransitionTableIndex) -> Option { 177 | if i >= TARGET_TABLE { 178 | let (page, index) = transition_rel_index!(self, i - TARGET_TABLE); 179 | self.transition_tables[page].weight(index) 180 | } else { 181 | let (page, index) = index_rel_index!(self, i); 182 | self.index_tables[page].final_weight(index) 183 | } 184 | } 185 | 186 | #[inline(always)] 187 | fn has_transitions(&self, i: TransitionTableIndex, s: Option) -> bool { 188 | let sym = match s { 189 | Some(v) => v, 190 | None => return false, 191 | }; 192 | 193 | if i >= TARGET_TABLE { 194 | let (page, index) = transition_rel_index!(self, i - TARGET_TABLE); 195 | if page >= self.transition_tables.len() { 196 | return false; 197 | } 198 | match self.transition_tables[page].input_symbol(index) { 199 | Some(res) => sym == res, 200 | None => false, 201 | } 202 | } else { 203 | tracing::trace!("has_transitions: i:{} s:{:?}", i, s); 204 | let (page, index) = index_rel_index!(self, i + TransitionTableIndex(sym.0 as u32)); 205 | tracing::trace!("has_transitions: page:{} index:{:?}", page, index); 206 | if page >= self.index_tables.len() { 207 | return false; 208 | } 209 | match self.index_tables[page].input_symbol(index) { 210 | Some(res) => sym == res, 211 | None => false, 212 | } 213 | } 214 | } 215 | 216 | #[inline(always)] 217 | fn has_epsilons_or_flags(&self, i: TransitionTableIndex) -> bool { 218 | if i >= TARGET_TABLE { 219 | let (page, index) = transition_rel_index!(self, i - TARGET_TABLE); 220 | match self.transition_tables[page].input_symbol(index) { 221 | Some(sym) => sym == SymbolNumber::ZERO || self.alphabet.is_flag(sym), 222 | None => false, 223 | } 224 | } else { 225 | let (page, index) = index_rel_index!(self, i); 226 | if let Some(SymbolNumber::ZERO) = self.index_tables[page].input_symbol(index) { 227 | true 228 | } else { 229 | false 230 | } 231 | } 232 | } 233 | 234 | #[inline(always)] 235 | fn take_epsilons(&self, i: TransitionTableIndex) -> Option { 236 | let (page, index) = 
transition_rel_index!(self, i); 237 | 238 | if let Some(SymbolNumber::ZERO) = self.transition_tables[page].input_symbol(index) { 239 | Some(self.transition_tables[page].symbol_transition(index)) 240 | } else { 241 | None 242 | } 243 | } 244 | 245 | #[inline(always)] 246 | fn take_epsilons_and_flags(&self, i: TransitionTableIndex) -> Option { 247 | let (page, index) = transition_rel_index!(self, i); 248 | 249 | if let Some(sym) = self.transition_tables[page].input_symbol(index) { 250 | if sym != SymbolNumber::ZERO && !self.alphabet.is_flag(sym) { 251 | None 252 | } else { 253 | Some(self.transition_tables[page].symbol_transition(index)) 254 | } 255 | } else { 256 | None 257 | } 258 | } 259 | 260 | #[inline(always)] 261 | fn take_non_epsilons( 262 | &self, 263 | i: TransitionTableIndex, 264 | symbol: SymbolNumber, 265 | ) -> Option { 266 | let (page, index) = transition_rel_index!(self, i); 267 | if let Some(input_sym) = self.transition_tables[page].input_symbol(index) { 268 | if input_sym != symbol { 269 | None 270 | } else { 271 | Some(self.transition_tables[page].symbol_transition(index)) 272 | } 273 | } else { 274 | None 275 | } 276 | } 277 | 278 | #[inline(always)] 279 | fn next(&self, i: TransitionTableIndex, symbol: SymbolNumber) -> Option { 280 | if i >= TARGET_TABLE { 281 | Some(i - TARGET_TABLE + TransitionTableIndex(1)) 282 | } else { 283 | let (page, index) = 284 | index_rel_index!(self, i + TransitionTableIndex(symbol.0 as u32 + 1)); 285 | 286 | if let Some(v) = self.index_tables[page].target(index) { 287 | Some(v - TARGET_TABLE) 288 | } else { 289 | None 290 | } 291 | } 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /crates/accuracy/src/main.rs: -------------------------------------------------------------------------------- 1 | /*! Accuracy testing for Finite-State Spell-Checkers 2 | 3 | A tool to help testing quality of finite-state spell-checkers. Shows precision 4 | and recall and F scores. 5 | 6 | # Usage examples 7 | 8 | It's a command-line tool: 9 | ```console 10 | $ cargo run -- typos.txt se.zhfst 11 | ``` 12 | will produce statistics of spelling corrections. 13 | 14 | It is possible to fine-tune the options using a configuration file in json 15 | format. 
The format of json file follows from the [`SpellerConfig`] definition in 16 | the main library: 17 | ```console 18 | $ cargo run -- --config config.json typos.txt se.zhfst 19 | ``` 20 | For automated testing in CI there is a --threshold parametre: 21 | ```console 22 | $ cargo run -- --threshold 0.9 typos.txt se.zhfst 23 | ``` 24 | */ 25 | 26 | use chrono::prelude::*; 27 | use divvunspell::types::Weight; 28 | use std::error::Error; 29 | use std::{ 30 | io::Write, 31 | path::Path, 32 | time::{Instant, SystemTime}, 33 | }; 34 | 35 | use clap::Parser; 36 | use distance::damerau_levenshtein; 37 | use divvunspell::archive; 38 | use divvunspell::speller::suggestion::Suggestion; 39 | use divvunspell::speller::{ReweightingConfig, SpellerConfig}; 40 | use indicatif::{ParallelProgressIterator, ProgressBar, ProgressStyle}; 41 | use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; 42 | use serde::Serialize; 43 | use std::path::PathBuf; 44 | 45 | static CFG: SpellerConfig = SpellerConfig { 46 | n_best: Some(10), 47 | max_weight: Some(Weight(10000.0)), 48 | beam: None, 49 | reweight: Some(ReweightingConfig::default_const()), 50 | node_pool_size: 128, 51 | recase: true, 52 | completion_marker: None, 53 | }; 54 | 55 | fn load_words( 56 | path: &str, 57 | max_words: Option, 58 | ) -> Result, Box> { 59 | let mut rdr = csv::ReaderBuilder::new() 60 | .comment(Some(b'#')) 61 | .delimiter(b'\t') 62 | .has_headers(false) 63 | .flexible(true) 64 | .from_path(path)?; 65 | 66 | Ok(rdr 67 | .records() 68 | .filter_map(Result::ok) 69 | .filter_map(|r| { 70 | r.get(0) 71 | .and_then(|x| r.get(1).map(|y| (x.to_string(), y.to_string()))) 72 | }) 73 | .take(max_words.unwrap_or(std::usize::MAX)) 74 | .collect()) 75 | } 76 | 77 | #[derive(Debug, Default, Serialize, PartialOrd, Ord, PartialEq, Eq, Clone, Copy)] 78 | struct Time { 79 | secs: u64, 80 | subsec_nanos: u32, 81 | } 82 | 83 | impl std::fmt::Display for Time { 84 | fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), 
std::fmt::Error> { 85 | let ms = self.secs * 1000 + (self.subsec_nanos as u64 / 1_000_000); 86 | write!(f, "{}ms", ms) 87 | } 88 | } 89 | 90 | #[derive(Debug, Serialize)] 91 | struct AccuracyResult<'a> { 92 | input: &'a str, 93 | expected: &'a str, 94 | distance: usize, 95 | suggestions: Vec, 96 | position: Option, 97 | time: Time, 98 | } 99 | 100 | #[derive(Debug, Serialize)] 101 | struct Report<'a> { 102 | metadata: Option<&'a divvunspell::archive::meta::SpellerMetadata>, 103 | config: &'a SpellerConfig, 104 | summary: Summary, 105 | results: Vec>, 106 | start_timestamp: Time, 107 | total_time: Time, 108 | } 109 | 110 | #[derive(Serialize, Default, Debug, Clone)] 111 | struct Summary { 112 | total_words: u32, 113 | first_position: u32, 114 | top_five: u32, 115 | any_position: u32, 116 | no_suggestions: u32, 117 | only_wrong: u32, 118 | slowest_lookup: Time, 119 | fastest_lookup: Time, 120 | average_time: Time, 121 | average_time_95pc: Time, 122 | } 123 | 124 | impl std::fmt::Display for Summary { 125 | fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { 126 | let percent = 127 | |v: u32| -> String { format!("{:.2}%", v as f32 / self.total_words as f32 * 100f32) }; 128 | 129 | write!( 130 | f, 131 | "[#1] {} [^5] {} [any] {} [none] {} [wrong] {} [fast] {} [slow] {}", 132 | percent(self.first_position), 133 | percent(self.top_five), 134 | percent(self.any_position), 135 | percent(self.no_suggestions), 136 | percent(self.only_wrong), 137 | self.fastest_lookup, 138 | self.slowest_lookup 139 | ) 140 | } 141 | } 142 | 143 | impl Summary { 144 | fn new<'a>(results: &[AccuracyResult<'a>]) -> Summary { 145 | let mut summary = Summary::default(); 146 | 147 | results.iter().for_each(|result| { 148 | summary.total_words += 1; 149 | 150 | if let Some(position) = result.position { 151 | summary.any_position += 1; 152 | 153 | if position == 0 { 154 | summary.first_position += 1; 155 | } 156 | 157 | if position < 5 { 158 | summary.top_five += 1; 159 | } 
160 | } else if result.suggestions.is_empty() { 161 | summary.no_suggestions += 1; 162 | } else { 163 | summary.only_wrong += 1; 164 | } 165 | }); 166 | 167 | summary.slowest_lookup = results 168 | .iter() 169 | .max_by(|x, y| x.time.cmp(&y.time)) 170 | .unwrap() 171 | .time; 172 | summary.fastest_lookup = results 173 | .iter() 174 | .min_by(|x, y| x.time.cmp(&y.time)) 175 | .unwrap() 176 | .time; 177 | 178 | summary 179 | } 180 | } 181 | 182 | #[derive(Debug, Parser)] 183 | #[command( 184 | name = "divvunspell-accuracy", 185 | version, 186 | about = "Accuracy testing for DivvunSpell." 187 | )] 188 | struct Args { 189 | /// Provide JSON config file to override test defaults 190 | #[arg(short = 'c', long)] 191 | config: Option, 192 | 193 | /// The 'input -> expected' list in tab-delimited value file (TSV) 194 | words: Option, 195 | 196 | /// Use the given ZHFST file 197 | zhfst: Option, 198 | 199 | /// The file path for the JSON report output 200 | #[arg(short = 'o', long = "json-output")] 201 | json_output: Option, 202 | 203 | /// The file path for the TSV line append 204 | #[arg(short = 't', long = "tsv-output")] 205 | tsv_output: Option, 206 | 207 | /// Truncate typos list to max number of words specified 208 | #[arg(short = 'w', long = "max-words")] 209 | max_words: Option, 210 | 211 | /// Minimum precision @ 5 for automated testing 212 | #[arg(short = 'T', long)] 213 | threshold: Option, 214 | } 215 | 216 | fn main() -> Result<(), Box> { 217 | tracing_subscriber::fmt::init(); 218 | 219 | let args = Args::parse(); 220 | 221 | let cfg: SpellerConfig = match args.config { 222 | Some(path) => { 223 | let file = std::fs::File::open(path)?; 224 | serde_json::from_reader(file)? 
225 | } 226 | None => CFG.clone(), 227 | }; 228 | 229 | let archive = match args.zhfst { 230 | Some(path) => archive::open(Path::new(&path))?, 231 | None => { 232 | eprintln!("No ZHFST found for given path; aborting."); 233 | std::process::exit(1); 234 | } 235 | }; 236 | 237 | let words = match args.words { 238 | Some(path) => load_words(&path, args.max_words)?, 239 | None => { 240 | eprintln!("No word list for given path; aborting."); 241 | std::process::exit(1); 242 | } 243 | }; 244 | 245 | let pb = ProgressBar::new(words.len() as u64); 246 | pb.set_style( 247 | ProgressStyle::default_bar() 248 | .template("{pos}/{len} [{percent}%] {wide_bar} {elapsed_precise}"), 249 | ); 250 | 251 | let start_time = Instant::now(); 252 | let results = words 253 | .par_iter() 254 | .progress_with(pb) 255 | .map(|(input, expected)| { 256 | let now = Instant::now(); 257 | let suggestions = archive.speller().suggest_with_config(&input, &cfg); 258 | let now = now.elapsed(); 259 | 260 | let time = Time { 261 | secs: now.as_secs(), 262 | subsec_nanos: now.subsec_nanos(), 263 | }; 264 | 265 | let position = suggestions.iter().position(|x| x.value == expected); 266 | 267 | let distance = damerau_levenshtein(input, expected); 268 | AccuracyResult { 269 | input, 270 | expected, 271 | distance, 272 | time, 273 | suggestions, 274 | position, 275 | } 276 | }) 277 | .collect::>(); 278 | 279 | let now = start_time.elapsed(); 280 | let total_time = Time { 281 | secs: now.as_secs(), 282 | subsec_nanos: now.subsec_nanos(), 283 | }; 284 | let now_date = SystemTime::now() 285 | .duration_since(SystemTime::UNIX_EPOCH) 286 | .unwrap(); 287 | let start_timestamp = Time { 288 | secs: now_date.as_secs(), 289 | subsec_nanos: now_date.subsec_nanos(), 290 | }; 291 | 292 | let summary = Summary::new(&results); 293 | println!("{}", summary); 294 | 295 | if let Some(path) = args.json_output { 296 | let output = std::fs::File::create(path)?; 297 | let report = Report { 298 | metadata: archive.metadata(), 299 | 
config: &cfg, 300 | summary: summary.clone(), 301 | results, 302 | start_timestamp, 303 | total_time, 304 | }; 305 | println!("Writing JSON report…"); 306 | serde_json::to_writer_pretty(output, &report)?; 307 | } else if let Some(path) = args.tsv_output { 308 | let mut output = match std::fs::OpenOptions::new().append(true).open(&path) { 309 | Ok(f) => Ok(f), 310 | Err(_) => std::fs::OpenOptions::new() 311 | .create(true) 312 | .append(true) 313 | .open(&path), 314 | }?; 315 | let md = output.metadata()?; 316 | if md.len() == 0 { 317 | // new file, write headers: 318 | output 319 | .write_all(b"id\tdate\ttag/branch\ttop1\ttop5\tworse\tno suggs\twrong suggs\n")?; 320 | } 321 | let git_id = std::process::Command::new("git") 322 | .arg("rev-parse") 323 | .arg("--short") 324 | .arg("HEAD") 325 | .output()?; 326 | output.write_all(String::from_utf8(git_id.stdout).unwrap().trim().as_bytes())?; 327 | output.write_all(b"\t")?; 328 | output.write_all(Local::now().to_rfc3339().as_bytes())?; 329 | output.write_all(b"\t")?; 330 | let git_descr = std::process::Command::new("git").arg("describe").output()?; 331 | output.write_all( 332 | String::from_utf8(git_descr.stdout) 333 | .unwrap() 334 | .trim() 335 | .as_bytes(), 336 | )?; 337 | output.write_all(b"\t")?; 338 | output.write_all(summary.first_position.to_string().as_bytes())?; 339 | output.write_all(b"\t")?; 340 | output.write_all(summary.top_five.to_string().as_bytes())?; 341 | output.write_all(b"\t")?; 342 | output.write_all(summary.any_position.to_string().as_bytes())?; 343 | output.write_all(b"\t")?; 344 | output.write_all(summary.no_suggestions.to_string().as_bytes())?; 345 | output.write_all(b"\t")?; 346 | output.write_all(summary.only_wrong.to_string().as_bytes())?; 347 | output.write_all(b"\n")?; 348 | }; 349 | 350 | println!("Done!"); 351 | match args.threshold { 352 | Some(threshold) => { 353 | if threshold < (summary.top_five as f32 / summary.total_words as f32 * 100.0) { 354 | Ok(()) 355 | } else { 356 | 
Err("accuracy @5 lower threshold")? 357 | } 358 | } 359 | None => Ok(()), 360 | } 361 | } 362 | --------------------------------------------------------------------------------