├── src ├── sophia │ ├── .ignore │ ├── build.rs │ ├── rustfmt.toml │ ├── src │ │ ├── lib.rs │ │ ├── tokenizer │ │ │ ├── mod.rs │ │ │ ├── input.rs │ │ │ ├── cleaner.rs │ │ │ └── token.rs │ │ ├── interpreter │ │ │ ├── mod.rs │ │ │ ├── interpretation.rs │ │ │ ├── coref_categories.rs │ │ │ ├── interpreter.rs │ │ │ ├── antecedent_buffer.rs │ │ │ └── buffer.rs │ │ ├── interpret │ │ │ ├── mod.rs │ │ │ ├── interpretation.rs │ │ │ ├── coref_categories.rs │ │ │ ├── interpreter.rs │ │ │ ├── antecedent_buffer.rs │ │ │ └── buffer.rs │ │ ├── error.rs │ │ ├── vocab │ │ │ ├── mod.rs │ │ │ ├── future_verbs.rs │ │ │ ├── pronoun.rs │ │ │ ├── cache.rs │ │ │ ├── stats.rs │ │ │ ├── f8.rs │ │ │ ├── phrase_intents.rs │ │ │ ├── mwe.rs │ │ │ ├── database.rs │ │ │ ├── category.rs │ │ │ └── spell_check.rs │ │ ├── pos_tagger │ │ │ ├── mod.rs │ │ │ ├── schema.rs │ │ │ ├── tagger.rs │ │ │ ├── pos_tag.rs │ │ │ ├── hmm.rs │ │ │ └── model.rs │ │ └── sophia.rs │ ├── Cargo.toml │ ├── LICENSE │ ├── examples │ │ ├── tokenize.rs │ │ └── interpret.rs │ └── README.md ├── rustfmt.toml └── Cargo.toml ├── .gitignore ├── contribute.md ├── README.md └── LICENSE /src/sophia/.ignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | Cargo.lock 3 | 4 | -------------------------------------------------------------------------------- /src/rustfmt.toml: -------------------------------------------------------------------------------- 1 | 2 | # Keep lines longer for screen reader accessibility 3 | max_width = 100 4 | chain_width = 100 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/sophia/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("cargo:rustc-link-lib=bz2"); 3 | println!("cargo:rustc-link-lib=zstd"); 4 | } 5 | -------------------------------------------------------------------------------- /src/sophia/rustfmt.toml: -------------------------------------------------------------------------------- 1 | 2 | # Keep lines longer for screen reader accessibility 3 | max_width = 100 4 | chain_width = 100 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /contribute.md: -------------------------------------------------------------------------------- 1 | 2 | # Code of Conduct 3 | 4 | 5 | ## rustfmt.toml 6 | 7 | The rustfmt.toml has been configured to retain longer lines, instead of the standard practice of splitting every chain link onto a new line. 8 | 9 | PRs submitted with that configuration removed will be rejected. I know this will be a pet peeve for some, but I'm blind and the maintainer of this project, so that configuration stays; working with tiny lines via a screen reader is a nightmare. 
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [workspace] 3 | members = [ 4 | #"cicero/client", 5 | #"cicero/lib", 6 | #"cicero/server", 7 | #"evolve", 8 | #"quadris", 9 | #"sdk", 10 | "sophia", 11 | # "verax/lib", 12 | #"plugins/core", 13 | #"deps/atlas", 14 | #"deps/falcon-cli", 15 | #"deps/http", 16 | #"deps/opus", 17 | #"deps/parsex", 18 | #"deps/sermo" 19 | ] 20 | resolver = "2" 21 | 22 | [profile.release] 23 | strip="symbols" 24 | debug=true 25 | lto = true 26 | opt-level = 3 27 | codegen-units = 1 28 | panic = "abort" 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/sophia/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | #![allow(non_camel_case_types)] 8 | 9 | pub use self::error::Error; 10 | pub use self::sophia::Sophia; 11 | 12 | pub mod error; 13 | pub mod interpret; 14 | pub mod pos_tagger; 15 | pub mod sophia; 16 | pub mod tokenizer; 17 | pub mod vocab; 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Cicero is your self-hosted AI, built in Rust to understand, remember, and handle your digital tasks. Open-source and fiercely private, it locks big tech out, keeping your data and life yours. 3 | 4 | General release coming shortly. For full details and to stay updated, please visit [https://cicero.sh/](https://cicero.sh/). 5 | 6 | Sophia NLU Engine is now open source and available within the /src/sophia/ directory of this repository. Details at: [https://cicero.sh/sophia/](https://cicero.sh/sophia/). 7 | 8 | 9 | ## Contact 10 | 11 | Join the community discussion at: [https://cicero.sh/forums/](https://cicero.sh/forums/). 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/sophia/src/tokenizer/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | pub use self::input::{MWE, TokenizedInput}; 8 | pub use self::token::{Token, TokenType}; 9 | pub use self::tokenizer::{Buffer, Tokenizer}; 10 | pub use cleaner::TokenCleaner; 11 | 12 | mod cleaner; 13 | mod input; 14 | pub mod token; 15 | mod tokenizer; 16 | -------------------------------------------------------------------------------- /src/sophia/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cicero-sophia" 3 | description = "High-performance NLU (natural language understanding) engine built in Rust for speed, accuracy, and privacy. 
" 4 | version = "0.6.5" 5 | edition = "2024" 6 | authors = ["Aquila Labs"] 7 | homepage = "https://cicero.sh/sophia/" 8 | keywords = ["nlu", "natural-language", "chat", "nlp"] 9 | categories = ["text-processing"] 10 | readme = "README.md" 11 | license="PolyForm-Noncommercial-1.0.0" 12 | repository = "https://github.com/cicero-ai/cicero" 13 | 14 | [lib] 15 | name="sophia" 16 | 17 | [dependencies] 18 | bincode = "1.3.3" 19 | indexmap = { version = "2.11.0", features = ["serde"] } 20 | regex = "1.11.2" 21 | serde = { version = "1.0.219", features = ["derive"] } 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/sophia/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Sophia NLU Engine 3 | Copyright (c) 2025 Aquila Labs of Alberta, Canada 4 | 5 | Licensed under the PolyForm Noncommercial License 1.0.0. 6 | 7 | You may use, copy, modify, and distribute this software for noncommercial purposes, 8 | including personal projects, research, educational use, hobby projects, and internal 9 | use by nonprofit organizations. 10 | 11 | Commercial use — including use in a for-profit company, in a product or service 12 | offered for sale or subscription, or in any revenue-generating application — 13 | requires a separate commercial license. 14 | 15 | For commercial licensing information, visit: 16 | https://cicero.sh/sophia/ 17 | 18 | Full license text: 19 | https://polyformproject.org/licenses/noncommercial/1.0.0/ 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | pub use self::antecedent_buffer::AntecedentBuffer; 7 | pub use self::buffer::Buffer; 8 | pub use self::coref_categories::CoreferenceCategories; 9 | pub use self::interpretation::Interpretation; 10 | pub use self::interpreter::Interpreter; 11 | pub use self::phrase::{ 12 | Adjective, Adverb, Noun, NounModifier, NounOwner, NounSibling, Phrase, PhraseTense, Verb, 13 | VerbModifier, VerbSibling, 14 | }; 15 | pub use self::phrase_buffer::PhraseBuffer; 16 | 17 | mod antecedent_buffer; 18 | mod buffer; 19 | mod coref_categories; 20 | mod interpretation; 21 | mod interpreter; 22 | mod phrase; 23 | mod phrase_buffer; 24 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
6 | 7 | pub use self::antecedent_buffer::AntecedentBuffer; 8 | pub use self::buffer::Buffer; 9 | pub use self::coref_categories::CoreferenceCategories; 10 | pub use self::interpretation::Interpretation; 11 | pub use self::interpreter::Interpreter; 12 | pub use self::phrase::{ 13 | Adjective, Adverb, Noun, NounModifier, NounOwner, NounSibling, Phrase, PhraseTense, Verb, 14 | VerbModifier, VerbSibling, 15 | }; 16 | pub use self::phrase_buffer::PhraseBuffer; 17 | 18 | mod antecedent_buffer; 19 | mod buffer; 20 | mod coref_categories; 21 | mod interpretation; 22 | mod interpreter; 23 | mod phrase; 24 | mod phrase_buffer; 25 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/interpretation.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::{Buffer, Phrase}; 7 | use crate::tokenizer::Token; 8 | use crate::vocab::f8::f8; use std::collections::HashMap; 9 | 10 | /// Represents the result of interpreting input, containing classification scores, tokens, multi-word expressions (MWE), and phrases. 11 | pub struct Interpretation { 12 | pub scores: HashMap<String, f8>, 13 | pub tokens: Vec<Token>, 14 | pub mwe: Vec<Token>, 15 | pub phrases: Vec<Phrase>, 16 | } 17 | 18 | impl Interpretation { 19 | /// Adds a phrase to the interpretation, checking for enclosed character phrases in the buffer before appending. 20 | pub fn push_phrase(&mut self, phrase: Phrase, buffer: &mut Buffer) { 21 | // Combine enclosed phrases, if needed 22 | if !buffer.enclosed_chars.is_empty() { /* TODO: merge enclosed-character phrases before appending */ } 23 | 24 | self.phrases.push(phrase); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/interpretation.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::{Buffer, Phrase}; 8 | use crate::tokenizer::Token; 9 | use crate::vocab::f8::f8; use std::collections::HashMap; 10 | 11 | /// Represents the result of interpreting input, containing classification scores, tokens, multi-word expressions (MWE), and phrases. 12 | pub struct Interpretation { 13 | pub scores: HashMap<String, f8>, 14 | pub tokens: Vec<Token>, 15 | pub mwe: Vec<Token>, 16 | pub phrases: Vec<Phrase>, 17 | } 18 | 19 | impl Interpretation { 20 | /// Adds a phrase to the interpretation, checking for enclosed character phrases in the buffer before appending. 
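/// /// # Example /// A minimal sketch; building a `Phrase` and `Buffer` is omitted here, and `interpretation.phrases` is assumed to start empty: /// ```ignore /// interpretation.push_phrase(phrase, &mut buffer); /// assert_eq!(interpretation.phrases.len(), 1); /// ```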
21 | pub fn push_phrase(&mut self, phrase: Phrase, buffer: &mut Buffer) { 22 | // Combine enclosed phrases, if needed 23 | if !buffer.enclosed_chars.is_empty() { /* TODO: merge enclosed-character phrases before appending */ } 24 | 25 | self.phrases.push(phrase); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/sophia/src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use std::fmt; 8 | 9 | #[derive(Debug)] 10 | pub enum Error { 11 | Save(String), 12 | Load(String), 13 | POSPrediction(String), 14 | Generic(String), 15 | } 16 | 17 | impl std::error::Error for Error {} 18 | 19 | impl fmt::Display for Error { 20 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 21 | match self { 22 | Error::Save(err) => write!(f, "Save error: {}", err), 23 | Error::Load(err) => write!(f, "Load error: {}", err), 24 | Error::POSPrediction(err) => write!(f, "POS tagger logistic regression error: {}", err), 25 | Error::Generic(msg) => write!(f, "{}", msg), 26 | } 27 | } 28 | } 29 | 30 | impl From<std::io::Error> for Error { 31 | fn from(err: std::io::Error) -> Self { 32 | Error::Generic(err.to_string()) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | pub use self::cache::VocabCache; 8 | pub use self::category::{VocabCategory, VocabCategoryDatabase, VocabCategoryIndex}; 9 | pub use self::database::{ 10 | VocabDatabase, VocabDatabaseMeta, VocabPreProcessDatabase, VocabWordDatabase, 11 | }; 12 | pub use self::future_verbs::FutureVerbPhrases; 13 | pub use self::mwe::{Capitalization, MWEType, VocabMWE}; 14 | pub use self::phrase_intents::{PhraseIntent, PhraseIntents}; 15 | pub use self::pronoun::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson}; 16 | pub use self::spell_check::{ 17 | SpellChecker, SpellCheckerCohort, SpellCheckerCohortPOS, SpellCheckerCohortSize, 18 | SpellCheckerEntry, 19 | }; 20 | pub use self::stats::VocabStats; 21 | 22 | mod cache; 23 | mod category; 24 | mod database; 25 | pub mod f8; 26 | mod future_verbs; 27 | pub mod mwe; 28 | mod phrase_intents; 29 | mod pronoun; 30 | mod spell_check; 31 | mod stats; 32 | -------------------------------------------------------------------------------- /src/sophia/src/pos_tagger/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
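//! //! The `TokenKey` trait defined below lets the tagger key a lookup either by a token's numeric index or by its word; a sketch (assumes a `Token` value in hand): //! ```ignore //! let id: i32 = token.get_key(); //! let word: String = token.get_key(); //! ```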
6 | 7 | pub use self::context::{ 8 | AUXILLARY_VERBS, COMMON_ADVERBS, PASSIVE_INDICATORS, POSContext, POSFeature, POSFeatureToken, 9 | POSPrefix, POSSuffix, POSTagGroup, POSWordGroup, SIBLING_TAGS_AFTER, SIBLING_TAGS_BEFORE, 10 | TEMPORAL_ADVERBS, 11 | }; 12 | pub use self::hmm::{HMM, TOTAL_TAGS}; 13 | pub use self::model::{ 14 | POSConjunction, POSModel, POSModelInterface, POSTagModel, POSTagModelRepo, POSWeight, 15 | }; 16 | pub use self::pos_tag::POSTag; 17 | pub use self::tagger::{POSPrediction, POSPredictionMethod, POSTagger}; 18 | use crate::tokenizer::Token; 19 | 20 | mod context; 21 | mod hmm; 22 | mod model; 23 | mod pos_tag; 24 | mod tagger; 25 | 26 | pub trait TokenKey<S> { 27 | fn get_key(&self) -> S; 28 | } 29 | 30 | impl TokenKey<i32> for Token { 31 | fn get_key(&self) -> i32 { 32 | self.index 33 | } 34 | } 35 | 36 | impl TokenKey<String> for Token { 37 | fn get_key(&self) -> String { 38 | self.word.to_string() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | 3 | Cicero Monorepo License Overview 4 | Copyright (c) 2025 Aquila Labs of Alberta, Canada 5 | 6 | This repository contains multiple crates. Each crate may specify its own license in its `Cargo.toml` or its own `LICENSE` file. 7 | 8 | **General Rule:** 9 | 10 | * Unless otherwise noted in a crate's own `Cargo.toml` or `LICENSE` file, all source code in this repository (except code under `/src/deps/`) is licensed under the **PolyForm Noncommercial License 1.0.0**. 11 | * Code under `/src/deps/` is third-party code and is licensed under its original license, which may be MIT, Apache 2.0, GPL, or a similar open-source license. See individual files for details. 12 | 13 | **PolyForm Noncommercial License Summary:** 14 | You may use, copy, modify, and distribute this software for **noncommercial purposes**, including personal projects, research, educational use, hobby projects, and internal use by nonprofit organizations. 15 | 16 | **Commercial use** — including use in a for-profit company, in a product or service offered for sale or subscription, or in any revenue-generating application — requires a separate commercial license. 17 | 18 | For commercial licensing information, visit: 19 | 🔗 **[https://cicero.sh/sophia/](https://cicero.sh/sophia/)** 20 | 21 | Full license text: 22 | 🔗 **[https://polyformproject.org/licenses/noncommercial/1.0.0/](https://polyformproject.org/licenses/noncommercial/1.0.0/)** 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/future_verbs.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | use std::collections::HashMap; 9 | 10 | /// A trie-like structure for storing future verb phrases, tracking completion status and expected verb POS tags. 
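/// /// # Example /// A sketch of building the trie; the phrase and the `VB` verb-placeholder tag are illustrative: /// ```ignore /// let mut phrases = FutureVerbPhrases::new(None); /// phrases.insert("is going to VB"); /// assert!(phrases.children.contains_key("is")); /// ```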
11 | #[derive(Serialize, Deserialize)] 12 | pub struct FutureVerbPhrases { 13 | pub is_complete: bool, 14 | pub expected_verb_pos: Option<String>, 15 | pub children: HashMap<String, Box<FutureVerbPhrases>>, 16 | } 17 | 18 | impl FutureVerbPhrases { 19 | /// Creates a new FutureVerbPhrases node with an optional expected verb POS tag and empty children. 20 | pub fn new(expected_verb_pos: Option<String>) -> Self { 21 | Self { 22 | is_complete: false, 23 | expected_verb_pos, 24 | children: HashMap::new(), 25 | } 26 | } 27 | 28 | /// Inserts a phrase into the trie, marking the final node as complete and handling verb placeholders. 29 | pub fn insert(&mut self, phrase: &str) { 30 | let mut current = self; 31 | for word in phrase.split(" ").collect::<Vec<&str>>().iter() { 32 | let child = if word.starts_with("V") { 33 | "[verb]".to_string() 34 | } else { 35 | word.to_string() 36 | }; 37 | let expected_verb = if word.starts_with("V") { 38 | Some(word.to_string()) 39 | } else { 40 | None 41 | }; 42 | 43 | current = current 44 | .children 45 | .entry(child.to_lowercase()) 46 | .or_insert(Box::new(FutureVerbPhrases::new(expected_verb))); 47 | } 48 | 49 | current.is_complete = true; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/pronoun.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | 9 | /// Represents a pronoun with its linguistic properties, including category, gender, person, and number. 10 | #[derive(Serialize, Deserialize, Clone, Debug)] 11 | pub struct Pronoun { 12 | pub category: PronounCategory, 13 | pub sub_category: PronounCategory, 14 | pub gender: PronounGender, 15 | pub person: PronounPerson, 16 | pub number: PronounNumber, 17 | } 18 | 19 | /// Defines the category of a pronoun, such as personal, possessive, or indefinite. 20 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 21 | pub enum PronounCategory { 22 | none, 23 | personal, 24 | possessive, 25 | indefinite, 26 | reflexive, 27 | demonstrative, 28 | interrogative, 29 | relative, 30 | } 31 | 32 | /// Defines the gender of a pronoun, which can be neutral, male, or female. 33 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 34 | pub enum PronounGender { 35 | neutral, 36 | male, 37 | female, 38 | } 39 | 40 | /// Defines the person of a pronoun, which can be neutral, first, second, or third. 41 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 42 | pub enum PronounPerson { 43 | neutral, 44 | first, 45 | second, 46 | third, 47 | } 48 | 49 | /// Defines the number of a pronoun, which can be neutral, singular, or plural. 50 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 51 | pub enum PronounNumber { 52 | neutral, 53 | singular, 54 | plural, 55 | } 56 | 57 | impl Pronoun { 58 | /// Checks if the pronoun requires anaphora resolution, based on its category and person. 
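/// /// # Example /// A sketch: a third-person personal pronoun (e.g. "he") requires an antecedent: /// ```ignore /// let p = Pronoun { /// category: PronounCategory::personal, /// sub_category: PronounCategory::none, /// gender: PronounGender::male, /// person: PronounPerson::third, /// number: PronounNumber::singular, /// }; /// assert!(p.is_anaphora()); /// ```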
59 | pub fn is_anaphora(&self) -> bool { 60 | if ![ 61 | PronounCategory::personal, 62 | PronounCategory::possessive, 63 | PronounCategory::reflexive, 64 | ] 65 | .contains(&self.category) 66 | { 67 | return false; 68 | } 69 | 70 | if self.person == PronounPerson::first { 71 | return false; 72 | } 73 | 74 | true 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/cache.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use crate::error::Error; 8 | use bincode; 9 | use serde::{Deserialize, Serialize}; 10 | use std::collections::HashMap; 11 | use std::fs; 12 | use std::path::Path; 13 | 14 | /// A cache for storing vocabulary-related data, such as typos, to improve processing efficiency. 15 | #[derive(Serialize, Deserialize, Default)] 16 | pub struct VocabCache { 17 | pub typos: HashMap<String, String>, 18 | } 19 | 20 | impl VocabCache { 21 | /// Loads the vocabulary cache from a file in the specified directory, returning a default cache if the file does not exist. 22 | pub fn load(vocab_dir: &str) -> Result<Self, Error> { 23 | let cache_file = format!("{}/cache.dat", vocab_dir); 24 | if !Path::new(&cache_file).exists() { 25 | return Ok(Self::default()); 26 | } 27 | 28 | let encoded = fs::read(&cache_file)?; 29 | let cache: VocabCache = match bincode::deserialize(&encoded[..]) { 30 | Ok(r) => r, 31 | Err(e) => { 32 | return Err(Error::Load(format!( 33 | "Unable to load vocabulary cache, {}", 34 | e 35 | ))); 36 | } 37 | }; 38 | 39 | Ok(cache) 40 | } 41 | 42 | /// Saves the vocabulary cache to a file in the specified directory using bincode serialization. 43 | pub fn save(&self, vocab_dir: &str) -> Result<(), Error> { 44 | let cache_file = format!("{}/cache.dat", vocab_dir); 45 | let encoded = match bincode::serialize(&self) { 46 | Ok(r) => r, 47 | Err(e) => { 48 | return Err(Error::Save(format!( 49 | "Unable to serialize vocabulary cache, {}", 50 | e 51 | ))); 52 | } 53 | }; 54 | fs::write(&cache_file, &encoded)?; 55 | Ok(()) 56 | } 57 | 58 | /// Adds a typo mapping to the cache, associating the original word with its correct form. 59 | pub fn add_typo(&mut self, original: &str, correct: &str) { 60 | self.typos.insert(original.to_string(), correct.to_string()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/stats.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::VocabDatabase; 8 | use crate::pos_tagger::POSTag; 9 | use std::collections::HashMap; 10 | 11 | /// Populated with basic statistical and meta information regarding the vocabulary data store 12 | /// including number of words, MWEs, ambiguous words, named entities, categories, and so on. 
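/// /// A sketch of typical use (assumes a loaded `VocabDatabase`): /// ```ignore /// let stats = VocabStats::compile(&vocab); /// println!("{} nouns, {} MWEs", stats.nouns, stats.mwes); /// ```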
13 | #[derive(Debug, Default)] 14 | pub struct VocabStats { 15 | pub singular_words: usize, 16 | pub ambiguous_words: usize, 17 | pub mwes: usize, 18 | pub nouns: usize, 19 | pub verbs: usize, 20 | pub adverbs: usize, 21 | pub adjectives: usize, 22 | pub named_entities: usize, 23 | pub synonyms: usize, 24 | pub hypernyms: usize, 25 | pub hyponyms: usize, 26 | pub pos_tags: HashMap<POSTag, usize>, 27 | } 28 | 29 | impl VocabStats { 30 | pub fn compile(vocab: &VocabDatabase) -> Self { 31 | let mut stats = Self::default(); 32 | 33 | // Go through the wordlist 34 | for (_, pos_map) in vocab.words.wordlist.iter() { 35 | // Singular or ambiguous? 36 | if pos_map.len() > 1 { 37 | stats.ambiguous_words += 1; 38 | } 39 | 40 | // POS tags 41 | for (tag, _) in pos_map.iter() { 42 | *stats.pos_tags.entry(*tag).or_insert(0) += 1; 43 | } 44 | } 45 | 46 | // Go through all tokens 47 | for (_, token) in vocab.words.id2token.iter() { 48 | // MWE? 49 | if token.word.contains(" ") { 50 | stats.mwes += 1; 51 | } else { 52 | stats.singular_words += 1; 53 | } 54 | 55 | // Counts 56 | stats.synonyms += token.synonyms.len(); 57 | stats.hypernyms += token.hypernyms.len(); 58 | stats.hyponyms += token.hyponyms.len(); 59 | 60 | // Part of speech 61 | if token.is_noun() { 62 | stats.nouns += 1; 63 | } else if token.is_verb() { 64 | stats.verbs += 1; 65 | } else if token.is_adverb() { 66 | stats.adverbs += 1; 67 | } else if token.is_adjective() { 68 | stats.adjectives += 1; 69 | } 70 | 71 | if token.is_named_entity() { 72 | stats.named_entities += 1; 73 | } 74 | } 75 | 76 | stats 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/sophia/examples/tokenize.rs: -------------------------------------------------------------------------------- 1 | use sophia::{Error, Sophia}; 2 | use std::env; 3 | use std::process; 4 | 5 | /// Tokenizes a sample sentence using the Sophia NLP library, demonstrating token and MWE iteration. 6 | /// 7 | /// This example reads the vocabulary data directory from the first command-line argument. 8 | /// If no argument is provided, it exits with an error message. 9 | /// 10 | /// # Usage 11 | /// 12 | /// ```bash 13 | /// cargo run --example tokenize -- ./vocab_data 14 | /// ``` 15 | /// 16 | /// The example processes the sentence "The quick brown fox jumps over the lazy dog" and prints: 17 | /// - Individual tokens with their indices, words, and part-of-speech (POS) tags. 18 | /// - Multi-word entities (MWEs) with their indices, words, and POS tags. 19 | /// - Tokens with stopwords removed, showing only content-bearing words. 20 | fn main() { 21 | // Retrieve the data directory from the first command-line argument 22 | let datadir = env::args().nth(1).unwrap_or_else(|| { 23 | eprintln!("Error: No data directory provided."); 24 | eprintln!("Usage: cargo run --example tokenize -- <data_dir>"); 25 | process::exit(1); 26 | }); 27 | 28 | // Run the tokenization example, handling errors 29 | if let Err(e) = run(&datadir) { 30 | eprintln!("Error: {}", e); 31 | process::exit(1); 32 | } 33 | } 34 | 35 | /// Runs the tokenization example with the provided data directory. 
36 | fn run(datadir: &str) -> Result<(), Error> { 37 | // Initialize Sophia with the vocabulary directory and language 38 | let sophia = Sophia::new(datadir, "en")?; 39 | 40 | // Tokenize the input text 41 | let input = "The quick brown fox jumps over the lazy dog"; 42 | let output = sophia.tokenize(input)?; 43 | 44 | // Print individual tokens 45 | println!("\nIndividual Tokens:"); 46 | println!("{:-<50}", ""); 47 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 48 | println!("{:-<50}", ""); 49 | for token in output.iter() { 50 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 51 | } 52 | 53 | // Print multi-word entities (MWEs) 54 | println!("\nMulti-Word Entities (MWEs):"); 55 | println!("{:-<50}", ""); 56 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 57 | println!("{:-<50}", ""); 58 | for token in output.mwe() { 59 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 60 | } 61 | 62 | // Print tokens with stopwords removed 63 | println!("\nTokens with Stopwords Removed:"); 64 | println!("{:-<50}", ""); 65 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 66 | println!("{:-<50}", ""); 67 | for token in output.remove_stop_words().iter() { 68 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 69 | } 70 | 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /src/sophia/examples/interpret.rs: -------------------------------------------------------------------------------- 1 | use sophia::{Error, Sophia}; 2 | use std::env; 3 | use std::process; 4 | 5 | /// Interprets a sample sentence using the Sophia NLP library, demonstrating phrase, token, and score analysis. 6 | /// 7 | /// This example reads the vocabulary data directory from the first command-line argument. 8 | /// If no argument is provided, it exits with an error message. 9 | /// 10 | /// # Usage 11 | /// 12 | /// ```bash 13 | /// cargo run --example interpret -- ./vocab_data 14 | /// ``` 15 | /// 16 | /// The example processes the sentence "The quick brown fox jumps over the lazy dog" and prints: 17 | /// - Phrases with their token contents (debug format). 18 | /// - Individual tokens with their indices, words, and part-of-speech (POS) tags. 19 | /// - Multi-word entities (MWEs) with their indices, words, and POS tags. 20 | /// - Classification scores with their labels and floating-point values. 21 | fn main() { 22 | // Retrieve the data directory from the first command-line argument 23 | let datadir = env::args().nth(1).unwrap_or_else(|| { 24 | eprintln!("Error: No data directory provided."); 25 | eprintln!("Usage: cargo run --example interpret -- <data_dir>"); 26 | process::exit(1); 27 | }); 28 | 29 | // Run the interpretation example, handling errors 30 | if let Err(e) = run(&datadir) { 31 | eprintln!("Error: {}", e); 32 | process::exit(1); 33 | } 34 | } 35 | 36 | /// Runs the interpretation example with the provided data directory. 
37 | fn run(datadir: &str) -> Result<(), Error> { 38 | // Initialize Sophia with the vocabulary directory and language 39 | let sophia = Sophia::new(datadir, "en")?; 40 | 41 | // Interpret the input text 42 | let input = "The quick brown fox jumps over the lazy dog"; 43 | let output = sophia.interpret(input)?; 44 | 45 | // Print phrases 46 | println!("\nPhrases:"); 47 | println!("{:-<50}", ""); 48 | for (i, phrase) in output.phrases.iter().enumerate() { 49 | println!("Phrase {}: {:?}", i + 1, phrase); 50 | } 51 | 52 | // Print individual tokens 53 | println!("\nIndividual Tokens:"); 54 | println!("{:-<50}", ""); 55 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 56 | println!("{:-<50}", ""); 57 | for token in output.tokens.iter() { 58 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 59 | } 60 | 61 | // Print multi-word entities (MWEs) 62 | println!("\nMulti-Word Entities (MWEs):"); 63 | println!("{:-<50}", ""); 64 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 65 | println!("{:-<50}", ""); 66 | for token in output.mwe() { 67 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 68 | } 69 | 70 | // Print classification scores 71 | println!("\nClassification Scores:"); 72 | println!("{:-<50}", ""); 73 | println!("{:>6} {:<15}", "Label", "Score"); 74 | println!("{:-<50}", ""); 75 | for (label, score) in output.scores.iter() { 76 | println!("{:>6} {:<15.4}", label, score.to_f32()); 77 | } 78 | 79 | Ok(()) 80 | } 81 | -------------------------------------------------------------------------------- /src/sophia/src/pos_tagger/schema.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::POSTag; 7 | use crate::vocab::f8::f8; 8 | use serde::{Deserialize, Serialize}; 9 | use std::collections::HashMap; 10 | use std::hash::Hash; 11 | use std::ops::Add; 12 | 13 | /// A trait for types that can be used as scores in the POS tagger, requiring default, addition, and serialization capabilities. 14 | pub trait Score: Default + Add<Output = Self> + Serialize + for<'de> Deserialize<'de> {} 15 | 16 | /// A part-of-speech tagger structure that maps tags to words and tracks tag, initial, before, and after scores. 17 | #[derive(Default, Serialize, Deserialize)] 18 | #[serde( 19 | bound = "T: Score, S: Default + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>" 20 | )] 21 | pub struct POSTagger<T, S> { 22 | pub tag2tag: POSTaggerLayer<T>, 23 | pub tag2word: HashMap<S, POSTaggerLayer<T>>, 24 | pub word2word: HashMap<S, S>, 25 | } 26 | 27 | /// A layer of the POS tagger, containing tag scores and initial, before, and after scoring structures. 28 | #[derive(Serialize, Deserialize, Clone)] 29 | #[serde(bound = "T: Score")] 30 | pub struct POSTaggerLayer<T> { 31 | pub tags: HashMap<POSTag, T>, 32 | pub initial: POSTaggerScores<T>, 33 | pub before: POSTaggerScores<T>, 34 | pub after: POSTaggerScores<T>, 35 | } 36 | 37 | /// Stores exact match trie and bigram scores for POS tagging. 38 | #[derive(Default, Serialize, Deserialize, Clone)] 39 | #[serde(bound = "T: Score")] 40 | pub struct POSTaggerScores<T> { 41 | pub exact_matches: HashMap<String, T>, 42 | pub bigrams: Vec<POSTaggerBigramScores<T>>, 43 | } 44 | 45 | /// Stores bigram scores as a mapping from bigram identifiers to lists of tag-score pairs. 
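/// /// # Example /// A sketch using the `usize`-scored variant; the bigram id is arbitrary and `POSTag::NN` is assumed to exist: /// ```ignore /// let mut scores = POSTaggerBigramScores::<usize>::default(); /// scores.incr(42, POSTag::NN); /// scores.incr(42, POSTag::NN); /// assert_eq!(scores.0[&42][0].1, 2); /// ```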
46 | #[derive(Default, Clone, Serialize, Deserialize)] 47 | #[serde(bound = "T: Score")] 48 | pub struct POSTaggerBigramScores<T>(pub HashMap<u16, Vec<(POSTag, T)>>); 49 | 50 | impl<T: Score, S: Default + Eq + PartialEq + Hash + Serialize + for<'de> Deserialize<'de>> POSTagger<T, S> { 51 | /// Creates a new POSTagger instance with default values. 52 | pub fn new() -> Self { 53 | Self::default() 54 | } 55 | } 56 | 57 | impl Score for usize {} 58 | impl Score for f32 {} 59 | impl Score for f8 {} 60 | 61 | impl<T: Score> POSTaggerScores<T> { 62 | pub fn new(size: usize) -> Self { 63 | Self { 64 | exact_matches: HashMap::new(), 65 | bigrams: (0..size) 66 | .map(|_| POSTaggerBigramScores::<T>::default()) 67 | .collect::<Vec<POSTaggerBigramScores<T>>>(), 68 | } 69 | } 70 | } 71 | 72 | impl POSTaggerBigramScores<usize> { 73 | /// Increments the score for a given bigram and tag, adding a new entry if the tag is not present. 74 | pub fn incr(&mut self, bigram: u16, tag: POSTag) { 75 | let scores = self.0.entry(bigram).or_default(); 76 | let index = match scores.iter().position(|score| score.0 == tag) { 77 | Some(r) => r, 78 | None => { 79 | scores.push((tag, 0)); 80 | scores.len() - 1 81 | } 82 | }; 83 | scores[index].1 += 1; 84 | } 85 | } 86 | 87 | impl<T: Score> Default for POSTaggerLayer<T> { 88 | fn default() -> Self { 89 | Self { 90 | tags: HashMap::new(), 91 | initial: POSTaggerScores::new(2), 92 | before: POSTaggerScores::new(4), 93 | after: POSTaggerScores::new(2), 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/f8.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | use std::convert::From; 9 | use std::fmt; 10 | use std::ops::{Add, AddAssign, Mul}; 11 | 12 | /// A fixed-point 8-bit representation for floating-point values in the range [0.0, 1.0], storing a sum and value history. 13 | #[derive(Clone, Default, Debug, PartialEq, Eq)] 14 | pub struct f8 { 15 | values: Vec<u8>, 16 | pub sum: u8, 17 | } 18 | 19 | impl f8 { 20 | /// Creates a new f8 instance from a u8 value, initializing the sum and value history. 21 | pub fn new(value: u8) -> Self { 22 | f8 { 23 | values: vec![value], 24 | sum: value, 25 | } 26 | } 27 | 28 | /// Converts the f8 value to a f32 in the range [0.0, 1.0]. 29 | pub fn to_f32(&self) -> f32 { 30 | self.sum as f32 / 255.0 31 | } 32 | 33 | /// Calculates the sum of all values in the f8 instance as a u16. 34 | fn calculate_sum(&self) -> u16 { 35 | self.values.iter().map(|&v| v as u16).sum() 36 | } 37 | 38 | /// Returns the sum as a u8, representing the quantized value. 39 | fn to_u8(&self) -> u8 { 40 | self.sum 41 | } 42 | } 43 | 44 | impl From<f32> for f8 { 45 | /// Converts a f32 value in [0.0, 1.0] to an f8 instance, quantizing to a u8. 46 | fn from(value: f32) -> Self { 47 | // Map the range [0.0, 1.0] to [0, 255] 48 | let quantized = (value * 255.0).round() as u8; 49 | f8::new(quantized) 50 | } 51 | } 52 | 53 | impl From<f8> for f32 { 54 | /// Converts an f8 instance to a f32 value in [0.0, 1.0]. 55 | fn from(val: f8) -> Self { 56 | (val.to_u8() as f32) / 255.0 57 | } 58 | } 59 | 60 | impl Add for f8 { 61 | /// Adds two f8 instances, combining their value histories and capping the sum at 255. 
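/// /// A sketch of the quantized, saturating arithmetic: /// ```ignore /// let a = f8::from(0.5f32); // sum = 128 /// let b = f8::from(0.75f32); // sum = 191 /// assert_eq!((a + b).sum, 255); // 128 + 191 = 319, capped to 255 /// ```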
62 | type Output = Self; 63 | 64 | fn add(mut self, other: Self) -> Self { 65 | self.values.extend(other.values); 66 | let new_sum = self.calculate_sum().min(255) as u8; 67 | f8 { 68 | values: self.values, 69 | sum: new_sum, 70 | } 71 | } 72 | } 73 | 74 | impl AddAssign<usize> for f8 { 75 | fn add_assign(&mut self, rhs: usize) { 76 | self.values.push(rhs as u8); 77 | self.sum = self.sum.saturating_add(rhs as u8); // saturate rather than overflow, matching the cap in Add 78 | } 79 | } 80 | 81 | impl Mul<f32> for f8 { 82 | type Output = Self; 83 | 84 | fn mul(self, rhs: f32) -> Self { 85 | // Convert the F8 value to f32, multiply, then convert back to F8 86 | let result_f32 = (self.to_u8() as f32 / 255.0) * rhs; 87 | f8::from(result_f32) 88 | } 89 | } 90 | 91 | impl Serialize for f8 { 92 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 93 | where 94 | S: serde::Serializer, 95 | { 96 | serializer.serialize_u8(self.sum) 97 | } 98 | } 99 | 100 | impl<'de> Deserialize<'de> for f8 { 101 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 102 | where 103 | D: serde::Deserializer<'de>, 104 | { 105 | let value = u8::deserialize(deserializer)?; 106 | Ok(f8::new(value)) 107 | } 108 | } 109 | 110 | impl PartialOrd for f8 { 111 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { 112 | Some(self.calculate_sum().cmp(&other.calculate_sum())) 113 | } 114 | } 115 | 116 | impl Ord for f8 { 117 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 118 | self.calculate_sum().cmp(&other.calculate_sum()) 119 | } 120 | } 121 | 122 | impl fmt::Display for f8 { 123 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 124 | write!(f, "{}", self.to_f32()) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/coref_categories.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use crate::pos_tagger::POSTag; 7 | use crate::tokenizer::Token; 8 | use crate::vocab::VocabDatabase; 9 | use std::ops::Range; 10 | 11 | /// Stores category ranges for coreference resolution, including named entity recognition (NER) and noun-based person and entity classifications. 12 | #[derive(Clone, Default)] 13 | pub struct CoreferenceCategories { 14 | ner_person: Range<usize>, 15 | noun_person: Vec<Range<usize>>, 16 | ner_entity: Vec<Range<usize>>, 17 | noun_entity: Vec<Range<usize>>, 18 | } 19 | 20 | impl CoreferenceCategories { 21 | /// Creates a new CoreferenceCategories instance, initializing ranges from the provided vocabulary database. 22 | pub fn new(vocab: &VocabDatabase) -> Self { 23 | Self { 24 | ner_person: vocab.categories.ner.path2range("person").unwrap_or(0..0), 25 | noun_person: Self::compile_noun_person(vocab), 26 | ner_entity: Self::compile_ner_entity(vocab), 27 | noun_entity: Self::compile_noun_entity(vocab), 28 | } 29 | } 30 | 31 | /// Compiles a list of NER entity category ranges for facilities, organizations, and businesses from the vocabulary database. 32 | pub fn compile_ner_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 33 | let mut res: Vec<Range<usize>> = Vec::new(); 34 | for label in ["facility", "organization", "business"].iter() { 35 | if let Some(r) = vocab.categories.ner.path2range(label) { 36 | res.push(r); 37 | } 38 | } 39 | 40 | res 41 | } 42 | 43 | /// Compiles a list of noun person category ranges for military ranks, family relations, occupations, corporate jobs, and individuals. 
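/// /// A sketch (assumes a loaded `VocabDatabase`); at most one range is returned per configured category path: /// ```ignore /// let ranges = CoreferenceCategories::compile_noun_person(&vocab); /// assert!(ranges.len() <= 5); /// ```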
44 | pub fn compile_noun_person(vocab: &VocabDatabase) -> Vec<Range<usize>> { 45 | // Set paths 46 | let paths = [ 47 | "military/military_rank", 48 | "health_and_human/family_relation", 49 | "education/occupation", 50 | "business_and_finance/corporate_job", 51 | "personnel/individual", 52 | ]; 53 | 54 | let mut res: Vec<Range<usize>> = Vec::new(); 55 | for path in paths.iter() { 56 | if let Some(r) = vocab.categories.nouns.path2range(path) { 57 | res.push(r); 58 | } 59 | } 60 | 61 | res 62 | } 63 | 64 | /// Compiles a list of noun entity category ranges for transportation, military vehicles, landforms, infrastructure, and groups. 65 | pub fn compile_noun_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 66 | // Set paths 67 | let paths = [ 68 | "transportation/aircraft", 69 | "transportation/automobile", 70 | "transportation/bycycle", 71 | "transportation/_public_transportation", 72 | "transportation/ship", 73 | "military/vehicle", 74 | "environment/landform", 75 | "architecture_and_construction/infrastructure", 76 | "personnel/group", 77 | ]; 78 | 79 | let mut res: Vec<Range<usize>> = Vec::new(); 80 | for path in paths.iter() { 81 | if let Some(r) = vocab.categories.nouns.path2range(path) { 82 | res.push(r); 83 | } 84 | } 85 | 86 | res 87 | } 88 | 89 | /// Checks if a token represents a person, based on NER person range or noun person categories. 90 | pub fn is_person(&self, token: &Token) -> bool { 91 | if token.is_named_entity() { 92 | return token.has_ner(&self.ner_person); 93 | } 94 | 95 | self.noun_person.iter().any(|r| token.has_category(r)) 96 | } 97 | 98 | /// Checks if a token represents an entity, based on NER entity ranges, plural noun person categories, or noun entity categories. 99 | pub fn is_entity(&self, token: &Token) -> bool { 100 | if token.is_named_entity() { 101 | return self.ner_entity.iter().any(|r| token.has_ner(r)); 102 | } 103 | 104 | if token.pos == POSTag::NNS && self.noun_person.iter().any(|r| token.has_category(r)) { 105 | return true; 106 | } 107 | 108 | self.noun_entity.iter().any(|r| token.has_category(r)) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/phrase_intents.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
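//! //! A sketch of intended use; the `tokens` slice and tokenized `output` are assumed to come from the tokenizer: //! ```ignore //! let mut intents = PhraseIntents::new(); //! intents.insert(PhraseIntent::affirmation, &tokens); //! if let Some((intent, len)) = intents.check(0, &output) { //! println!("matched {} across {} MWE positions", intent, len); //! } //! ```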
6 | 7 | use crate::error::Error; 8 | use crate::tokenizer::{Token, TokenizedInput}; 9 | use serde::{Deserialize, Serialize}; 10 | use std::collections::HashMap; 11 | use std::fmt; 12 | 13 | /// A trie-like structure for storing phrase intents 14 | #[derive(Serialize, Deserialize)] 15 | pub struct PhraseIntents { 16 | pub intent: Option<PhraseIntent>, 17 | pub children: HashMap<i32, Box<PhraseIntents>>, 18 | } 19 | 20 | #[derive(Default, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] 21 | pub enum PhraseIntent { 22 | acknowledgment, 23 | affirmation, 24 | emphasis, 25 | hesitation, 26 | negation, 27 | #[default] 28 | neutral, 29 | rejection, 30 | request, 31 | } 32 | 33 | impl Default for PhraseIntents { 34 | fn default() -> Self { 35 | Self::new() 36 | } 37 | } 38 | 39 | impl PhraseIntents { 40 | /// Creates a new phrase intents node 41 | pub fn new() -> Self { 42 | Self { 43 | intent: None, 44 | children: HashMap::new(), 45 | } 46 | } 47 | 48 | /// Inserts a phrase into the trie 49 | pub fn insert(&mut self, intent: PhraseIntent, tokens: &[Token]) { 50 | let mut current = self; 51 | for token in tokens.iter() { 52 | current = current.children.entry(token.index).or_insert(Box::new(PhraseIntents::new())); 53 | } 54 | current.intent = Some(intent); 55 | } 56 | 57 | /// Checks a sequence of tokens for a matching phrase intent entry 58 | pub fn check( 59 | &self, 60 | mut position: usize, 61 | output: &TokenizedInput, 62 | ) -> Option<(PhraseIntent, usize)> { 63 | let mut length = 0; 64 | let mut index = if let Some(token) = &output.mwe[position].token { 65 | token.index 66 | } else { 67 | output.tokens[output.mwe[position].position].index 68 | }; 69 | let mut current = self; 70 | while let Some(node) = current.children.get(&index) { 71 | length += 1; 72 | if let Some(intent) = node.intent { 73 | return Some((intent, length)); 74 | } 75 | 76 | position += 1; 77 | if position >= output.mwe.len() { 78 | return None; 79 | } 80 | index = if let Some(child_token) = &output.mwe[position].token { 81 | child_token.index 82 | } else { 83 | output.tokens[output.mwe[position].position].index 84 | }; 85 | current = node; 86 | } 87 | 88 | None 89 | } 90 | } 91 | 92 | impl PhraseIntent { 93 | pub fn from_str(value: &str) -> Result<Self, Error> { 94 | match value { 95 | "acknowledgment" => Ok(PhraseIntent::acknowledgment), 96 | "affirmation" => Ok(PhraseIntent::affirmation), 97 | "emphasis" => Ok(PhraseIntent::emphasis), 98 | "hesitation" => Ok(PhraseIntent::hesitation), 99 | "negation" => Ok(PhraseIntent::negation), 100 | "neutral" => Ok(PhraseIntent::neutral), 101 | "rejection" => Ok(PhraseIntent::rejection), 102 | "request" => Ok(PhraseIntent::request), 103 | _ => Err(Error::Generic(format!( 104 | "Invalid phrase intent value, {}", 105 | value 106 | ))), 107 | } 108 | } 109 | } 110 | 111 | impl fmt::Display for PhraseIntent { 112 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 113 | let value = match self { 114 | PhraseIntent::acknowledgment => "acknowledgment".to_string(), 115 | PhraseIntent::affirmation => "affirmation".to_string(), 116 | PhraseIntent::emphasis => "emphasis".to_string(), 117 | PhraseIntent::hesitation => "hesitation".to_string(), 118 | PhraseIntent::negation => "negation".to_string(), 119 | PhraseIntent::neutral => "neutral".to_string(), 120 | PhraseIntent::rejection => "rejection".to_string(), 121 | PhraseIntent::request => "request".to_string(), 122 | }; 123 | write!(f, "{}", value) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- 
/src/sophia/src/interpret/coref_categories.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use crate::pos_tagger::POSTag; 8 | use crate::tokenizer::Token; 9 | use crate::vocab::VocabDatabase; 10 | use std::ops::Range; 11 | 12 | /// Stores category ranges for coreference resolution, including named entity recognition (NER) and noun-based person and entity classifications. 13 | #[derive(Clone, Default)] 14 | pub struct CoreferenceCategories { 15 | ner_person: Range<usize>, 16 | noun_person: Vec<Range<usize>>, 17 | ner_entity: Vec<Range<usize>>, 18 | noun_entity: Vec<Range<usize>>, 19 | } 20 | 21 | impl CoreferenceCategories { 22 | /// Creates a new CoreferenceCategories instance, initializing ranges from the provided vocabulary database. 23 | pub fn new(vocab: &VocabDatabase) -> Self { 24 | Self { 25 | ner_person: vocab.categories.ner.path2range("person").unwrap_or(0..0), 26 | noun_person: Self::compile_noun_person(vocab), 27 | ner_entity: Self::compile_ner_entity(vocab), 28 | noun_entity: Self::compile_noun_entity(vocab), 29 | } 30 | } 31 | 32 | /// Compiles a list of NER entity category ranges for facilities, organizations, and businesses from the vocabulary database. 33 | pub fn compile_ner_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 34 | let mut res: Vec<Range<usize>> = Vec::new(); 35 | for label in ["facility", "organization", "business"].iter() { 36 | if let Some(r) = vocab.categories.ner.path2range(label) { 37 | res.push(r); 38 | } 39 | } 40 | 41 | res 42 | } 43 | 44 | /// Compiles a list of noun person category ranges for military ranks, family relations, occupations, corporate jobs, and individuals. 45 | pub fn compile_noun_person(vocab: &VocabDatabase) -> Vec<Range<usize>> { 46 | // Set paths 47 | let paths = [ 48 | "military/military_rank", 49 | "health_and_human/family_relation", 50 | "education/occupation", 51 | "business_and_finance/corporate_job", 52 | "personnel/individual", 53 | ]; 54 | 55 | let mut res: Vec<Range<usize>> = Vec::new(); 56 | for path in paths.iter() { 57 | if let Some(r) = vocab.categories.nouns.path2range(path) { 58 | res.push(r); 59 | } 60 | } 61 | 62 | res 63 | } 64 | 65 | /// Compiles a list of noun entity category ranges for transportation, military vehicles, landforms, infrastructure, and groups. 66 | pub fn compile_noun_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 67 | // Set paths 68 | let paths = [ 69 | "transportation/aircraft", 70 | "transportation/automobile", 71 | "transportation/bycycle", 72 | "transportation/_public_transportation", 73 | "transportation/ship", 74 | "military/vehicle", 75 | "environment/landform", 76 | "architecture_and_construction/infrastructure", 77 | "personnel/group", 78 | ]; 79 | 80 | let mut res: Vec<Range<usize>> = Vec::new(); 81 | for path in paths.iter() { 82 | if let Some(r) = vocab.categories.nouns.path2range(path) { 83 | res.push(r); 84 | } 85 | } 86 | 87 | res 88 | } 89 | 90 | /// Checks if a token represents a person, based on NER person range or noun person categories. 
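/// /// A sketch (assumes a loaded `VocabDatabase` and a tagged `Token`): /// ```ignore /// let coref = CoreferenceCategories::new(&vocab); /// if coref.is_person(&token) { /* candidate antecedent for "he"/"she" */ } /// ```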
91 | pub fn is_person(&self, token: &Token) -> bool { 92 | if token.is_named_entity() { 93 | return token.has_ner(&self.ner_person); 94 | } 95 | 96 | self.noun_person.iter().any(|r| token.has_category(r)) 97 | } 98 | 99 | /// Checks if a token represents an entity, based on NER entity ranges, plural noun person categories, or noun entity categories. 100 | pub fn is_entity(&self, token: &Token) -> bool { 101 | if token.is_named_entity() { 102 | return self.ner_entity.iter().any(|r| token.has_ner(r)); 103 | } 104 | 105 | if token.pos == POSTag::NNS && self.noun_person.iter().any(|r| token.has_category(r)) { 106 | return true; 107 | } 108 | 109 | self.noun_entity.iter().any(|r| token.has_category(r)) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/sophia/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Sophia NLU Engine (cicero-sophia) 3 | 4 | High-performance NLU (natural language understanding) engine built in Rust for speed, accuracy, and privacy. 5 | 6 | ![Crates.io](https://img.shields.io/crates/v/cicero-sophia.svg) 7 | ![Docs.rs](https://docs.rs/cicero-sophia/badge.svg) 8 | [![License: PolyForm Noncommercial 1.0.0](https://img.shields.io/badge/license-PolyForm--Noncommercial--1.0.0-blue.svg)](LICENSE) 9 | 10 | ## Features 11 | 12 | **Core Capabilities** 13 | 14 | * Industry-leading vocabulary with 914,000 (full) or 145,000 (lite) words 15 | * Sophisticated categorization system spanning 8,700+ hierarchical categories, allowing for easy word to action mapping 16 | * Advanced language processing including POS tagging, anaphora resolution, and named entity recognition 17 | * Intelligent phrase parsing with automated spelling correction 18 | 19 | **Performance** 20 | 21 | * Process ~25,000 words per second on a single thread 22 | * Lightweight deployment: Single 79MB (lite) or 177MB (full) data store 23 | * Zero external dependencies or API calls required 24 | * Privacy-focused with all processing done locally 25 | 26 | GitHub: https://github.com/cicero-ai/cicero/ 27 | 28 | ## License 29 | 30 | Dual license model: free for noncommercial use under the PolyForm Noncommercial License 1.0.0, with a premium license required for commercial use. For full details including an online demo, please visit: [https://cicero.sh/sophia/](https://cicero.sh/sophia/). 31 | 32 | 33 | ## Installation 34 | 35 | Add cicero-sophia to your project by including it in your Cargo.toml: 36 | 37 | ```toml 38 | [dependencies] 39 | cicero-sophia = "0.6.5" 40 | ``` 41 | 42 | 43 | ## Vocabulary Data Store 44 | 45 | To use Sophia, you must obtain the vocabulary data store, which is available free of charge. Simply visit [https://cicero.sh/](https://cicero.sh/), register for a free account, and download the vocabulary data store from within the member's area. 
46 | 47 | ## Usage 48 | 49 | **Example 1: Tokenizing Text** 50 | 51 | ```rust 52 | use sophia::{Sophia, Error}; 53 | 54 | fn main() -> Result<(), Error> { 55 | // Initialize Sophia 56 | let datadir = "./vocab_data"; 57 | let sophia = Sophia::new(datadir, "en")?; 58 | 59 | // Tokenize the input text 60 | let output = sophia.tokenize("The quick brown fox jumps over the lazy dog")?; 61 | 62 | // Print individual tokens 63 | println!("Individual Tokens:"); 64 | for token in output.iter() { 65 | println!(" Word: {} POS: {}", token.word, token.pos); 66 | } 67 | 68 | // Print MWEs 69 | println!("\nMulti-Word Entities (MWEs):"); 70 | for token in output.mwe() { 71 | println!(" Word: {} POS: {}", token.word, token.pos); 72 | } 73 | 74 | Ok(()) 75 | } 76 | ``` 77 | 78 | **Example 2: Interpreting Text** 79 | 80 | ```rust 81 | 82 | use sophia::{Sophia, Error}; 83 | 84 | fn main() -> Result<(), Error> { 85 | // Initialize Sophia 86 | let datadir = "./vocab_data"; 87 | let sophia = Sophia::new(datadir, "en")?; 88 | 89 | // Interpret the input text 90 | let output = sophia.interpret("The quick brown fox jumps over the lazy dog")?; 91 | 92 | // Print phrases 93 | println!("Phrases:"); 94 | for phrase in output.phrases.iter() { 95 | println!(" {:?}", phrase); 96 | } 97 | 98 | // Print individual tokens 99 | println!("\nIndividual Tokens:"); 100 | for token in output.tokens.iter() { 101 | println!(" Word: {} POS: {}", token.word, token.pos); 102 | } 103 | 104 | Ok(()) 105 | } 106 | ``` 107 | 108 | 109 | **Example 3: Retrieve individual word / token** 110 | 111 | ```rust 112 | 113 | use sophia::{Sophia, Error}; 114 | 115 | fn main() -> Result<(), Error> { 116 | // Initialize Sophia 117 | let datadir = "./vocab_data"; 118 | let sophia = Sophia::new(datadir, "en")?; 119 | 120 | // Get word 121 | let token = sophia.get_word("future").unwrap(); 122 | println!("Got word {}, id {}, pos {}", token.word, token.index, token.pos); 123 | 124 | // Get specific token 125 | let token = sophia.get_token(82251).unwrap(); 126 | println!("Got word {}, id {}, pos {}", token.word, token.index, token.pos); 127 | 128 | Ok(()) 129 | } 130 | ``` 131 | 132 | **Example 4: Retrieve Category** 133 | 134 | ```rust 135 | 136 | use sophia::{Sophia, Error}; 137 | 138 | fn main() -> Result<(), Error> { 139 | // Initialize Sophia 140 | let datadir = "./vocab_data"; 141 | let sophia = Sophia::new(datadir, "en")?; 142 | 143 | // Get category 144 | let cat = sophia.get_category("verbs/action/travel/depart").unwrap(); 145 | println!("name {}", cat.name); 146 | println!("fqn: {}", cat.fqn); 147 | println!("word ids: {:?}", cat.words); 148 | 149 | Ok(()) 150 | } 151 | ``` 152 | 153 | ## Contact 154 | 155 | For all inquiries, please complete the contact form at: https://cicero.sh/contact 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/mwe.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
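//! //! A sketch of trie usage; the phrase and index value are illustrative: //! ```ignore //! let mut root = VocabMWE::new("", MWEType::standard); //! root.insert("New York City", 42, MWEType::standard); //! assert_eq!(root.get("new york city"), 42); //! ```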
6 | 
7 | use serde::{Deserialize, Serialize};
8 | use std::collections::HashMap;
9 | 
10 | /// Represents a multi-word entity (MWE) node in a trie-like structure, with index, type, capitalization, and child nodes.
11 | #[derive(Default, Serialize, Deserialize)]
12 | pub struct VocabMWE {
13 |     pub index: i32,
14 |     pub mwe_type: MWEType,
15 |     pub capitalization: Capitalization,
16 |     pub orig_word: String,
17 |     pub children: HashMap<String, Box<VocabMWE>>,
18 | }
19 | 
20 | /// Defines the type of a multi-word entity, which can be standard, scoring, or both.
21 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Clone)]
22 | pub enum MWEType {
23 |     #[default]
24 |     standard,
25 |     scoring,
26 |     both,
27 | }
28 | 
29 | /// Defines the capitalization style of a word, which can be lowercase, uppercase, title case, or other.
30 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Hash, Clone)]
31 | pub enum Capitalization {
32 |     #[default]
33 |     lower,
34 |     upper,
35 |     title,
36 |     other(String),
37 | }
38 | 
39 | impl VocabMWE {
40 |     /// Creates a new VocabMWE node for a word with the specified MWE type and inferred capitalization.
41 |     pub fn new(word: &str, mwe_type: MWEType) -> VocabMWE {
42 |         let capitalization = Self::classify_capitalization(word);
43 | 
44 |         VocabMWE {
45 |             index: 0,
46 |             mwe_type,
47 |             capitalization,
48 |             orig_word: String::new(),
49 |             children: HashMap::new(),
50 |         }
51 |     }
52 | 
53 |     /// Inserts a phrase into the MWE trie, assigning the given index and MWE type, and returns the index.
54 |     pub fn insert(&mut self, phrase: &str, index: i32, mwe_type: MWEType) -> i32 {
55 |         let mut current = self;
56 |         for word in phrase.split(" ").collect::<Vec<&str>>().iter() {
57 |             current = current
58 |                 .children
59 |                 .entry(word.to_lowercase().to_string())
60 |                 .or_insert(Box::new(VocabMWE::new(word, mwe_type.clone())));
61 | 
62 |             if current.mwe_type == MWEType::standard && mwe_type == MWEType::scoring {
63 |                 current.mwe_type = MWEType::both;
64 |             } else if current.mwe_type == MWEType::scoring && mwe_type == MWEType::standard {
65 |                 current.mwe_type = MWEType::both;
66 |             }
67 |         }
68 | 
69 |         current.index = index;
70 |         index
71 |     }
72 | 
73 |     /// Retrieves the index of a multi-word entity phrase from the trie, returning 0 if not found.
74 |     pub fn get(&self, phrase: &str) -> i32 {
75 |         let mut current = self;
76 |         for word in phrase.to_lowercase().split(" ").collect::<Vec<&str>>().iter() {
77 |             match current.children.get(*word) {
78 |                 Some(next) => current = next.as_ref(),
79 |                 None => return 0,
80 |             }
81 |         }
82 |         current.index
83 |     }
84 | 
85 |     /// Classifies the capitalization style of a string (lowercase, uppercase, title case, or other).
86 |     pub fn classify_capitalization(s: &str) -> Capitalization {
87 |         if s.to_lowercase() == s {
88 |             Capitalization::lower
89 |         } else if s.to_uppercase() == s {
90 |             Capitalization::upper
91 |         } else if s.chars().next().map_or(false, |c| c.is_uppercase()) && s.chars().skip(1).all(|c| c.is_lowercase()) {
92 |             Capitalization::title
93 |         } else {
94 |             Capitalization::other(s.to_string())
95 |         }
96 |     }
97 | 
98 |     /// Formats a word according to the node's capitalization style (lowercase, uppercase, title case, or original).
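    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a node built via
    /// `VocabMWE::new`, which infers `Capitalization::title` for "Alberta":
    ///
    /// ```ignore
    /// let node = VocabMWE::new("Alberta", MWEType::standard);
    /// assert_eq!(node.format("alberta"), "Alberta");
    /// ```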
99 |     pub fn format(&self, word: &str) -> String {
100 |         match self.capitalization {
101 |             Capitalization::lower => word.to_lowercase(),
102 |             Capitalization::upper => word.to_uppercase(),
103 |             Capitalization::title => format!(
104 |                 "{}{}",
105 |                 word.chars().next().unwrap().to_uppercase(),
106 |                 &word[1..].to_lowercase()
107 |             ),
108 |             _ => self.orig_word.to_string(),
109 |         }
110 |     }
111 | }
112 | 
113 | impl Capitalization {
114 |     /// Creates a Capitalization variant from a string value, using the original string for 'other' cases.
115 |     pub fn from_str(value: &str, orig: &str) -> Self {
116 |         match value {
117 |             "lower" => Self::lower,
118 |             "upper" => Self::upper,
119 |             "title" => Self::title,
120 |             "other" => Self::other(orig.to_string()),
121 |             _ => panic!("Invalid capitalization value, {}", value),
122 |         }
123 |     }
124 | }
125 | 
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/input.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::Token;
8 | 
9 | /// Represents the tokenized output of input text, including tokens, multi-word entities (MWEs), and iteration state.
10 | #[derive(Default, Clone)]
11 | pub struct TokenizedInput {
12 |     pub original: String,
13 |     pub tokens: Vec<Token>,
14 |     pub mwe: Vec<MWE>,
15 |     pub mwe_scoring: Vec<MWE>,
16 |     position: usize,
17 |     filter_mwe: bool,
18 |     filter_mwe_scoring: bool,
19 |     filter_stopwords: bool,
20 | }
21 | 
22 | /// Represents a multi-word entity (MWE) with a position and an optional associated token.
23 | #[derive(Clone)]
24 | pub struct MWE {
25 |     pub position: usize,
26 |     pub token: Option<Token>,
27 | }
28 | 
29 | impl TokenizedInput {
30 |     /// Creates a new TokenizedInput instance with the provided original text and empty token/MWE lists.
31 |     pub fn new(original: &str) -> Self {
32 |         Self {
33 |             original: original.to_string(),
34 |             tokens: Vec::new(),
35 |             mwe: Vec::new(),
36 |             mwe_scoring: Vec::new(),
37 |             position: 0,
38 |             filter_mwe: false,
39 |             filter_mwe_scoring: false,
40 |             filter_stopwords: false,
41 |         }
42 |     }
43 | 
44 |     /// Returns a new TokenizedInput configured to iterate over individual tokens.
45 |     pub fn iter(&self) -> Self {
46 |         let mut c = self.clone();
47 |         c.filter_mwe = false;
48 |         c.position = 0;
49 |         c
50 |     }
51 | 
52 |     /// Returns a new TokenizedInput configured to iterate over MWEs.
53 |     pub fn mwe(&self) -> Self {
54 |         let mut c = self.clone();
55 |         c.filter_mwe = true;
56 |         c.position = 0;
57 |         c
58 |     }
59 | 
60 |     /// Returns a new TokenizedInput configured to iterate over MWE scoring tokens.
61 |     pub fn mwe_scoring(&self) -> Self {
62 |         let mut c = self.clone();
63 |         c.filter_mwe_scoring = true;
64 |         c.position = 0;
65 |         c
66 |     }
67 | 
68 |     /// Configures the TokenizedInput to filter out stopwords during iteration.
69 |     pub fn remove_stop_words(mut self) -> Self {
70 |         self.filter_stopwords = true;
71 |         self
72 |     }
73 | 
74 |     /// Configures the TokenizedInput to include stopwords during iteration.
75 |     pub fn add_stop_words(mut self) -> Self {
76 |         self.filter_stopwords = false;
77 |         self
78 |     }
79 | 
80 |     /// Retrieves the next MWE token, either from the MWE's token or the token at the MWE's position.
81 |     fn next_mwe(&mut self) -> Option<Token> {
82 |         if self.position >= self.mwe.len() {
83 |             return None;
84 |         }
85 |         let mwe = self.mwe.get(self.position).unwrap();
86 |         self.position += 1;
87 | 
88 |         let token = match mwe.token.clone() {
89 |             Some(r) => r,
90 |             None => self.tokens.get(mwe.position).unwrap().clone(),
91 |         };
92 | 
93 |         Some(token)
94 |     }
95 | 
96 |     /// Retrieves the next MWE scoring token, either from the MWE scoring's token or the token at its position.
97 |     fn next_mwe_scoring(&mut self) -> Option<Token> {
98 |         if self.position >= self.mwe_scoring.len() {
99 |             return None;
100 |         }
101 |         let mwe = self.mwe_scoring.get(self.position).unwrap();
102 |         self.position += 1;
103 | 
104 |         let token = match mwe.token.clone() {
105 |             Some(r) => r,
106 |             None => self.tokens.get(mwe.position).unwrap().clone(),
107 |         };
108 | 
109 |         Some(token)
110 |     }
111 | }
112 | 
113 | impl std::ops::Index<usize> for TokenizedInput {
114 |     type Output = Token;
115 | 
116 |     /// Provides read-only indexing into the token vector by position.
117 |     fn index(&self, index: usize) -> &Self::Output {
118 |         &self.tokens[index]
119 |     }
120 | }
121 | 
122 | impl std::ops::IndexMut<usize> for TokenizedInput {
123 |     /// Provides mutable indexing into the token vector by position.
124 |     fn index_mut(&mut self, index: usize) -> &mut Self::Output {
125 |         &mut self.tokens[index]
126 |     }
127 | }
128 | 
129 | impl Iterator for TokenizedInput {
130 |     /// Advances the iterator, returning the next token based on the current filter (MWE, MWE scoring, or individual tokens).
131 |     type Item = Token;
132 | 
133 |     fn next(&mut self) -> Option<Self::Item> {
134 |         // MWE
135 |         if self.filter_mwe {
136 |             return self.next_mwe();
137 |         } else if self.filter_mwe_scoring {
138 |             return self.next_mwe_scoring();
139 |         }
140 | 
141 |         if self.position >= self.tokens.len() {
142 |             return None;
143 |         }
144 |         let token = self.tokens.get(self.position).unwrap();
145 |         self.position += 1;
146 | 
147 |         Some(token.clone())
148 |     }
149 | }
150 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpreter/interpreter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1)
3 | // See the full license at: https://cicero.sh/license.txt
4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
5 | 
6 | use super::{CoreferenceCategories, Interpretation, PhraseBuffer};
7 | use crate::interpreter::phrase::{Adjective, Adverb};
8 | use crate::pos_tagger::POSTag;
9 | use crate::tokenizer::{TokenizedInput, Tokenizer};
10 | use crate::vocab::VocabDatabase;
11 | use std::collections::HashMap;
12 | 
13 | pub struct Interpreter {
14 |     coref_categories: CoreferenceCategories,
15 | }
16 | 
17 | impl Interpreter {
18 |     /// Creates a new Interpreter instance from the provided vocabulary database.
19 |     pub fn new(vocab: &VocabDatabase) -> Self {
20 |         Self {
21 |             coref_categories: CoreferenceCategories::new(vocab),
22 |         }
23 |     }
24 | 
25 |     /// Interprets user input by tokenizing, processing, and categorizing tokens into an Interpretation struct.
26 |     /// Returns the constructed Interpretation with scores, tokens, multi-word expressions, and phrases.
27 |     pub fn interpret(
28 |         &self,
29 |         input: &str,
30 |         tokenizer: &Tokenizer,
31 |         vocab: &VocabDatabase,
32 |     ) -> Interpretation {
33 |         // Tokenize input
34 |         let mut tokens = tokenizer.encode(input, vocab);
35 |         let mut buffer = PhraseBuffer::new(&self.coref_categories, vocab);
36 | 
37 |         // Go through tokens
38 |         for (x, token) in tokens.mwe().enumerate() {
39 |             buffer.tokens.push(token.clone());
40 | 
41 |             // Check for phrase intent
42 |             if let Some((intent, length)) = vocab.words.phrase_intents.check(x, &tokens) {
43 |                 buffer.add_intent(intent, length);
44 |             }
45 | 
46 |             if token.is_sentence_stopper() {
47 |                 buffer.hard_split(x);
48 |             } else if token.is_noun()
49 |                 && buffer.last_pos == POSTag::VBG
50 |                 && !buffer.current_verbs.is_empty()
51 |             {
52 |                 buffer.current_verbs.last_mut().unwrap().objects.push(x);
53 |             } else if token.is_noun() {
54 |                 buffer.add_noun(x);
55 |             } else if vocab.preprocess.auxillary_verbs.contains(&token.index)
56 |                 || vocab.preprocess.predicative_verbs.contains(&token.index)
57 |             {
58 |                 if vocab.preprocess.auxillary_verbs.contains(&token.index) {
59 |                     buffer.auxillary_verbs.push(x);
60 |                 }
61 |                 if vocab.preprocess.predicative_verbs.contains(&token.index) {
62 |                     buffer.predicative_verbs.push(x);
63 |                 }
64 |             } else if token.is_verb() {
65 |                 buffer.add_verb(x);
66 |             } else if token.is_adverb() {
67 |                 buffer.adverbs.push(Adverb::new(x, &token, vocab));
68 |             } else if token.is_adjective() {
69 |                 buffer.adjectives.push(Adjective::new(x, &token, vocab));
70 |             } else if token.is_pronoun() {
71 |                 buffer.add_pronoun(x);
72 |             } else if token.is_preposition() {
73 |                 buffer.prepositions.push(x);
74 |             } else if token.is_determiner() {
75 |                 buffer.determiners.push(x);
76 |             } else if token.pos == POSTag::CC || [",", ";", "-", "+"].contains(&token.word.as_str())
77 |             {
78 |                 buffer.noun_seperators.push(x);
79 |             } else if !token.is_conjunction() {
80 |                 buffer.noise.push(x);
81 |             }
82 | 
83 |             // Linker
84 |             if token.is_conjunction() {
85 |                 buffer.linkers.push(x);
86 |             }
87 | 
88 |             // Splitter
89 |             if token.is_preposition() || token.is_conjunction() || token.word.as_str() == "," {
90 |                 buffer.splitters.push(x);
91 |             }
92 | 
93 |             // Add non-pronoun to antecedent buffer
94 |             if !token.is_pronoun() {
95 |                 buffer.antecedents.add_non_noun(&token);
96 |             }
97 |             buffer.last_pos = token.pos;
98 |         }
99 | 
100 |         // Finish buffer
101 |         buffer.hard_split(buffer.tokens.len() - 1);
102 | 
103 |         // Instantiate interpretation
104 |         Interpretation {
105 |             scores: self.get_scores(&tokens),
106 |             tokens: std::mem::take(&mut tokens.tokens),
107 |             mwe: std::mem::take(&mut buffer.tokens),
108 |             phrases: std::mem::take(&mut buffer.phrases),
109 |         }
110 |     }
111 | 
112 |     /// Computes classification scores for tokens by averaging scores per code from multi-word expression scoring.
113 |     /// Returns a HashMap mapping classification codes to their average scores.
114 |     fn get_scores(&self, tokens: &TokenizedInput) -> HashMap<i16, f32> {
115 |         let mut res: HashMap<i16, Vec<f32>> = HashMap::new();
116 |         for token in tokens.mwe_scoring() {
117 |             for (code, score) in token.classification_scores.iter() {
118 |                 res.entry(*code).or_default().push(score.to_f32());
119 |             }
120 |         }
121 | 
122 |         // Average scores
123 |         let mut scores: HashMap<i16, f32> = HashMap::new();
124 |         for (code, vec_scores) in res.iter() {
125 |             let avg = vec_scores.iter().sum::<f32>() / (vec_scores.len() as f32);
126 |             scores.insert(*code, avg);
127 |         }
128 | 
129 |         scores
130 |     }
131 | }
132 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpret/interpreter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{CoreferenceCategories, Interpretation, PhraseBuffer};
8 | use crate::interpret::phrase::{Adjective, Adverb};
9 | use crate::pos_tagger::POSTag;
10 | use crate::tokenizer::{TokenizedInput, Tokenizer};
11 | use crate::vocab::VocabDatabase;
12 | use std::collections::HashMap;
13 | 
14 | pub struct Interpreter {
15 |     coref_categories: CoreferenceCategories,
16 | }
17 | 
18 | impl Interpreter {
19 |     /// Creates a new Interpreter instance from the provided vocabulary database.
20 |     pub fn new(vocab: &VocabDatabase) -> Self {
21 |         Self {
22 |             coref_categories: CoreferenceCategories::new(vocab),
23 |         }
24 |     }
25 | 
26 |     /// Interprets user input by tokenizing, processing, and categorizing tokens into an Interpretation struct.
27 |     /// Returns the constructed Interpretation with scores, tokens, multi-word expressions, and phrases.
28 |     pub fn interpret(
29 |         &self,
30 |         input: &str,
31 |         tokenizer: &Tokenizer,
32 |         vocab: &VocabDatabase,
33 |     ) -> Interpretation {
34 |         // Tokenize input
35 |         let mut tokens = tokenizer.encode(input, vocab);
36 |         let mut buffer = PhraseBuffer::new(&self.coref_categories, vocab);
37 | 
38 |         // Go through tokens
39 |         for (x, token) in tokens.mwe().enumerate() {
40 |             buffer.tokens.push(token.clone());
41 | 
42 |             // Check for phrase intent
43 |             if let Some((intent, length)) = vocab.words.phrase_intents.check(x, &tokens) {
44 |                 buffer.add_intent(intent, length);
45 |             }
46 | 
47 |             if token.is_sentence_stopper() {
48 |                 buffer.hard_split(x);
49 |             } else if token.is_noun()
50 |                 && buffer.last_pos == POSTag::VBG
51 |                 && !buffer.current_verbs.is_empty()
52 |             {
53 |                 buffer.current_verbs.last_mut().unwrap().objects.push(x);
54 |             } else if token.is_noun() {
55 |                 buffer.add_noun(x);
56 |             } else if vocab.preprocess.auxillary_verbs.contains(&token.index)
57 |                 || vocab.preprocess.predicative_verbs.contains(&token.index)
58 |             {
59 |                 if vocab.preprocess.auxillary_verbs.contains(&token.index) {
60 |                     buffer.auxillary_verbs.push(x);
61 |                 }
62 |                 if vocab.preprocess.predicative_verbs.contains(&token.index) {
63 |                     buffer.predicative_verbs.push(x);
64 |                 }
65 |             } else if token.is_verb() {
66 |                 buffer.add_verb(x);
67 |             } else if token.is_adverb() {
68 |                 buffer.adverbs.push(Adverb::new(x, &token, vocab));
69 |             } else if token.is_adjective() {
70 |                 buffer.adjectives.push(Adjective::new(x, &token, vocab));
71 |             } else if token.is_pronoun() {
72 |                 buffer.add_pronoun(x);
73 |             } else if token.is_preposition() {
74 |                 buffer.prepositions.push(x);
75 |             } else if token.is_determiner() {
76 |                 buffer.determiners.push(x);
77 |             } else if token.pos == POSTag::CC || [",", ";", "-", "+"].contains(&token.word.as_str())
78 |             {
79 |                 buffer.noun_seperators.push(x);
80 |             } else if !token.is_conjunction() {
81 |                 buffer.noise.push(x);
82 |             }
83 | 
84 |             // Linker
85 |             if token.is_conjunction() {
86 |                 buffer.linkers.push(x);
87 |             }
88 | 
89 |             // Splitter
90 |             if token.is_preposition() || token.is_conjunction() || token.word.as_str() == "," {
91 |                 buffer.splitters.push(x);
92 |             }
93 | 
94 |             // Add non-pronoun to antecedent buffer
95 |             if !token.is_pronoun() {
96 |                 buffer.antecedents.add_non_noun(&token);
97 |             }
98 |             buffer.last_pos = token.pos;
99 |         }
100 | 
101 |         // Finish buffer
102 |         buffer.hard_split(buffer.tokens.len() - 1);
103 | 
104 |         // Instantiate interpretation
105 |         Interpretation {
106 |             scores: self.get_scores(&tokens),
107 |             tokens: std::mem::take(&mut tokens.tokens),
108 |             mwe: std::mem::take(&mut buffer.tokens),
109 |             phrases: std::mem::take(&mut buffer.phrases),
110 |         }
111 |     }
112 | 
113 |     /// Computes classification scores for tokens by averaging scores per code from multi-word expression scoring.
114 |     /// Returns a HashMap mapping classification codes to their average scores.
115 |     fn get_scores(&self, tokens: &TokenizedInput) -> HashMap<i16, f32> {
116 |         let mut res: HashMap<i16, Vec<f32>> = HashMap::new();
117 |         for token in tokens.mwe_scoring() {
118 |             for (code, score) in token.classification_scores.iter() {
119 |                 res.entry(*code).or_default().push(score.to_f32());
120 |             }
121 |         }
122 | 
123 |         // Average scores
124 |         let mut scores: HashMap<i16, f32> = HashMap::new();
125 |         for (code, vec_scores) in res.iter() {
126 |             let avg = vec_scores.iter().sum::<f32>() / (vec_scores.len() as f32);
127 |             scores.insert(*code, avg);
128 |         }
129 | 
130 |         scores
131 |     }
132 | }
133 | 
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/tagger.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{HMM, POSModel, POSModelInterface, POSTag, POSTagModelRepo};
8 | use crate::tokenizer::{Token, TokenizedInput};
9 | use crate::vocab::VocabDatabase;
10 | use serde::{Deserialize, Serialize};
11 | use std::collections::HashMap;
12 | 
13 | /// The POS tagger itself including the base HMM,
14 | /// along with tag and word based post processing models
15 | #[derive(Default, Serialize, Deserialize)]
16 | pub struct POSTagger {
17 |     pub hmm: HMM,
18 |     pub cohort: POSModel,
19 |     pub tags: POSTagModelRepo,
20 |     pub words: HashMap<i32, POSModel>,
21 | }
22 | 
23 | #[derive(Default, Debug, Clone, Serialize, Deserialize)]
24 | pub struct POSPrediction {
25 |     pub method: POSPredictionMethod,
26 |     pub word: String,
27 |     pub prev_tag: POSTag,
28 |     pub tag: POSTag,
29 |     pub confidence: f32,
30 |     pub probabilities: HashMap<POSTag, f32>,
31 |     pub conjunctions: Vec<String>,
32 | }
33 | 
34 | #[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize)]
35 | pub enum POSPredictionMethod {
36 |     #[default]
37 |     non_ambiguous,
38 |     hmm,
39 |     standard,
40 |     conjunction,
41 |     deterministic_rule,
42 |     exception,
43 | }
44 | 
45 | impl POSTagger {
46 |     pub fn new() -> Self {
47 |         Self::default()
48 |     }
49 | 
50 |     /// Applies part-of-speech tagging to the tokenized input, resolving ambiguous words
51 |     pub fn apply(&self, output: &mut TokenizedInput, vocab: &VocabDatabase) {
52 |         // Fix spelling typos
53 |         self.fix_spelling_typos(output, vocab);
54 | 
55 |         // Resolve via HMM model
56 |         self.hmm.apply(&mut output.tokens);
57 | 
58 |         // Iterate through words
59 |         for position in 0..output.tokens.len() {
60 |             if output.tokens[position].potential_pos.len() < 2 {
61 |                 continue;
62 |             }
63 | 
64 |             // Resolve ambiguity
65 |             if let Some(pred) = self.resolve(position, output) {
66 |                 output.tokens[position].pos_prediction = pred.clone();
67 |                 if output.tokens[position].pos != pred.tag
68 |                     && let Some(new_token) = output.tokens[position].update_pos(pred.tag, vocab)
69 |                 {
70 |                     output.tokens[position] = new_token;
71 |                 }
72 |             }
73 |         }
74 |     }
75 | 
76 |     /// Fix spelling typos
77 |     fn fix_spelling_typos(&self, output: &mut TokenizedInput, vocab: &VocabDatabase) {
78 |         for position in 0..output.tokens.len() {
79 |             if output.tokens[position].pos != POSTag::FW {
80 |                 continue;
81 |             }
82 | 
83 |             // Get initial prediction
84 |             if let Some(pred) = self.cohort.predict_cohort(position, &output.tokens) {
85 |                 output.tokens[position].pos_prediction = pred;
86 | 
87 |                 // Get spelling correction
88 |                 if let Some(correction) =
89 |                     vocab.preprocess.spellchecker.try_correct(position, &output.tokens, vocab)
90 |                 {
91 |                     output.tokens[position] = correction;
92 |                 }
93 |             }
94 |         }
95 |     }
96 | 
97 |     // Resolve ambiguity
98 |     fn resolve(&self, position: usize, output: &TokenizedInput) -> Option<POSPrediction> {
99 |         // Check word models
100 |         if let Some(model) = self.words.get(&output.tokens[position].index)
101 |             && let Some(pred) = model.predict(position, &output.tokens)
102 |         {
103 |             return Some(pred);
104 |         }
105 | 
106 |         // Check tag models
107 |         if let Some(pred) = self.check_tag_models(position, &output.tokens) {
108 |             return Some(pred);
109 |         }
110 | 
111 |         None
112 |     }
113 | 
114 |     /// Check the tag models
115 |     fn check_tag_models(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
116 |         let tag = tokens[position].pos;
117 | 
118 |         // Check tag models
119 |         if let Some(model_names) = self.tags.tags.get(&tag) {
120 |             for name in model_names.iter() {
121 |                 let model = self.tags.models.get(&name.to_string()).unwrap();
122 | 
123 |                 // Ensure token is valid for model
124 |                 if !model.target_tags.contains(&tag) {
125 |                     continue;
126 |                 }
127 |                 if !tokens[position]
128 |                     .potential_pos
129 |                     .iter()
130 |                     .filter(|&p_tag| *p_tag != tag)
131 |                     .any(|p_tag| model.target_tags.contains(p_tag))
132 |                 {
133 |                     continue;
134 |                 }
135 | 
136 |                 if let Some(pred) = model.predict(position, tokens) {
137 |                     return Some(pred);
138 |                 }
139 |             }
140 |         }
141 | 
142 |         None
143 |     }
144 | }
145 | 
146 | impl POSPrediction {
147 |     pub fn new(
148 |         method: POSPredictionMethod,
149 |         word: &str,
150 |         prev_tag: POSTag,
151 |         tag: POSTag,
152 |         confidence: f32,
153 |         probabilities: &HashMap<POSTag, f32>,
154 |         conjunctions: &[String],
155 |     ) -> Self {
156 |         Self {
157 |             method,
158 |             word: word.to_string(),
159 |             prev_tag,
160 |             tag,
161 |             confidence,
162 |             probabilities: probabilities.clone(),
163 |             conjunctions: conjunctions.to_vec(),
164 |         }
165 |     }
166 | }
167 | 
--------------------------------------------------------------------------------
/src/sophia/src/vocab/database.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{
8 |     FutureVerbPhrases, PhraseIntents, SpellChecker, VocabCache, VocabCategoryDatabase, VocabMWE,
9 | };
10 | use crate::error::Error;
11 | use crate::pos_tagger::{POSTag, POSTagger};
12 | use crate::tokenizer::Token;
13 | use crate::vocab::mwe::Capitalization;
14 | use bincode;
15 | use indexmap::IndexMap;
16 | use serde::{Deserialize, Serialize};
17 | use std::collections::HashMap;
18 | use std::fs;
19 | use std::path::Path;
20 | use std::sync::Mutex;
21 | 
22 | /// A comprehensive vocabulary database for natural language processing, containing metadata, preprocessing data, words, categories, and a cache.
23 | #[derive(Serialize, Deserialize)]
24 | pub struct VocabDatabase {
25 |     pub meta: VocabDatabaseMeta,
26 |     pub preprocess: VocabPreProcessDatabase,
27 |     pub words: VocabWordDatabase,
28 |     pub categories: VocabCategoryDatabase,
29 |     #[serde(skip_serializing, skip_deserializing)]
30 |     pub cache: Mutex<VocabCache>,
31 | }
32 | 
33 | /// Metadata for the vocabulary database, including version, language, author, and integrity details.
34 | #[derive(Serialize, Deserialize)]
35 | pub struct VocabDatabaseMeta {
36 |     version: (i8, i8, i8),
37 |     language: String,
38 |     author: String,
39 |     creation_time: String,
40 |     sha256_hash: String,
41 |     signature: String,
42 |     comment: String,
43 | }
44 | 
45 | /// Preprocessing data for the vocabulary, including hashes, typos, spellchecker, verb prefixes, and other linguistic resources.
46 | #[derive(Serialize, Deserialize, Clone)]
47 | pub struct VocabPreProcessDatabase {
48 |     pub hashes: HashMap<String, (String, String)>,
49 |     pub spellchecker: SpellChecker,
50 |     pub future_verb_prefixes: Vec<i32>,
51 |     pub stop_words: Vec<i32>,
52 |     pub predicative_verbs: Vec<i32>,
53 |     pub auxillary_verbs: Vec<i32>,
54 |     pub infinitive_prefixes: Vec<i32>,
55 | }
56 | 
57 | /// Word-specific data for the vocabulary, including word lists, POS tagger, MWEs, capitalization, future verbs, and token mappings.
58 | #[derive(Serialize, Deserialize)]
59 | pub struct VocabWordDatabase {
60 |     pub wordlist: HashMap<String, IndexMap<POSTag, i32>>,
61 |     pub pos_tagger: POSTagger,
62 |     pub mwe: VocabMWE,
63 |     pub capitalization: HashMap<i32, Capitalization>,
64 |     pub future_verbs: FutureVerbPhrases,
65 |     pub phrase_intents: PhraseIntents,
66 |     pub id2token: HashMap<i32, Token>,
67 |     pub plural: HashMap<i32, i32>,
68 | }
69 | 
70 | impl VocabDatabase {
71 |     /// Saves the vocabulary database to a file using bincode serialization.
72 |     pub fn save(&mut self, filename: &str) -> Result<(), Error> {
73 |         let encoded: Vec<u8> = match bincode::serialize(&self) {
74 |             Ok(r) => r,
75 |             Err(e) => {
76 |                 return Err(Error::Save(format!(
77 |                     "Unable to serialize vocabulary data store, {}",
78 |                     e
79 |                 )));
80 |             }
81 |         };
82 |         fs::write(filename, &encoded)?;
83 |         Ok(())
84 |     }
85 | 
86 |     /// Loads a vocabulary database from a file in the specified directory, initializing the cache.
87 |     pub fn load(datadir: &str, language: &str) -> Result<Self, Error> {
88 |         let filename = format!("{}/{}.dat", datadir, language);
89 |         if !Path::new(&filename).exists() {
90 |             return Err(Error::Load(format!(
91 |                 "No vocabulary file exists at, {}",
92 |                 filename
93 |             )));
94 |         }
95 |         let contents = fs::read(&filename)?;
96 | 
97 |         let mut vocab: VocabDatabase = match bincode::deserialize(&contents[..]) {
98 |             Ok(r) => r,
99 |             Err(e) => {
100 |                 return Err(Error::Load(format!(
101 |                     "Unable to load the vocabulary file. Please ensure the correct file is in place, and re-download from the secure client area if necessary. Contact customer support if the problem persists. Error: {}",
102 |                     e
103 |                 )));
104 |             }
105 |         };
106 | 
107 |         vocab.cache = Mutex::new(VocabCache::load(datadir)?);
108 |         Ok(vocab)
109 |     }
110 | 
111 |     /// Looks up a word by string, returning a Token based on its vocabulary entry.
112 |     pub fn from_str(&self, word: &str) -> Token {
113 |         let (_, lookup) = match self.lookup_word(word) {
114 |             Some(r) => r,
115 |             None => return Token::default(),
116 |         };
117 | 
118 |         // get token
119 |         let token_id = lookup.values().next().unwrap();
120 |         self.words.id2token.get(&token_id.clone()).unwrap().clone()
121 |     }
122 | 
123 |     /// Converts a word to its corresponding token ID.
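    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a loaded
    /// `VocabDatabase` named `vocab`:
    ///
    /// ```ignore
    /// let id = vocab.to_int("future");
    /// println!("token id: {}", id);
    /// ```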
124 |     pub fn to_int(&self, word: &str) -> i32 {
125 |         let token = self.from_str(word);
126 |         token.index
127 |     }
128 | 
129 |     /// Looks up a word in the vocabulary, returning its string and POS-to-ID mapping if found.
130 |     pub fn lookup_word(&self, word: &str) -> Option<(String, IndexMap<POSTag, i32>)> {
131 |         // Check mwe
132 |         if word.contains(" ") {
133 |             return None;
134 |         }
135 | 
136 |         // Straight lookup
137 |         if let Some(pos_map) = self.words.wordlist.get(word) {
138 |             return Some((word.to_string(), pos_map.clone()));
139 |         }
140 | 
141 |         // Lowercase lookup
142 |         if let Some(index) = self.words.wordlist.get(&word.to_lowercase()) {
143 |             return Some((word.to_string(), index.clone()));
144 |         }
145 | 
146 |         None
147 |     }
148 | 
149 |     /// Creates a Token from a given token ID using the vocabulary database.
150 |     pub fn from_int(&self, token_id: i32) -> Token {
151 |         Token::from_id(token_id, self)
152 |     }
153 | }
154 | 
155 | impl Default for VocabDatabaseMeta {
156 |     fn default() -> VocabDatabaseMeta {
157 |         VocabDatabaseMeta {
158 |             version: (1, 0, 0),
159 |             language: "en".to_string(),
160 |             author: "Aquila Labs".to_string(),
161 |             creation_time: String::new(),
162 |             sha256_hash: String::new(),
163 |             signature: String::new(),
164 |             comment: String::new(),
165 |         }
166 |     }
167 | }
168 | 
--------------------------------------------------------------------------------
/src/sophia/src/vocab/category.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::VocabDatabase;
8 | use indexmap::IndexMap;
9 | use serde::{Deserialize, Serialize};
10 | use std::collections::HashMap;
11 | use std::fmt;
12 | use std::ops::Range;
13 | 
14 | /// A database for storing vocabulary categories, including nouns, verbs, adverbs, adjectives, and named entity recognition (NER) indices.
15 | #[derive(Default, Serialize, Deserialize, Clone)]
16 | pub struct VocabCategoryDatabase {
17 |     pub counter: i16,
18 |     pub nodes: HashMap<i16, VocabCategory>,
19 |     pub nouns: VocabCategoryIndex,
20 |     pub verbs: VocabCategoryIndex,
21 |     pub adverbs: VocabCategoryIndex,
22 |     pub adjectives: VocabCategoryIndex,
23 |     pub ner: VocabCategoryIndex,
24 | }
25 | 
26 | /// A trie-like index for vocabulary categories, mapping paths to category indices and their children.
27 | #[derive(Serialize, Deserialize, Clone)]
28 | pub struct VocabCategoryIndex {
29 |     pub index: i16,
30 |     pub children: IndexMap<String, Box<VocabCategoryIndex>>,
31 | }
32 | 
33 | /// Represents a single category with its fully qualified name (FQN), depth, name, and child categories.
34 | #[derive(Serialize, Deserialize, Clone)]
35 | pub struct VocabCategory {
36 |     pub fqn: Vec<i16>,
37 |     pub depth: i8,
38 |     pub name: String,
39 |     pub children: IndexMap<String, i16>,
40 |     #[serde(skip)]
41 |     pub pos: String,
42 |     #[serde(skip)]
43 |     pub words: Vec<i32>,
44 | }
45 | 
46 | impl VocabCategoryDatabase {
47 |     /// Get category by path name
48 |     pub fn get_category_by_path(&self, path: &str, vocab: &VocabDatabase) -> Option<VocabCategory> {
49 |         // Split path
50 |         let parts: Vec<&str> = path.split("/").collect::<Vec<&str>>();
51 |         if parts.len() < 2 {
52 |             return None;
53 |         }
54 |         let remaining_path = parts[1..].join("/").to_string();
55 | 
56 |         // Get index
57 |         let cat_index = match parts[0] {
58 |             "nouns" => self.nouns.index_by_path(&remaining_path)?,
59 |             "verbs" => self.verbs.index_by_path(&remaining_path)?,
60 |             "adverbs" => self.adverbs.index_by_path(&remaining_path)?,
61 |             "adjectives" => self.adjectives.index_by_path(&remaining_path)?,
62 |             "ner" => self.ner.index_by_path(&remaining_path)?,
63 |             _ => return None,
64 |         };
65 |         let index = cat_index.index;
66 | 
67 |         // Get category
68 |         let mut cat: VocabCategory = self.nodes.get(&index)?.clone();
69 |         cat.pos = parts[0].to_string();
70 |         cat.words = vocab
71 |             .words
72 |             .id2token
73 |             .iter()
74 |             .filter(|(_, token)| token.categories.contains(&index))
75 |             .map(|(id, _)| *id)
76 |             .collect();
77 | 
78 |         Some(cat)
79 |     }
80 | }
81 | 
82 | impl VocabCategoryIndex {
83 |     /// Creates a new VocabCategoryIndex with default values.
84 |     pub fn new() -> Self {
85 |         Self {
86 |             index: 0,
87 |             children: IndexMap::new(),
88 |         }
89 |     }
90 | 
91 |     /// Inserts a category path into the index, assigning the given index and returning it.
92 |     pub fn insert(&mut self, path: &str, index: i16) -> i16 {
93 |         let mut current = self;
94 |         for word in path.split("/").collect::<Vec<&str>>().iter() {
95 |             current = current
96 |                 .children
97 |                 .entry(word.to_lowercase().to_string())
98 |                 .or_insert(Box::new(VocabCategoryIndex::new()));
99 |         }
100 | 
101 |         current.index = index;
102 |         index
103 |     }
104 | 
105 |     /// Retrieves the category index for a given path, if it exists.
106 |     pub fn by_path(&self, path: &str) -> Option<i16> {
107 |         if let Some(cat) = self.index_by_path(path) {
108 |             return Some(cat.index);
109 |         }
110 |         None
111 |     }
112 | 
113 |     /// Retrieves the VocabCategoryIndex object for a given path, if it exists.
114 |     pub fn index_by_path(&self, path: &str) -> Option<VocabCategoryIndex> {
115 |         let mut current = self;
116 |         for word in path.to_lowercase().split("/").collect::<Vec<&str>>().iter() {
117 |             match current.children.get(&word.to_string()) {
118 |                 Some(next) => current = next.as_ref(),
119 |                 None => return None,
120 |             }
121 |         }
122 |         Some(current.clone())
123 |     }
124 | 
125 |     /// Returns the range of category IDs for a path, including its children, if the path exists.
126 |     pub fn path2range(&self, path: &str) -> Option<Range<i16>> {
127 |         if let Some(r) = self.index_by_path(path) {
128 |             return Some(r.index..(r.index + (r.count_children() + 1) as i16));
129 |         }
130 | 
131 |         None
132 |     }
133 | 
134 |     /// Counts the total number of children under this index, including nested children.
135 |     pub fn count_children(&self) -> usize {
136 |         let mut count = self.children.len();
137 | 
138 |         for (_, child) in self.children.iter() {
139 |             count += child.count_children();
140 |         }
141 |         count
142 |     }
143 | }
144 | 
145 | impl VocabCategoryDatabase {
146 |     /// Retrieves a category by its ID, if it exists.
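    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a populated
    /// `VocabCategoryDatabase` named `categories` and a known category id:
    ///
    /// ```ignore
    /// if let Some(cat) = categories.get(&42) {
    ///     println!("category: {}", cat.name);
    /// }
    /// ```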
147 |     pub fn get(&self, category_id: &i16) -> Option<VocabCategory> {
148 |         if let Some(cat) = self.nodes.get(&category_id.clone()) {
149 |             return Some(cat.clone());
150 |         }
151 |         None
152 |     }
153 | 
154 |     /// Retrieves a noun category by its path, if it exists.
155 |     pub fn nouns_by_path(&self, path: &str) -> Option<VocabCategory> {
156 |         match self.nouns.by_path(path) {
157 |             Some(r) => self.get(&r),
158 |             None => None,
159 |         }
160 |     }
161 | 
162 |     /// Returns the number of children for a given category ID.
163 |     pub fn get_children_count(&self, category_id: &i16) -> usize {
164 |         let node = self.nodes.get(&category_id.clone()).unwrap();
165 |         node.children.len()
166 |     }
167 | 
168 |     /// Retrieves the fully qualified names of a category's parent categories.
169 |     pub fn get_fqn(&self, category: &VocabCategory) -> Vec<String> {
170 |         let mut names: Vec<String> = Vec::new();
171 |         for parent_id in category.fqn.iter() {
172 |             let parent_name: String = match self.nodes.get(parent_id) {
173 |                 Some(r) => r.name.to_string(),
174 |                 None => String::from("Unknown"),
175 |             };
176 |             names.push(parent_name.to_string());
177 |         }
178 | 
179 |         names
180 |     }
181 | }
182 | 
183 | impl VocabCategory {}
184 | 
185 | impl fmt::Display for VocabCategory {
186 |     /// Formats the VocabCategory for display, showing its name.
187 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
188 |         write!(f, "{}", self.name)
189 |     }
190 | }
191 | 
192 | impl fmt::Debug for VocabCategory {
193 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194 |         let fqn = self.fqn.iter().map(|id| id.to_string()).collect::<Vec<String>>();
195 |         write!(f, "{} -> {}", fqn.join("/"), self.name)
196 |     }
197 | }
198 | 
199 | impl Default for VocabCategoryIndex {
200 |     fn default() -> VocabCategoryIndex {
201 |         VocabCategoryIndex {
202 |             index: 0,
203 |             children: IndexMap::new(),
204 |         }
205 |     }
206 | }
207 | 
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/cleaner.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{Buffer, Token};
8 | use crate::vocab::VocabDatabase;
9 | 
10 | static SPECIAL_CHARS: &[char] = &[
11 |     '~', '`', '!', '@', '#', '$', '%', '&', '*', '(', ')', '-', '_', '+', '[', ']', '{', '}', '\\',
12 |     '|', ';', ':', '\'', '"', ',', '.', '<', '>',
13 | ];
14 | static NUMERIC_CHARS: &[char] = &['.', ',', '^', '*', '/', ':'];
15 | 
16 | /// A utility for cleaning and classifying tokens, tracking character properties like numeric status and special characters.
17 | #[derive(Default)]
18 | pub struct TokenCleaner {
19 |     chars: Vec<char>,
20 |     word_len: usize,
21 |     numeric_len: usize,
22 |     pub is_numeric: bool,
23 |     pub has_decimal: bool,
24 |     pub has_special: bool,
25 | }
26 | 
27 | impl TokenCleaner {
28 |     /// Creates a new TokenCleaner instance with default values, marking it as numeric.
29 |     pub fn new() -> Self {
30 |         Self {
31 |             is_numeric: true,
32 |             ..Default::default()
33 |         }
34 |     }
35 | 
36 |     /// Resets the TokenCleaner to its initial state, equivalent to calling `new`.
37 |     pub fn reset(&mut self) {
38 |         *self = Self::new();
39 |     }
40 | 
41 |     /// Cleans and classifies a word, updating the buffer with tokens for prefixes, suffixes, or special cases, returning the cleaned word if applicable.
42 |     pub fn clean(
43 |         &mut self,
44 |         mut word: String,
45 |         vocab: &VocabDatabase,
46 |         buffer: &mut Buffer,
47 |     ) -> Option<String> {
48 |         // Scan characters
49 |         self.scan_chars(&mut word, vocab, buffer);
50 | 
51 |         // Classify numeric
52 |         if self.is_numeric {
53 |             self.classify_numeric(&word, vocab, buffer);
54 |             return None;
55 |         }
56 | 
57 |         self.classify_token(&word, vocab, buffer)
58 |     }
59 | 
60 |     /// Scans characters in a word, stripping prefixes/suffixes, updating possessive status, and tracking numeric/special properties.
61 |     fn scan_chars(&mut self, word: &mut String, vocab: &VocabDatabase, buffer: &mut Buffer) {
62 |         // Check for possession
63 |         if word.ends_with("'s") {
64 |             buffer.is_possessive = true;
65 |             *word = word[..word.len() - 2].to_string();
66 |         }
67 | 
68 |         // Iterate through chars
69 |         let mut in_prefix = true;
70 |         for (x, c) in word.chars().enumerate() {
71 |             // Check for currency
72 |             if x == 0 && (c == '-' || c == '+' || c.is_currency_symbol()) {
73 |                 self.chars.push(c);
74 | 
75 |             // Prefix symbol
76 |             } else if in_prefix && SPECIAL_CHARS.contains(&c) {
77 |                 buffer.push_token(Token::prefix(&c.to_string(), vocab));
78 |             } else {
79 |                 in_prefix = false;
80 | 
81 |                 // Update word ending position
82 |                 if c.is_alphanumeric() {
83 |                     self.word_len = self.chars.len() + 1;
84 |                 }
85 | 
86 |                 // Check if it's numeric
87 |                 if self.is_numeric && !self.check_numeric(c) {
88 |                     self.is_numeric = false;
89 |                     self.numeric_len = self.chars.len();
90 |                 }
91 |                 self.chars.push(c);
92 |             }
93 |         }
94 |     }
95 | 
96 |     /// Classifies a numeric word as a time or general numeric token, pushing it to the buffer.
97 |     fn classify_numeric(&mut self, word: &str, vocab: &VocabDatabase, buffer: &mut Buffer) {
98 |         if self.is_time() {
99 |             buffer.push_token(Token::special(word, "|time|", "", "", vocab));
100 |         } else {
101 |             buffer.push_token(Token::numeric(word, vocab));
102 |         }
103 |     }
104 | 
105 |     /// Classifies a non-numeric token, handling numeric suffixes (e.g., decades, ordinals) and returning the cleaned word or None if fully processed.
106 |     fn classify_token(
107 |         &mut self,
108 |         word: &str,
109 |         vocab: &VocabDatabase,
110 |         buffer: &mut Buffer,
111 |     ) -> Option<String> {
112 |         // Check for numeric with suffix (eg. 3rd, 90s).
113 |         if self.numeric_len > 0 {
114 |             let suffix: String = self.chars[self.numeric_len..].iter().collect();
115 | 
116 |             if self.is_decade(&suffix) {
117 |                 let value = self.chars[..self.numeric_len].iter().collect::<String>();
118 |                 buffer.push_token(Token::special(
119 |                     word,
120 |                     "|date_period|",
121 |                     &value,
122 |                     &suffix,
123 |                     vocab,
124 |                 ));
125 |                 return None;
126 |             } else if let Some((suffix_tag, _)) = vocab.preprocess.hashes.get(&suffix) {
127 |                 let value = self.chars[..self.numeric_len].iter().collect::<String>();
128 |                 buffer.push_token(Token::special(word, suffix_tag, &value, &suffix, vocab));
129 |                 return None;
130 |             }
131 |         }
132 | 
133 |         // Add suffix to buffer
134 |         for c in self.chars[self.word_len..].iter() {
135 |             buffer.prepend_suffix(&Token::suffix(&c.to_string(), vocab));
136 |         }
137 | 
138 |         if self.word_len > 0 {
139 |             Some(self.chars[..self.word_len].iter().collect())
140 |         } else {
141 |             None
142 |         }
143 |     }
144 | 
145 |     /// Checks if a character maintains the numeric status of a word, updating decimal and special character flags.
146 |     fn check_numeric(&mut self, c: char) -> bool {
147 |         let mut ok = false;
148 | 
149 |         // Digit
150 |         if c.is_ascii_digit() {
151 |             ok = true;
152 | 
153 |         // Special numeric character (. , /, etc.)
154 |         } else if NUMERIC_CHARS.contains(&c) {
155 |             // Only one non-comma character allowed
156 |             if (self.has_decimal || self.has_special) && c != ',' {
157 |                 self.is_numeric = false;
158 |                 return false;
159 |             }
160 | 
161 |             // Decimal or other character?
162 |             if c == '.' {
163 |                 self.has_decimal = true;
164 |             } else if c != ',' {
165 |                 self.has_special = true;
166 |             }
167 |             ok = true;
168 |         }
169 | 
170 |         ok
171 |     }
172 | 
173 |     /// Determines if the character sequence represents a time format (e.g., H:MM or HH:MM).
174 |     pub fn is_time(&self) -> bool {
175 |         let res = &self.chars;
176 | 
177 |         // Check for H:MM
178 |         if res.len() == 4 {
179 |             if res[1] != ':' || res[0] == '0' {
180 |                 return false;
181 |             }
182 | 
183 |             // Ensure valid minutes
184 |             match format!("{}{}", res[2], res[3]).parse::<u32>() {
185 |                 Ok(mins) => {
186 |                     if mins > 59 {
187 |                         return false;
188 |                     }
189 |                 }
190 |                 Err(_) => return false,
191 |             };
192 | 
193 |         // Check for HH:MM
194 |         } else if res.len() == 5 {
195 |             if res[2] != ':' || !['0', '1', '2'].contains(&res[0]) {
196 |                 return false;
197 |             }
198 | 
199 |             // Ensure valid minutes
200 |             match format!("{}{}", res[3], res[4]).parse::<u32>() {
201 |                 Ok(mins) => {
202 |                     if mins > 59 {
203 |                         return false;
204 |                     }
205 |                 }
206 |                 Err(_) => return false,
207 |             };
208 |         } else {
209 |             return false;
210 |         }
211 | 
212 |         true
213 |     }
214 | 
215 |     /// Checks if the character sequence represents a decade (e.g., 90s or 1990s).
216 |     fn is_decade(&self, suffix: &str) -> bool {
217 |         if suffix != "s" {
218 |             return false;
219 |         }
220 |         let res = &self.chars;
221 | 
222 |         // Check for 2 or 4 digit year
223 | 
224 |         (res.len() == 3 && res[1] == '0')
225 |             || (res.len() == 5 && res[3] == '0' && (res[0] == '1' || res[0] == '2'))
226 |     }
227 | }
228 | 
229 | /// A trait for checking if a character is a currency symbol.
230 | trait IsCurrencySymbol {
231 |     fn is_currency_symbol(self) -> bool;
232 | }
233 | 
234 | impl IsCurrencySymbol for char {
235 |     /// Implements `IsCurrencySymbol` for `char`, checking if the character is a currency symbol ($, €, £, ¥).
236 |     fn is_currency_symbol(self) -> bool {
237 |         matches!(self, '$' | '€' | '£' | '¥')
238 |     }
239 | }
240 | 
--------------------------------------------------------------------------------
/src/sophia/src/sophia.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use crate::error::Error;
8 | use crate::interpret::{Interpretation, Interpreter};
9 | use crate::tokenizer::{Token, TokenizedInput, Tokenizer};
10 | use crate::vocab::{VocabCategory, VocabDatabase, VocabStats};
11 | 
12 | /// The main entry point for the Sophia natural language processing library, integrating tokenization and interpretation capabilities.
13 | ///
14 | /// Contains everything you need for NLU (natural language understanding) tasks.
15 | /// Supports simple tokenization of input into individual words, or into MWEs (multi-word entities) mixed with individual words, along with
16 | /// interpreting user input and breaking it down into usable phrase, noun, verb, and other constructs.
17 | pub struct Sophia {
18 |     pub datadir: String,
19 |     _language: String,
20 |     pub vocab: VocabDatabase,
21 |     pub tokenizer: Tokenizer,
22 |     pub interpreter: Interpreter,
23 | }
24 | 
25 | impl Sophia {
26 |     /// Creates a new `Sophia` instance, loading the vocabulary database from the specified directory and language.
27 |     ///
28 |     /// # Arguments
29 |     /// - `datadir`: The path to the directory containing the vocabulary database files.
30 |     /// - `language`: The language code and filename of the .dat vocabulary file (eg. 'en' for the 'en.dat' file)
31 |     ///
32 |     /// # Returns
33 |     /// A `Result` containing the initialized `Sophia` instance, or an `Error` if the vocabulary cannot be loaded.
34 |     ///
35 |     pub fn new(datadir: &str, language: &str) -> Result<Self, Error> {
36 |         let vocab = VocabDatabase::load(datadir, language)?;
37 | 
38 |         Ok(Self {
39 |             datadir: datadir.to_string(),
40 |             _language: language.to_string(),
41 |             interpreter: Interpreter::new(&vocab),
42 |             tokenizer: Tokenizer::new(),
43 |             vocab,
44 |         })
45 |     }
46 | 
47 |     /// Tokenizes the input text into a `TokenizedInput` containing tokens and MWEs.
48 |     ///
49 |     /// This method processes the input string using the `Tokenizer`, breaking it into individual tokens and identifying multi-word entities (MWEs).
50 |     /// The resulting `TokenizedInput` can be iterated to access tokens or MWEs, with optional filtering for stopwords.
51 |     ///
52 |     /// # Arguments
53 |     /// - `input`: The text to tokenize.
54 |     ///
55 |     /// # Returns
56 |     /// A `TokenizedInput` with the tokenized representation of the input text.
57 |     ///
58 |     /// # Example
59 |     ///
60 |     /// ```no_run
61 |     ///
62 |     /// use sophia::{Sophia, Error};
63 |     ///
64 |     /// fn main() -> Result<(), Error> {
65 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
66 |     ///     let output = sophia.tokenize("The quick brown fox jumps");
67 |     ///
68 |     ///     // Iterate over individual tokens
69 |     ///     for token in output.iter() {
70 |     ///         println!("Word: {}, POS: {}", token.word, token.pos);
71 |     ///     }
72 |     ///
73 |     ///     // Iterate over MWEs
74 |     ///     for token in output.mwe() {
75 |     ///         println!("MWE: {}, POS: {}", token.word, token.pos);
76 |     ///     }
77 |     ///
78 |     ///     Ok(())
79 |     /// }
80 |     /// ```
81 |     pub fn tokenize(&self, input: &str) -> TokenizedInput {
82 |         self.tokenizer.encode(input, &self.vocab)
83 |     }
84 | 
85 |     /// Interprets the input text, and returns an `Interpretation` with tokens, MWEs and usable phrases.
86 |     ///
87 |     /// This method first tokenizes the input using the `Tokenizer` and then processes the tokens using the `Interpreter` to generate a structured
88 |     /// interpretation. The result includes individual tokens, MWEs, and phrases with associated scores for semantic analysis.
89 |     ///
90 |     /// # Arguments
91 |     /// - `input`: The text to interpret.
92 |     ///
93 |     /// # Returns
94 |     /// An `Interpretation` with the analyzed structure of the input text.
95 |     /// # Example
96 |     ///
97 |     /// ```no_run
98 |     ///
99 |     /// use sophia::{Sophia, Error};
100 |     ///
101 |     /// fn main() -> Result<(), Error> {
102 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
103 |     ///     let output = sophia.interpret("The quick brown fox jumps over the fallen tree while running through the forest with his friends.");
104 |     ///
105 |     ///     // Iterate over phrases
106 |     ///     for phrase in output.phrases.iter() {
107 |     ///         println!("Phrase: {:?}", phrase);
108 |     ///         for noun in phrase.nouns.iter() {
109 |     ///             println!("Noun Head: {}", output.tokens[noun.head].word);
110 |     ///         }
111 |     ///         for verb in phrase.verbs.iter() {
112 |     ///             println!("Verb Head: {}", output.tokens[verb.head].word);
113 |     ///         }
114 |     ///     }
115 |     ///
116 |     ///     // Iterate over individual tokens
117 |     ///     for token in output.tokens.iter() {
118 |     ///         println!("Word: {}, POS: {}", token.word, token.pos);
119 |     ///     }
120 |     ///
121 |     ///     Ok(())
122 |     /// }
123 |     /// ```
124 |     pub fn interpret(&self, input: &str) -> Interpretation {
125 |         self.interpreter.interpret(input, &self.tokenizer, &self.vocab)
126 |     }
127 | 
128 |     /// Gets an individual token by its index id#
129 |     ///
130 |     /// # Arguments
131 |     /// - `index`: The index id# of the token to retrieve
132 |     ///
133 |     /// # Returns
134 |     /// An `Option<Token>` containing the token, or `None` if the index id# does not exist.
135 |     /// # Example
136 |     ///
137 |     /// ```no_run
138 |     ///
139 |     /// use sophia::{Sophia, Error};
140 |     /// fn main() -> Result<(), Error> {
141 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
142 |     ///     let output = sophia.tokenize("She was running down the road");
143 |     ///
144 |     ///     // Get the stem of 'running'
145 |     ///     let index = output.tokens[2].stem; // the index id# of the stem of 'running'.
146 |     ///     if let Some(token) = sophia.get_token(index) {
147 |     ///         println!("Stem of running is {}", token.word);
148 |     ///     }
149 |     ///     Ok(())
150 |     /// }
151 |     /// ```
152 |     pub fn get_token(&self, index: i32) -> Option<Token> {
153 |         let token = self.vocab.words.id2token.get(&index)?;
154 |         let mut res = token.clone();
155 |         res.index = index;
156 |         Some(res)
157 |     }
158 | 
159 |     /// Gets an individual token by word.
160 |     ///
161 |     /// # Arguments
162 |     /// - `word`: The word to lookup and retrieve `Token` for.
163 |     ///
164 |     /// # Returns
165 |     /// An `Option<Token>` containing the token, or `None` if the word does not exist.
166 |     /// # Example
167 |     ///
168 |     /// ```no_run
169 |     ///
170 |     /// use sophia::{Sophia, Error};
171 |     /// fn main() -> Result<(), Error> {
172 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
173 |     ///     if let Some(token) = sophia.get_word("running") {
174 |     ///         println!("got token {}", token);
175 |     ///     }
176 |     ///     Ok(())
177 |     /// }
178 |     /// ```
179 |     pub fn get_word(&self, word: &str) -> Option<Token> {
180 |         // Check wordlist
181 |         let pos_map = self.vocab.words.wordlist.get(word)?;
182 | 
183 |         // Get token
184 |         let (_, index) = pos_map.first().unwrap();
185 |         let token = self.vocab.words.id2token.get(index)?;
186 |         let mut res = token.clone();
187 | 
188 |         res.index = *index;
189 |         res.potential_pos = pos_map.keys().copied().collect();
190 | 
191 |         Some(res)
192 |     }
193 | 
194 |     /// Gets a category by its path.
195 |     ///
196 |     /// # Arguments
197 |     /// - `category_path`: The full category path to lookup (eg. verbs/action/search/retrieve/pursue)
198 |     ///
199 |     /// # Returns
200 |     /// An `Option<VocabCategory>` containing the category, or `None` if the category path does not exist.
201 |     /// # Example
202 |     ///
203 |     /// ```no_run
204 |     ///
205 |     /// use sophia::{Sophia, Error};
206 |     /// fn main() -> Result<(), Error> {
207 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
208 |     ///     if let Some(cat) = sophia.get_category("verbs/action/search/retrieve/pursue") {
209 |     ///         println!("got category fqn {:?}", cat.fqn);
210 |     ///     }
211 |     ///     Ok(())
212 |     /// }
213 |     /// ```
214 |     pub fn get_category(&self, category_path: &str) -> Option<VocabCategory> {
215 |         self.vocab.categories.get_category_by_path(category_path, &self.vocab)
216 |     }
217 | 
218 |     /// Returns various statistics regarding the loaded vocabulary file such as total singular / ambiguous words, MWEs, POS tags, and more.
219 |     pub fn get_vocab_stats(&self) -> VocabStats {
220 |         VocabStats::compile(&self.vocab)
221 |     }
222 | }
223 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpreter/antecedent_buffer.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1)
3 | // See the full license at: https://cicero.sh/license.txt
4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
5 | 
6 | use super::CoreferenceCategories;
7 | use crate::pos_tagger::POSTag;
8 | use crate::tokenizer::Token;
9 | use crate::vocab::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson};
10 | use std::collections::HashSet;
11 | 
12 | /// Manages antecedents for coreference resolution, tracking primary and secondary antecedents, person counts, and plural references.
13 | #[derive(Default)]
14 | pub struct AntecedentBuffer {
15 |     coref: CoreferenceCategories,
16 |     count: usize,
17 |     primary: Antecedent,
18 |     secondary: Vec<Antecedent>,
19 |     last_person: String,
20 |     plural_person: HashSet<String>,
21 |     primary_object: String,
22 | }
23 | 
24 | /// Represents an antecedent with a name, part-of-speech tag, type (person, entity, object), and gender.
25 | #[derive(Debug, Clone)]
26 | struct Antecedent {
27 |     name: String,
28 |     pos: String,
29 |     antecedent_type: AntecedentType,
30 |     gender: PronounGender,
31 | }
32 | 
33 | /// Defines the type of an antecedent, which can be none, person, entity, or object.
34 | #[derive(Debug, Clone, PartialEq)]
35 | enum AntecedentType {
36 |     none,
37 |     person,
38 |     entity,
39 |     object,
40 | }
41 | 
42 | impl AntecedentBuffer {
43 |     /// Creates a new AntecedentBuffer with the provided coreference categories.
44 |     pub fn new(coref: &CoreferenceCategories) -> Self {
45 |         Self {
46 |             coref: coref.clone(),
47 |             ..Default::default()
48 |         }
49 |     }
50 | 
51 |     /// Adds a noun token to the antecedent buffer, classifying it as person, entity, or object based on coreference rules.
52 |     pub fn add_noun(&mut self, token: &Token) {
53 |         // Check for person / object
54 |         if self.coref.is_person(token) {
55 |             self.add(&token.word, &token.pos, AntecedentType::person);
56 |             self.last_person = token.word.to_string();
57 |             self.plural_person.insert(token.word.to_string());
58 |         } else if self.coref.is_entity(token) {
59 |             self.add(&token.word, &token.pos, AntecedentType::entity);
60 |         } else if token.is_noun() {
61 |             self.add(&token.word, &token.pos, AntecedentType::object);
62 |         }
63 |     }
64 | 
65 |     /// Adds an antecedent to the buffer, updating primary or secondary lists and setting primary object if applicable.
66 |     fn add(&mut self, name: &str, pos: &POSTag, antecedent_type: AntecedentType) {
67 |         let ant = Antecedent {
68 |             name: name.to_string(),
69 |             pos: pos.to_str(),
70 |             antecedent_type,
71 |             gender: PronounGender::neutral,
72 |         };
73 | 
74 |         if ant.antecedent_type == AntecedentType::object && self.primary_object.is_empty() {
75 |             self.primary_object = name.to_string();
76 |         }
77 | 
78 |         if ant.antecedent_type == AntecedentType::person
79 |             && self.primary.antecedent_type == AntecedentType::none
80 |         {
81 |             self.primary = ant;
82 |         } else {
83 |             self.secondary.push(ant);
84 |         }
85 |     }
86 | 
87 |     /// Adds a non-noun token to the buffer, resetting plural person tracking for verbs/prepositions or clearing the buffer if needed.
88 |     pub fn add_non_noun(&mut self, token: &Token) {
89 |         if token.is_verb() || token.is_preposition() {
90 |             self.plural_person = HashSet::new();
91 |         }
92 | 
93 |         // Clear, if needed
94 |         if self.count >= 30 || token.word.as_str() == "|nl|" {
95 |             self.clear();
96 |         } else {
97 |             self.count += 1;
98 |         }
99 |     }
100 | 
101 |     /// Resolves a pronoun in the token by assigning an antecedent based on gender, number, and person, if applicable.
102 |     pub fn resolve_pronoun(&mut self, token: &mut Token) {
103 |         // Get pronoun
104 |         let pronoun = match &token.pronoun {
105 |             Some(r) => r,
106 |             None => return,
107 |         };
108 |         if !pronoun.is_anaphora() {
109 |             return;
110 |         }
111 |         self.count = 0;
112 | 
113 |         // Ensure third person
114 |         if pronoun.person == PronounPerson::first || pronoun.person == PronounPerson::second {
115 |             return;
116 |         }
117 | 
118 |         // Get antecedent
119 |         if pronoun.gender != PronounGender::neutral && pronoun.number == PronounNumber::singular {
120 |             token.antecedent = self.get_singular_person(pronoun);
121 |         } else if pronoun.gender == PronounGender::neutral
122 |             && pronoun.number == PronounNumber::singular
123 |         {
124 |             token.antecedent = if self.primary_object.is_empty() {
125 |                 None
126 |             } else {
127 |                 Some(self.primary_object.to_string())
128 |             };
129 |         } else if pronoun.number == PronounNumber::plural {
130 |             token.antecedent = self.get_plural(pronoun);
131 |         }
132 |     }
133 | 
134 |     /// Resolves a singular person pronoun, matching gender and updating the primary or secondary antecedent.
135 | fn get_singular_person(&mut self, pronoun: &Pronoun) -> Option { 136 | // First person, or no primary 137 | if self.primary.antecedent_type == AntecedentType::none { 138 | return None; 139 | } 140 | 141 | // Possessive 142 | if (pronoun.category == PronounCategory::possessive 143 | || self.last_person != self.primary.name) 144 | && (self.primary.gender == pronoun.gender 145 | || self.primary.gender == PronounGender::neutral) 146 | { 147 | if self.primary.gender == PronounGender::neutral { 148 | self.primary.gender = pronoun.gender.clone(); 149 | } 150 | //self.last_person = self.primary.name.to_string(); 151 | return Some(self.primary.name.to_string()); 152 | } 153 | 154 | // Go through names, identify correct gender 155 | let mut name = String::new(); 156 | for elem in self.secondary.iter_mut().rev() { 157 | if elem.antecedent_type != AntecedentType::person { 158 | continue; 159 | } else if elem.gender == pronoun.gender { 160 | name = elem.name.to_string(); 161 | break; 162 | } else if elem.gender == PronounGender::neutral { 163 | elem.gender = pronoun.gender.clone(); 164 | name = elem.name.to_string(); 165 | break; 166 | } 167 | } 168 | 169 | // No name found 170 | if name.is_empty() { 171 | return None; 172 | } 173 | 174 | self.last_person = name.to_string(); 175 | Some(name) 176 | } 177 | 178 | /// Resolves a third-person plural pronoun, prioritizing plural persons or falling back to entities/objects. 179 | fn get_plural(&mut self, _pronoun: &Pronoun) -> Option { 180 | // Try for person 181 | if let Some(name) = self.get_plural_person() { 182 | return Some(name); 183 | } 184 | 185 | // Look for entity or object 186 | let mut res: Option = None; 187 | for elem in self.secondary.iter().rev() { 188 | if elem.antecedent_type == AntecedentType::person { 189 | continue; 190 | } 191 | if elem.antecedent_type == AntecedentType::entity 192 | || (elem.antecedent_type == AntecedentType::object && elem.pos.as_str() == "NP") 193 | { 194 | res = Some(elem.name.to_string()); 195 | break; 196 | } 197 | } 198 | 199 | res 200 | } 201 | 202 | /// Retrieves a third-person plural person antecedent by combining primary and secondary persons, if available. 203 | fn get_plural_person(&mut self) -> Option { 204 | if self.plural_person.len() >= 2 && self.plural_person.contains(&self.primary.name) { 205 | return Some(self.plural_person.iter().cloned().collect::>().join("|")); 206 | } else if self.primary.name.is_empty() { 207 | return None; 208 | } 209 | 210 | // Look for person in buffer 211 | let mut people = vec![self.primary.name.to_string()]; 212 | for elem in self.secondary.iter().rev() { 213 | if elem.antecedent_type != AntecedentType::person { 214 | continue; 215 | } 216 | if elem.name != self.primary.name { 217 | people.push(elem.name.to_string()); 218 | break; 219 | } 220 | } 221 | 222 | // Return, if we have two 223 | if people.len() > 1 { 224 | return Some(people.join("|")); 225 | } 226 | 227 | None 228 | } 229 | 230 | /// Clears the antecedent buffer, resetting all fields to their default state. 
231 |     fn clear(&mut self) {
232 |         self.count = 0;
233 |         self.primary = Antecedent::default();
234 |         self.secondary = Vec::new();
235 |         self.last_person = String::new();
236 |         self.plural_person = HashSet::new();
237 |         self.primary_object = String::new();
238 |     }
239 | }
240 |
241 | impl Default for Antecedent {
242 |     fn default() -> Antecedent {
243 |         Antecedent {
244 |             name: String::new(),
245 |             pos: String::new(),
246 |             antecedent_type: AntecedentType::none,
247 |             gender: PronounGender::neutral,
248 |         }
249 |     }
250 | }
251 |
--------------------------------------------------------------------------------
/src/sophia/src/interpret/antecedent_buffer.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use super::CoreferenceCategories;
8 | use crate::pos_tagger::POSTag;
9 | use crate::tokenizer::Token;
10 | use crate::vocab::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson};
11 | use std::collections::HashSet;
12 |
13 | /// Manages antecedents for coreference resolution, tracking primary and secondary antecedents, person counts, and plural references.
14 | #[derive(Default)]
15 | pub struct AntecedentBuffer {
16 |     coref: CoreferenceCategories,
17 |     count: usize,
18 |     primary: Antecedent,
19 |     secondary: Vec<Antecedent>,
20 |     last_person: String,
21 |     plural_person: HashSet<String>,
22 |     primary_object: String,
23 | }
24 |
25 | /// Represents an antecedent with a name, part-of-speech tag, type (person, entity, object), and gender.
26 | #[derive(Debug, Clone)]
27 | struct Antecedent {
28 |     name: String,
29 |     pos: String,
30 |     antecedent_type: AntecedentType,
31 |     gender: PronounGender,
32 | }
33 |
34 | /// Defines the type of an antecedent, which can be none, person, entity, or object.
35 | #[derive(Debug, Clone, PartialEq)]
36 | enum AntecedentType {
37 |     none,
38 |     person,
39 |     entity,
40 |     object,
41 | }
42 |
43 | impl AntecedentBuffer {
44 |     /// Creates a new AntecedentBuffer with the provided coreference categories.
45 |     pub fn new(coref: &CoreferenceCategories) -> Self {
46 |         Self {
47 |             coref: coref.clone(),
48 |             ..Default::default()
49 |         }
50 |     }
51 |
52 |     /// Adds a noun token to the antecedent buffer, classifying it as person, entity, or object based on coreference rules.
53 |     pub fn add_noun(&mut self, token: &Token) {
54 |         // Check for person / object
55 |         if self.coref.is_person(token) {
56 |             self.add(&token.word, &token.pos, AntecedentType::person);
57 |             self.last_person = token.word.to_string();
58 |             self.plural_person.insert(token.word.to_string());
59 |         } else if self.coref.is_entity(token) {
60 |             self.add(&token.word, &token.pos, AntecedentType::entity);
61 |         } else if token.is_noun() {
62 |             self.add(&token.word, &token.pos, AntecedentType::object);
63 |         }
64 |     }
65 |
66 |     /// Adds an antecedent to the buffer, updating primary or secondary lists and setting primary object if applicable.
67 | fn add(&mut self, name: &str, pos: &POSTag, antecedent_type: AntecedentType) { 68 | let ant = Antecedent { 69 | name: name.to_string(), 70 | pos: pos.to_str(), 71 | antecedent_type, 72 | gender: PronounGender::neutral, 73 | }; 74 | 75 | if ant.antecedent_type == AntecedentType::object && self.primary_object.is_empty() { 76 | self.primary_object = name.to_string(); 77 | } 78 | 79 | if ant.antecedent_type == AntecedentType::person 80 | && self.primary.antecedent_type == AntecedentType::none 81 | { 82 | self.primary = ant; 83 | } else { 84 | self.secondary.push(ant); 85 | } 86 | } 87 | 88 | /// Adds a non-noun token to the buffer, resetting plural person tracking for verbs/prepositions or clearing the buffer if needed. 89 | pub fn add_non_noun(&mut self, token: &Token) { 90 | if token.is_verb() || token.is_preposition() { 91 | self.plural_person = HashSet::new(); 92 | } 93 | 94 | // Clear, if needed 95 | if self.count >= 30 || token.word.as_str() == "|nl|" { 96 | self.clear(); 97 | } else { 98 | self.count += 1; 99 | } 100 | } 101 | 102 | /// Resolves a pronoun in the token by assigning an antecedent based on gender, number, and person, if applicable. 103 | pub fn resolve_pronoun(&mut self, token: &mut Token) { 104 | // Get pronoun 105 | let pronoun = match &token.pronoun { 106 | Some(r) => r, 107 | None => return, 108 | }; 109 | if !pronoun.is_anaphora() { 110 | return; 111 | } 112 | self.count = 0; 113 | 114 | // Ensure third person 115 | if pronoun.person == PronounPerson::first || pronoun.person == PronounPerson::second { 116 | return; 117 | } 118 | 119 | // Get antecedent 120 | if pronoun.gender != PronounGender::neutral && pronoun.number == PronounNumber::singular { 121 | token.antecedent = self.get_singular_person(pronoun); 122 | } else if pronoun.gender == PronounGender::neutral 123 | && pronoun.number == PronounNumber::singular 124 | { 125 | token.antecedent = if self.primary_object.is_empty() { 126 | None 127 | } else { 128 | Some(self.primary_object.to_string()) 129 | }; 130 | } else if pronoun.number == PronounNumber::plural { 131 | token.antecedent = self.get_plural(pronoun); 132 | } 133 | } 134 | 135 | /// Resolves a singular person pronoun, matching gender and updating the primary or secondary antecedent. 
136 |     fn get_singular_person(&mut self, pronoun: &Pronoun) -> Option<String> {
137 |         // First person, or no primary
138 |         if self.primary.antecedent_type == AntecedentType::none {
139 |             return None;
140 |         }
141 |
142 |         // Possessive
143 |         if (pronoun.category == PronounCategory::possessive
144 |             || self.last_person != self.primary.name)
145 |             && (self.primary.gender == pronoun.gender
146 |                 || self.primary.gender == PronounGender::neutral)
147 |         {
148 |             if self.primary.gender == PronounGender::neutral {
149 |                 self.primary.gender = pronoun.gender.clone();
150 |             }
151 |             //self.last_person = self.primary.name.to_string();
152 |             return Some(self.primary.name.to_string());
153 |         }
154 |
155 |         // Go through names, identify correct gender
156 |         let mut name = String::new();
157 |         for elem in self.secondary.iter_mut().rev() {
158 |             if elem.antecedent_type != AntecedentType::person {
159 |                 continue;
160 |             } else if elem.gender == pronoun.gender {
161 |                 name = elem.name.to_string();
162 |                 break;
163 |             } else if elem.gender == PronounGender::neutral {
164 |                 elem.gender = pronoun.gender.clone();
165 |                 name = elem.name.to_string();
166 |                 break;
167 |             }
168 |         }
169 |
170 |         // No name found
171 |         if name.is_empty() {
172 |             return None;
173 |         }
174 |
175 |         self.last_person = name.to_string();
176 |         Some(name)
177 |     }
178 |
179 |     /// Resolves a third-person plural pronoun, prioritizing plural persons or falling back to entities/objects.
180 |     fn get_plural(&mut self, _pronoun: &Pronoun) -> Option<String> {
181 |         // Try for person
182 |         if let Some(name) = self.get_plural_person() {
183 |             return Some(name);
184 |         }
185 |
186 |         // Look for entity or object
187 |         let mut res: Option<String> = None;
188 |         for elem in self.secondary.iter().rev() {
189 |             if elem.antecedent_type == AntecedentType::person {
190 |                 continue;
191 |             }
192 |             if elem.antecedent_type == AntecedentType::entity
193 |                 || (elem.antecedent_type == AntecedentType::object && elem.pos.as_str() == "NP")
194 |             {
195 |                 res = Some(elem.name.to_string());
196 |                 break;
197 |             }
198 |         }
199 |
200 |         res
201 |     }
202 |
203 |     /// Retrieves a third-person plural person antecedent by combining primary and secondary persons, if available.
204 |     fn get_plural_person(&mut self) -> Option<String> {
205 |         if self.plural_person.len() >= 2 && self.plural_person.contains(&self.primary.name) {
206 |             return Some(self.plural_person.iter().cloned().collect::<Vec<_>>().join("|"));
207 |         } else if self.primary.name.is_empty() {
208 |             return None;
209 |         }
210 |
211 |         // Look for person in buffer
212 |         let mut people = vec![self.primary.name.to_string()];
213 |         for elem in self.secondary.iter().rev() {
214 |             if elem.antecedent_type != AntecedentType::person {
215 |                 continue;
216 |             }
217 |             if elem.name != self.primary.name {
218 |                 people.push(elem.name.to_string());
219 |                 break;
220 |             }
221 |         }
222 |
223 |         // Return, if we have two
224 |         if people.len() > 1 {
225 |             return Some(people.join("|"));
226 |         }
227 |
228 |         None
229 |     }
230 |
231 |     /// Clears the antecedent buffer, resetting all fields to their default state.
232 | fn clear(&mut self) { 233 | self.count = 0; 234 | self.primary = Antecedent::default(); 235 | self.secondary = Vec::new(); 236 | self.last_person = String::new(); 237 | self.plural_person = HashSet::new(); 238 | self.primary_object = String::new(); 239 | } 240 | } 241 | 242 | impl Default for Antecedent { 243 | fn default() -> Antecedent { 244 | Antecedent { 245 | name: String::new(), 246 | pos: String::new(), 247 | antecedent_type: AntecedentType::none, 248 | gender: PronounGender::neutral, 249 | } 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::{AntecedentBuffer, CoreferenceCategories, Phrase}; 7 | use crate::pos_tagger::POSTag; 8 | use crate::tokenizer::Token; 9 | use std::fmt; 10 | 11 | /// A buffer for processing tokens, tracking verbs, nouns, pronouns, and antecedents, with support for phrase splitting and enclosed character handling. 12 | #[derive(Default)] 13 | pub struct Buffer { 14 | pub position: usize, 15 | pub tokens: Vec, 16 | pub is_locked: bool, 17 | pub enclosed_chars: Vec, 18 | pub enclosed_chars_num_phrases: usize, 19 | pub verbs: Vec, 20 | pub nouns: Vec, 21 | pub pronouns: Vec, 22 | pub antecedents: AntecedentBuffer, 23 | } 24 | 25 | static ENCLOSED_START_CHARS: &[char] = &['"', '\'', '(', '[', '<', '|']; 26 | 27 | impl Buffer { 28 | /// Creates a new Buffer instance with an initialized AntecedentBuffer using the provided coreference categories. 29 | pub fn new(coref: &CoreferenceCategories) -> Self { 30 | Self { 31 | antecedents: AntecedentBuffer::new(coref), 32 | ..Default::default() 33 | } 34 | } 35 | 36 | /// Adds a token to the buffer, updating verb, noun, pronoun, and antecedent tracking, and returns the token's index. 37 | pub fn add(&mut self, buf_token: &Token) -> usize { 38 | let mut token = buf_token.clone(); 39 | 40 | // Process token type 41 | if token.is_verb() { 42 | self.verbs.push(self.tokens.len()); 43 | if self.verbs.len() == 1 { 44 | self.position = self.tokens.len(); 45 | } 46 | self.is_locked = false; 47 | 48 | // Add noun 49 | } else if token.is_noun() { 50 | self.nouns.push(self.tokens.len()); 51 | self.is_locked = false; 52 | self.antecedents.add_noun(&token); 53 | 54 | // Add pronoun 55 | } else if token.is_pronoun() { 56 | self.pronouns.push(self.tokens.len()); 57 | self.antecedents.resolve_pronoun(&mut token); 58 | } 59 | 60 | // Add non-pronoun to antecedent buffer 61 | if !token.is_pronoun() { 62 | self.antecedents.add_non_noun(&token); 63 | } 64 | 65 | // Add token 66 | self.tokens.push(token); 67 | self.tokens.len() - 1 68 | } 69 | 70 | /// Checks if the buffer can be split at the given position based on token type, enclosed characters, and buffer state. 
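    ///
    /// Illustrative outcomes (sketch; assumes the buffer was filled via `add`):
    /// ```ignore
    /// // tokens: [PRP "she", VBD "left", CC "and"] -- a CC token with a pronoun
    /// // and a verb already buffered permits a split:
    /// assert!(buffer.can_split(2));
    /// // Otherwise at least two verbs and one noun are required, and a locked
    /// // buffer always refuses.
    /// ```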
71 | pub fn can_split(&self, x: usize) -> bool { 72 | if self.tokens[x].pos == POSTag::SYM 73 | && self.enclosed_chars.is_empty() 74 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 75 | { 76 | return true; 77 | } else if self.tokens[x].pos == POSTag::SYM 78 | && !self.enclosed_chars.is_empty() 79 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 80 | { 81 | return true; 82 | } else if !self.pronouns.is_empty() 83 | && !self.verbs.is_empty() 84 | && self.tokens[x].pos == POSTag::CC 85 | { 86 | return true; 87 | } else if self.verbs.len() < 2 || self.is_locked { 88 | return false; 89 | } else if self.nouns.is_empty() { 90 | return false; 91 | } 92 | 93 | true 94 | } 95 | /// Attempts to split the buffer into a Phrase if conditions are met, determining the split position and handling enclosed characters. 96 | pub fn split(&mut self) -> Option { 97 | // Check minimum requirements 98 | if !self.can_split(self.tokens.len() - 1) { 99 | return None; 100 | } 101 | 102 | // Determine split position 103 | let mut split_pos = None; 104 | for x in self.position..self.tokens.len() { 105 | self.position = x + 1; 106 | 107 | // Check enclosed char 108 | if self.tokens[x].pos == POSTag::SYM 109 | && self.enclosed_chars.is_empty() 110 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 111 | { 112 | self.enclosed_chars = vec![self.tokens[x].word.chars().next().unwrap(), ' ']; 113 | self.enclosed_chars_num_phrases = 0; 114 | self.enclosed_chars[1] = match self.enclosed_chars[0] { 115 | '\'' => '\'', 116 | '(' => ')', 117 | '[' => ']', 118 | '{' => '}', 119 | '<' => '>', 120 | '|' => '|', 121 | _ => '"', 122 | }; 123 | split_pos = Some(x); 124 | break; 125 | } else if self.tokens[x].pos == POSTag::SYM 126 | && self.enclosed_chars.len() == 2 127 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 128 | { 129 | split_pos = Some(x); 130 | break; 131 | } 132 | 133 | // Unlock, if needed 134 | if self.is_locked && (self.tokens[x].is_noun() || self.tokens[x].is_verb()) { 135 | self.is_locked = false; 136 | } 137 | 138 | if (!self.nouns.is_empty() && self.nouns[0] >= x) || self.is_locked { 139 | continue; 140 | } else if self.tokens[x].is_sentence_stopper() { 141 | split_pos = Some(x); 142 | break; 143 | } 144 | 145 | // Lock if CC tag 146 | if self.tokens[x].pos == POSTag::CC { 147 | self.is_locked = true; 148 | continue; 149 | } 150 | 151 | // Handle previous comma 152 | if x > 0 && self.tokens[x - 1].word.as_str() == "," { 153 | if ["CS", "CA", "VBG"].contains(&self.tokens[x].pos.to_str().as_str()) { 154 | self.is_locked = true; 155 | continue; 156 | } else { 157 | split_pos = Some(x - 1); 158 | break; 159 | } 160 | } 161 | 162 | // Check for potential phrase splitter 163 | if self.tokens[x].word.as_str() == "," 164 | || !["CC", "DT", "PRP", "RB", "RBS", "RBR"] 165 | .contains(&self.tokens[x].pos.to_str().as_str()) 166 | { 167 | continue; 168 | } 169 | 170 | // If adjective, ensure following token is noun 171 | if self.tokens[x].is_adjective() && (self.check_determiner_offset(x) > 0) { 172 | continue; 173 | } else if x > 0 174 | && (self.tokens[x].is_pronoun() || self.tokens[x].is_determiner()) 175 | && self.tokens[x - 1].is_preposition() 176 | { 177 | self.is_locked = true; 178 | continue; 179 | } else if self.tokens[x].is_adverb() && x > 0 && self.tokens[x - 1].is_verb() { 180 | continue; 181 | } 182 | 183 | split_pos = Some(x); 184 | break; 185 | } 186 | 187 | // Split if needed 188 | if let Some(pos) 
= split_pos { 189 | let phrase = self.do_split(pos); 190 | return Some(phrase); 191 | } 192 | 193 | None 194 | } 195 | 196 | /// Performs the actual split at the specified position, creating a new Phrase and updating the buffer's token list. 197 | pub fn do_split(&mut self, split_pos: usize) -> Phrase { 198 | // Get phrase 199 | let remaining_tokens = self.tokens.split_off(split_pos); 200 | let phrase = Phrase::new(&0, None); 201 | 202 | // Drain buffer after a split 203 | self.tokens = remaining_tokens; 204 | //self.drain(phrase.tokens.len()); 205 | 206 | phrase 207 | } 208 | 209 | /// Drains the buffer after a split, updating verb, noun, and pronoun indices and resetting position and lock state. 210 | pub fn drain(&mut self, length: usize) { 211 | self.verbs = self.verbs.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 212 | self.nouns = self.nouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 213 | self.pronouns = 214 | self.pronouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 215 | self.position = if !self.verbs.is_empty() { 216 | self.verbs[0] 217 | } else { 218 | 0 219 | }; 220 | self.is_locked = false; 221 | } 222 | 223 | /// Checks for a determiner followed by a noun (with optional adjective) and returns the offset (2 or 3) or 0 if not found. 224 | pub fn check_determiner_offset(&self, pos: usize) -> usize { 225 | if self.tokens.len() < (pos + 1) { 226 | return 0; 227 | } 228 | 229 | if self.tokens[pos + 1].is_noun() { 230 | return 2; 231 | } else if self.tokens.len() >= (pos + 2) 232 | && self.tokens[pos + 1].is_adjective() 233 | && self.tokens[pos + 2].is_noun() 234 | { 235 | return 3; 236 | } 237 | 238 | 0 239 | } 240 | } 241 | 242 | impl fmt::Debug for Buffer { 243 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 244 | let words = self.tokens.iter().map(|token| token.word.to_string()).collect::>(); 245 | let verbs = self 246 | .verbs 247 | .iter() 248 | .map(|pos| format!("{} {}", self.tokens[*pos].word, pos)) 249 | .collect::>(); 250 | let nouns = self 251 | .nouns 252 | .iter() 253 | .map(|pos| self.tokens[*pos].word.to_string()) 254 | .collect::>(); 255 | write!( 256 | f, 257 | "[buffer] {} [verbs] {} [nouns] {}", 258 | words.join(" "), 259 | verbs.join(", "), 260 | nouns.join(", ") 261 | ) 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::{AntecedentBuffer, CoreferenceCategories, Phrase}; 8 | use crate::pos_tagger::POSTag; 9 | use crate::tokenizer::Token; 10 | use std::fmt; 11 | 12 | /// A buffer for processing tokens, tracking verbs, nouns, pronouns, and antecedents, with support for phrase splitting and enclosed character handling. 
13 | #[derive(Default)]
14 | pub struct Buffer {
15 |     pub position: usize,
16 |     pub tokens: Vec<Token>,
17 |     pub is_locked: bool,
18 |     pub enclosed_chars: Vec<char>,
19 |     pub enclosed_chars_num_phrases: usize,
20 |     pub verbs: Vec<usize>,
21 |     pub nouns: Vec<usize>,
22 |     pub pronouns: Vec<usize>,
23 |     pub antecedents: AntecedentBuffer,
24 | }
25 |
26 | static ENCLOSED_START_CHARS: &[char] = &['"', '\'', '(', '[', '<', '|'];
27 |
28 | impl Buffer {
29 |     /// Creates a new Buffer instance with an initialized AntecedentBuffer using the provided coreference categories.
30 |     pub fn new(coref: &CoreferenceCategories) -> Self {
31 |         Self {
32 |             antecedents: AntecedentBuffer::new(coref),
33 |             ..Default::default()
34 |         }
35 |     }
36 |
37 |     /// Adds a token to the buffer, updating verb, noun, pronoun, and antecedent tracking, and returns the token's index.
38 |     pub fn add(&mut self, buf_token: &Token) -> usize {
39 |         let mut token = buf_token.clone();
40 |
41 |         // Process token type
42 |         if token.is_verb() {
43 |             self.verbs.push(self.tokens.len());
44 |             if self.verbs.len() == 1 {
45 |                 self.position = self.tokens.len();
46 |             }
47 |             self.is_locked = false;
48 |
49 |         // Add noun
50 |         } else if token.is_noun() {
51 |             self.nouns.push(self.tokens.len());
52 |             self.is_locked = false;
53 |             self.antecedents.add_noun(&token);
54 |
55 |         // Add pronoun
56 |         } else if token.is_pronoun() {
57 |             self.pronouns.push(self.tokens.len());
58 |             self.antecedents.resolve_pronoun(&mut token);
59 |         }
60 |
61 |         // Add non-pronoun to antecedent buffer
62 |         if !token.is_pronoun() {
63 |             self.antecedents.add_non_noun(&token);
64 |         }
65 |
66 |         // Add token
67 |         self.tokens.push(token);
68 |         self.tokens.len() - 1
69 |     }
70 |
71 |     /// Checks if the buffer can be split at the given position based on token type, enclosed characters, and buffer state.
72 |     pub fn can_split(&self, x: usize) -> bool {
73 |         if self.tokens[x].pos == POSTag::SYM
74 |             && self.enclosed_chars.is_empty()
75 |             && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap())
76 |         {
77 |             return true;
78 |         } else if self.tokens[x].pos == POSTag::SYM
79 |             && !self.enclosed_chars.is_empty()
80 |             && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap()
81 |         {
82 |             return true;
83 |         } else if !self.pronouns.is_empty()
84 |             && !self.verbs.is_empty()
85 |             && self.tokens[x].pos == POSTag::CC
86 |         {
87 |             return true;
88 |         } else if self.verbs.len() < 2 || self.is_locked {
89 |             return false;
90 |         } else if self.nouns.is_empty() {
91 |             return false;
92 |         }
93 |
94 |         true
95 |     }
96 |     /// Attempts to split the buffer into a Phrase if conditions are met, determining the split position and handling enclosed characters.
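    ///
    /// Typical driving loop (sketch; assumes tokens are produced by the tokenizer):
    /// ```ignore
    /// for token in tokens.iter() {
    ///     buffer.add(token);
    ///     if let Some(phrase) = buffer.split() {
    ///         // a completed Phrase has been cut off the front of the buffer
    ///     }
    /// }
    /// ```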
97 | pub fn split(&mut self) -> Option { 98 | // Check minimum requirements 99 | if !self.can_split(self.tokens.len() - 1) { 100 | return None; 101 | } 102 | 103 | // Determine split position 104 | let mut split_pos = None; 105 | for x in self.position..self.tokens.len() { 106 | self.position = x + 1; 107 | 108 | // Check enclosed char 109 | if self.tokens[x].pos == POSTag::SYM 110 | && self.enclosed_chars.is_empty() 111 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 112 | { 113 | self.enclosed_chars = vec![self.tokens[x].word.chars().next().unwrap(), ' ']; 114 | self.enclosed_chars_num_phrases = 0; 115 | self.enclosed_chars[1] = match self.enclosed_chars[0] { 116 | '\'' => '\'', 117 | '(' => ')', 118 | '[' => ']', 119 | '{' => '}', 120 | '<' => '>', 121 | '|' => '|', 122 | _ => '"', 123 | }; 124 | split_pos = Some(x); 125 | break; 126 | } else if self.tokens[x].pos == POSTag::SYM 127 | && self.enclosed_chars.len() == 2 128 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 129 | { 130 | split_pos = Some(x); 131 | break; 132 | } 133 | 134 | // Unlock, if needed 135 | if self.is_locked && (self.tokens[x].is_noun() || self.tokens[x].is_verb()) { 136 | self.is_locked = false; 137 | } 138 | 139 | if (!self.nouns.is_empty() && self.nouns[0] >= x) || self.is_locked { 140 | continue; 141 | } else if self.tokens[x].is_sentence_stopper() { 142 | split_pos = Some(x); 143 | break; 144 | } 145 | 146 | // Lock if CC tag 147 | if self.tokens[x].pos == POSTag::CC { 148 | self.is_locked = true; 149 | continue; 150 | } 151 | 152 | // Handle previous comma 153 | if x > 0 && self.tokens[x - 1].word.as_str() == "," { 154 | if ["CS", "CA", "VBG"].contains(&self.tokens[x].pos.to_str().as_str()) { 155 | self.is_locked = true; 156 | continue; 157 | } else { 158 | split_pos = Some(x - 1); 159 | break; 160 | } 161 | } 162 | 163 | // Check for potential phrase splitter 164 | if self.tokens[x].word.as_str() == "," 165 | || !["CC", "DT", "PRP", "RB", "RBS", "RBR"] 166 | .contains(&self.tokens[x].pos.to_str().as_str()) 167 | { 168 | continue; 169 | } 170 | 171 | // If adjective, ensure following token is noun 172 | if self.tokens[x].is_adjective() && (self.check_determiner_offset(x) > 0) { 173 | continue; 174 | } else if x > 0 175 | && (self.tokens[x].is_pronoun() || self.tokens[x].is_determiner()) 176 | && self.tokens[x - 1].is_preposition() 177 | { 178 | self.is_locked = true; 179 | continue; 180 | } else if self.tokens[x].is_adverb() && x > 0 && self.tokens[x - 1].is_verb() { 181 | continue; 182 | } 183 | 184 | split_pos = Some(x); 185 | break; 186 | } 187 | 188 | // Split if needed 189 | if let Some(pos) = split_pos { 190 | let phrase = self.do_split(pos); 191 | return Some(phrase); 192 | } 193 | 194 | None 195 | } 196 | 197 | /// Performs the actual split at the specified position, creating a new Phrase and updating the buffer's token list. 198 | pub fn do_split(&mut self, split_pos: usize) -> Phrase { 199 | // Get phrase 200 | let remaining_tokens = self.tokens.split_off(split_pos); 201 | let phrase = Phrase::new(&0, None); 202 | 203 | // Drain buffer after a split 204 | self.tokens = remaining_tokens; 205 | //self.drain(phrase.tokens.len()); 206 | 207 | phrase 208 | } 209 | 210 | /// Drains the buffer after a split, updating verb, noun, and pronoun indices and resetting position and lock state. 
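    ///
    /// Worked example: with `verbs == [2, 7]`, calling `drain(5)` drops index 2,
    /// rebases index 7 to 2, and resets `position` to the first remaining verb:
    /// ```ignore
    /// buffer.drain(5); // verbs: [2, 7] -> [2]; position == 2; is_locked == false
    /// ```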
211 |     pub fn drain(&mut self, length: usize) {
212 |         self.verbs = self.verbs.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
213 |         self.nouns = self.nouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
214 |         self.pronouns =
215 |             self.pronouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
216 |         self.position = if !self.verbs.is_empty() {
217 |             self.verbs[0]
218 |         } else {
219 |             0
220 |         };
221 |         self.is_locked = false;
222 |     }
223 |
224 |     /// Checks for a determiner followed by a noun (with optional adjective) and returns the offset (2 or 3) or 0 if not found.
225 |     pub fn check_determiner_offset(&self, pos: usize) -> usize {
226 |         if self.tokens.len() < (pos + 1) {
227 |             return 0;
228 |         }
229 |
230 |         if self.tokens[pos + 1].is_noun() {
231 |             return 2;
232 |         } else if self.tokens.len() >= (pos + 2)
233 |             && self.tokens[pos + 1].is_adjective()
234 |             && self.tokens[pos + 2].is_noun()
235 |         {
236 |             return 3;
237 |         }
238 |
239 |         0
240 |     }
241 | }
242 |
243 | impl fmt::Debug for Buffer {
244 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
245 |         let words = self.tokens.iter().map(|token| token.word.to_string()).collect::<Vec<String>>();
246 |         let verbs = self
247 |             .verbs
248 |             .iter()
249 |             .map(|pos| format!("{} {}", self.tokens[*pos].word, pos))
250 |             .collect::<Vec<String>>();
251 |         let nouns = self
252 |             .nouns
253 |             .iter()
254 |             .map(|pos| self.tokens[*pos].word.to_string())
255 |             .collect::<Vec<String>>();
256 |         write!(
257 |             f,
258 |             "[buffer] {} [verbs] {} [nouns] {}",
259 |             words.join(" "),
260 |             verbs.join(", "),
261 |             nouns.join(", ")
262 |         )
263 |     }
264 | }
265 |
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/pos_tag.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use serde::{Deserialize, Serialize};
8 | use std::fmt;
9 |
10 | /// Part-of-speech tags based on the Penn Treebank tagset with custom modifications.
11 | /// For details on added tags (e.g., CA, CS, NZ), modified tags (e.g., EX, CD), and removed punctuation tags (e.g., SS, PUNC, SYM), refer to the crate documentation.
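///
/// # Example
///
/// Round-trips between the string, enum, and `u8` forms (module path follows the
/// crate's own `sophia::` examples):
///
/// ```no_run
/// use sophia::pos_tagger::POSTag;
///
/// let tag = POSTag::from_str("nns");
/// assert_eq!(tag, POSTag::NNS);
/// assert_eq!(tag.to_str(), "NNS");
/// // u8 round-trip, as used internally by the POS tagger:
/// assert_eq!(POSTag::from_u8(tag.to_u8()), POSTag::NNS);
/// ```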
12 | #[derive(Default, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] 13 | pub enum POSTag { 14 | CC, 15 | CS, 16 | CA, 17 | DT, 18 | EX, 19 | #[default] 20 | FW, 21 | IN, 22 | JJ, 23 | JJR, 24 | JJS, 25 | LS, 26 | MD, 27 | MWE, 28 | NN, 29 | NNS, 30 | NNP, 31 | NNPS, 32 | NM, 33 | NZ, 34 | PDT, 35 | PR, 36 | PRP, 37 | PUNC, 38 | RB, 39 | RBR, 40 | RBS, 41 | SS, 42 | SYM, 43 | SYS, 44 | UH, 45 | VB, 46 | VBD, 47 | VBG, 48 | VBN, 49 | VBP, 50 | VBZ, 51 | VF, 52 | VFG, 53 | VH, 54 | VHF, 55 | VHP, 56 | VHZ, 57 | WDT, 58 | WPR, 59 | WPRP, 60 | WRB, 61 | } 62 | 63 | impl POSTag { 64 | /// Convert a string into an instance of the POSTag enum 65 | pub fn from_str(tag: &str) -> Self { 66 | match tag.to_uppercase().as_str() { 67 | "CC" => Self::CC, 68 | "CS" => Self::CS, 69 | "CA" => Self::CA, 70 | "DT" => Self::DT, 71 | "EX" => Self::EX, 72 | "FW" => Self::FW, 73 | "IN" => Self::IN, 74 | "JJ" => Self::JJ, 75 | "JJR" => Self::JJR, 76 | "JJS" => Self::JJS, 77 | "LS" => Self::LS, 78 | "MD" => Self::MD, 79 | "MWE" => Self::MWE, 80 | "NN" => Self::NN, 81 | "NNS" => Self::NNS, 82 | "NNP" => Self::NNP, 83 | "NNPS" => Self::NNPS, 84 | "NM" => Self::NM, 85 | "NZ" => Self::NZ, 86 | "PDT" => Self::PDT, 87 | "PR" => Self::PR, 88 | "PRP" => Self::PRP, 89 | "PUNC" => Self::PUNC, 90 | "RB" => Self::RB, 91 | "RBR" => Self::RBR, 92 | "RBS" => Self::RBS, 93 | "SS" => Self::SS, 94 | "SYM" => Self::SYM, 95 | "SYS" => Self::SYS, 96 | "UH" => Self::UH, 97 | "VB" => Self::VB, 98 | "VBD" => Self::VBD, 99 | "VBG" => Self::VBG, 100 | "VBN" => Self::VBN, 101 | "VBP" => Self::VBP, 102 | "VBZ" => Self::VBZ, 103 | "VF" => Self::VF, 104 | "VFG" => Self::VFG, 105 | "VH" => Self::VH, 106 | "VHF" => Self::VHF, 107 | "VHP" => Self::VHP, 108 | "VHZ" => Self::VHZ, 109 | "WDT" => Self::WDT, 110 | "WPR" => Self::WPR, 111 | "WPRP" => Self::WPRP, 112 | "WRB" => Self::WRB, 113 | _ => Self::FW, 114 | } 115 | } 116 | 117 | /// Convert an instance of POStag into its string counterpart 118 | pub fn to_str(&self) -> String { 119 | match self { 120 | Self::CC => "CC".to_string(), 121 | Self::CS => "CS".to_string(), 122 | Self::CA => "CA".to_string(), 123 | Self::DT => "DT".to_string(), 124 | Self::EX => "EX".to_string(), 125 | Self::FW => "FW".to_string(), 126 | Self::IN => "IN".to_string(), 127 | Self::JJ => "JJ".to_string(), 128 | Self::JJR => "JJR".to_string(), 129 | Self::JJS => "JJS".to_string(), 130 | Self::LS => "LS".to_string(), 131 | Self::MD => "MD".to_string(), 132 | Self::MWE => "MWE".to_string(), 133 | Self::NN => "NN".to_string(), 134 | Self::NNS => "NNS".to_string(), 135 | Self::NNP => "NNP".to_string(), 136 | Self::NNPS => "NNPS".to_string(), 137 | Self::NM => "NM".to_string(), 138 | Self::NZ => "NZ".to_string(), 139 | Self::PDT => "PDT".to_string(), 140 | Self::PR => "PR".to_string(), 141 | Self::PRP => "PRP".to_string(), 142 | Self::PUNC => "PUNC".to_string(), 143 | Self::RB => "RB".to_string(), 144 | Self::RBR => "RBR".to_string(), 145 | Self::RBS => "RBS".to_string(), 146 | Self::SS => "SS".to_string(), 147 | Self::SYM => "SYM".to_string(), 148 | Self::SYS => "SYS".to_string(), 149 | Self::UH => "UH".to_string(), 150 | Self::VB => "VB".to_string(), 151 | Self::VBD => "VBD".to_string(), 152 | Self::VBG => "VBG".to_string(), 153 | Self::VBN => "VBN".to_string(), 154 | Self::VBP => "VBP".to_string(), 155 | Self::VBZ => "VBZ".to_string(), 156 | Self::VF => "VF".to_string(), 157 | Self::VFG => "VFG".to_string(), 158 | Self::VH => "VH".to_string(), 159 | Self::VHF => "VHF".to_string(), 160 | Self::VHP => 
"VHP".to_string(), 161 | Self::VHZ => "VHZ".to_string(), 162 | Self::WDT => "WDT".to_string(), 163 | Self::WPR => "WPR".to_string(), 164 | Self::WPRP => "WPRP".to_string(), 165 | Self::WRB => "WRB".to_string(), 166 | } 167 | } 168 | 169 | /// Convert a u8 into an instance of POSTag enum, mainly used by the POS tagger 170 | pub fn from_u8(value: u8) -> Self { 171 | match value { 172 | 1 => Self::CC, 173 | 2 => Self::CS, 174 | 3 => Self::CA, 175 | 4 => Self::DT, 176 | 5 => Self::EX, 177 | 6 => Self::FW, 178 | 7 => Self::IN, 179 | 8 => Self::JJ, 180 | 9 => Self::JJR, 181 | 10 => Self::JJS, 182 | 11 => Self::LS, 183 | 12 => Self::MD, 184 | 13 => Self::MWE, 185 | 14 => Self::NN, 186 | 15 => Self::NNS, 187 | 16 => Self::NNP, 188 | 17 => Self::NNPS, 189 | 18 => Self::NM, 190 | 19 => Self::NZ, 191 | 20 => Self::PDT, 192 | 21 => Self::PR, 193 | 22 => Self::PRP, 194 | 23 => Self::PUNC, 195 | 24 => Self::RB, 196 | 25 => Self::RBR, 197 | 26 => Self::RBS, 198 | 27 => Self::SS, 199 | 28 => Self::SYM, 200 | 29 => Self::SYS, 201 | 30 => Self::UH, 202 | 31 => Self::VB, 203 | 32 => Self::VBD, 204 | 33 => Self::VBG, 205 | 34 => Self::VBN, 206 | 35 => Self::VBP, 207 | 36 => Self::VBZ, 208 | 37 => Self::VF, 209 | 38 => Self::VFG, 210 | 39 => Self::VH, 211 | 40 => Self::VHF, 212 | 41 => Self::VHP, 213 | 42 => Self::VHZ, 214 | 43 => Self::WDT, 215 | 44 => Self::WPR, 216 | 45 => Self::WPRP, 217 | 46 => Self::WRB, 218 | _ => Self::FW, 219 | } 220 | } 221 | 222 | /// Converts an instance of the POSTag enum into its u8 counterpart, generally only used by the POS tagger 223 | pub fn to_u8(&self) -> u8 { 224 | match self { 225 | Self::CC => 1, 226 | Self::CS => 2, 227 | Self::CA => 3, 228 | Self::DT => 4, 229 | Self::EX => 5, 230 | Self::FW => 6, 231 | Self::IN => 7, 232 | Self::JJ => 8, 233 | Self::JJR => 9, 234 | Self::JJS => 10, 235 | Self::LS => 11, 236 | Self::MD => 12, 237 | Self::MWE => 13, 238 | Self::NN => 14, 239 | Self::NNS => 15, 240 | Self::NNP => 16, 241 | Self::NNPS => 17, 242 | Self::NM => 18, 243 | Self::NZ => 19, 244 | Self::PDT => 20, 245 | Self::PR => 21, 246 | Self::PRP => 22, 247 | Self::PUNC => 23, 248 | Self::RB => 24, 249 | Self::RBR => 25, 250 | Self::RBS => 26, 251 | Self::SS => 27, 252 | Self::SYM => 28, 253 | Self::SYS => 29, 254 | Self::UH => 30, 255 | Self::VB => 31, 256 | Self::VBD => 32, 257 | Self::VBG => 33, 258 | Self::VBN => 34, 259 | Self::VBP => 35, 260 | Self::VBZ => 36, 261 | Self::VF => 37, 262 | Self::VFG => 38, 263 | Self::VH => 39, 264 | Self::VHF => 40, 265 | Self::VHP => 41, 266 | Self::VHZ => 42, 267 | Self::WDT => 43, 268 | Self::WPR => 44, 269 | Self::WPRP => 45, 270 | Self::WRB => 46, 271 | } 272 | } 273 | 274 | /// Convert tag to a shortened version -- used for training 275 | /// of cohorts based model to assist with automated spelling corrections. 
276 |     pub fn to_short_tag(&self) -> Self {
277 |         match *self {
278 |             Self::CC | Self::CS | Self::CA => Self::CC,
279 |             Self::DT | Self::PDT | Self::WDT => Self::DT,
280 |             Self::IN => Self::IN,
281 |             Self::JJ | Self::JJR | Self::JJS => Self::JJ,
282 |             Self::NN | Self::NNS | Self::NNP | Self::NNPS | Self::NM | Self::NZ => Self::NN,
283 |             Self::PR | Self::PRP | Self::WPR | Self::WPRP => Self::PR,
284 |             Self::PUNC | Self::SS => Self::PUNC,
285 |             Self::RB | Self::RBR | Self::RBS | Self::WRB => Self::RB,
286 |             Self::VB | Self::VBD | Self::VBG | Self::VBN | Self::VBP | Self::VBZ | Self::MD => {
287 |                 Self::VB
288 |             }
289 |             _ => Self::FW,
290 |         }
291 |     }
292 |
293 |     /// Check whether the POS tag belongs to a noun
294 |     pub fn is_noun(&self) -> bool {
295 |         self.to_str().starts_with("N") || *self == Self::SYS
296 |     }
297 |
298 |     /// Check whether the POS tag belongs to a verb
299 |     pub fn is_verb(&self) -> bool {
300 |         self.to_str().starts_with("V")
301 |     }
302 |
303 |     /// Check whether the POS tag belongs to a conjunction
304 |     pub fn is_conjunction(&self) -> bool {
305 |         self.to_str().starts_with("C")
306 |     }
307 |     /// Check whether the POS tag belongs to a base verb
308 |     pub fn is_base_verb(&self) -> bool {
309 |         *self == Self::VB || *self == Self::VBG
310 |     }
311 |
312 |     /// Check whether the POS tag belongs to a punctuation mark
313 |     pub fn is_punctuation(&self) -> bool {
314 |         *self == Self::SS || *self == Self::PUNC
315 |     }
316 |
317 |     /// Check whether or not tag belongs to a pronoun
318 |     pub fn is_pronoun(&self) -> bool {
319 |         *self == Self::PR || *self == Self::PRP
320 |     }
321 |
322 |     /// Check whether the POS tag belongs to an adjective
323 |     pub fn is_adjective(&self) -> bool {
324 |         self.to_str().starts_with("J")
325 |     }
326 |
327 |     /// Check whether the POS tag belongs to an adverb
328 |     pub fn is_adverb(&self) -> bool {
329 |         self.to_str().starts_with("R")
330 |     }
331 |
332 |     /// Check whether the POS tag belongs to a named entity
333 |     pub fn is_named_entity(&self) -> bool {
334 |         self.to_str().starts_with("NNP")
335 |     }
336 | }
337 |
338 | impl fmt::Display for POSTag {
339 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
340 |         write!(f, "{}", self.to_str())
341 |     }
342 | }
343 |
344 | impl fmt::Debug for POSTag {
345 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
346 |         write!(f, "{}", self.to_str())
347 |     }
348 | }
349 |
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/hmm.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use super::{POSPrediction, POSPredictionMethod, POSTag, TokenKey};
8 | use crate::tokenizer::Token;
9 | use crate::vocab::{Capitalization, VocabMWE};
10 | use serde::{Deserialize, Serialize};
11 | use std::collections::HashMap;
12 | use std::hash::Hash;
13 |
14 | pub const TOTAL_TAGS: usize = 47;
15 |
16 | #[derive(Default, Serialize, Deserialize)]
17 | #[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
18 | pub struct HMM<S> {
19 |     pub vocab_size: f32,
20 |     pub initial_probs: Vec<f32>,
21 |     pub transmition_probs: Vec<Vec<f32>>,
22 |     pub emission_probs: Vec<HashMap<S, f32>>,
23 |     pub smoothing: f64,
24 | }
25 |
26 | #[derive(Debug)]
27 | struct Probability {
28 |     pub deterministic_tag_idx: usize,
29 |     pub viterbi: Vec<f32>,
30 |     pub backpointer: Vec<usize>,
31 | }
32 |
33 | impl<S> HMM<S>
34 | where
35 |     S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
36 |     Token: TokenKey<S>,
37 | {
38 |     pub fn new() -> Self {
39 |         Self {
40 |             vocab_size: 0.0,
41 |             initial_probs: vec![0.0; TOTAL_TAGS],
42 |             transmition_probs: vec![vec![0.0; TOTAL_TAGS]; TOTAL_TAGS],
43 |             emission_probs: vec![HashMap::new(); TOTAL_TAGS],
44 |             smoothing: 1.0,
45 |         }
46 |     }
47 |
48 |     /// Apply hmm model to vector of tokens
49 |     pub fn apply(&self, tokens: &mut [Token]) {
50 |         let mut start_pos = 0;
51 |         let mut end_pos: usize;
52 |         loop {
53 |             // Get end position
54 |             end_pos = match tokens[start_pos..].iter().position(|token| token.pos == POSTag::SS) {
55 |                 Some(r) => r + start_pos + 1,
56 |                 None => tokens.len().saturating_sub(1)
57 |             };
58 |             if start_pos >= end_pos {
59 |                 break;
60 |             }
61 |
62 |             // Apply viterbi
63 |             self.viterbi_decode(start_pos, end_pos, tokens);
64 |             start_pos = end_pos;
65 |             if start_pos >= tokens.len() - 1 {
66 |                 break;
67 |             }
68 |         }
69 |     }
70 |
71 |     /// Predict tags for a sentence
72 |     fn viterbi_decode(&self, start_pos: usize, end_pos: usize, tokens: &mut [Token]) {
73 |         // Go through tokens
74 |         let mut results: Vec<Probability> = Vec::new();
75 |         for (offset, token) in tokens[start_pos..end_pos].iter().enumerate() {
76 |             let position = offset + start_pos;
77 |
78 |             // Initial token
79 |             if offset == 0 {
80 |                 let tag_indices = if token.potential_pos.len() > 1 {
81 |                     token
82 |                         .potential_pos
83 |                         .iter()
84 |                         .filter(|&tag| *tag != POSTag::FW)
85 |                         .map(|tag| tag.to_u8() as usize)
86 |                         .collect::<Vec<usize>>()
87 |                 } else if token.pos == POSTag::FW {
88 |                     (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
89 |                 } else {
90 |                     vec![token.pos.to_u8() as usize]
91 |                 };
92 |
93 |                 // Instantiate probability
94 |                 let current_tag_idx = if tag_indices.len() == 1 {
95 |                     tag_indices[0]
96 |                 } else {
97 |                     0
98 |                 };
99 |                 let mut probs = Probability::new(current_tag_idx);
100 |
101 |                 for tag_idx in tag_indices {
102 |                     probs.viterbi[tag_idx] =
103 |                         self.initial_probs[tag_idx] + self.get_emission_prob(tag_idx, token);
104 |                 }
105 |
106 |                 results.push(probs);
107 |                 continue;
108 |             }
109 |
110 |             // Forward pass
111 |             let probs = self.calculate_viterbi(position, &results, tokens);
112 |             results.push(probs);
113 |         }
114 |
115 |         // Find best final state
116 |         let last_idx = results.len() - 1;
117 |         let mut best_final_state = 0;
118 |         let mut best_score = results[last_idx].viterbi[0];
119 |
120 |         for tag_idx in 1..TOTAL_TAGS {
121 |             if tag_idx == 6 {
122 |                 continue;
123 |             }
124 |
125 |             if results[last_idx].viterbi[tag_idx] > best_score {
126 |                 best_score = results[last_idx].viterbi[tag_idx];
127 |                 best_final_state = tag_idx;
128 |             }
129 |         }
130 |
131 |         // Backtrack to find best path
132 |         let mut path = vec![0; results.len()];
133 |         path[last_idx] = best_final_state;
134 |         for idx in (0..results.len() - 1).rev() {
135 |             path[idx] = results[idx + 1].backpointer[path[idx + 1]];
136 |         }
137 |
138 |         // Update tokens with new POS tags
139 |         let (mut is_initial, mut in_nnp) = (true, false);
140 |         for (offset, tag_idx) in path.iter().enumerate() {
141 |             let position = offset + start_pos;
142 |
143 |             if tokens[position].potential_pos.len() < 2 && tokens[position].index > 0 {
144 |                 tokens[position].pos_prediction.confidence = 1.0;
145 |                 tokens[position].pos_prediction.tag = tokens[position].pos;
146 |                 tokens[position].pos_prediction.prev_tag = tokens[position].pos;
147 |             } else {
148 |                 let tag = POSTag::from_u8(*tag_idx as u8);
149 |                 let confidence = self.get_confidence_score(position, offset, &results, tokens);
150 |
151 |                 tokens[position].pos_prediction = POSPrediction::new(
152 |                     POSPredictionMethod::hmm,
153 |                     &tokens[position].word,
154 |                     tokens[position].pos,
155 |                     tag,
156 |                     confidence,
157 |                     &HashMap::new(),
158 |                     &[],
159 |                 );
160 |                 tokens[position].pos = tag;
161 |             }
162 |
163 |             // Check for named entity
164 |             let tag = tokens[position].pos;
165 |             if tag == POSTag::NN
166 |                 && VocabMWE::classify_capitalization(&tokens[position].word)
167 |                     != Capitalization::lower
168 |                 && !is_initial
169 |             {
170 |                 //tokens[position].pos = POSTag::NNP;
171 |                 in_nnp = true;
172 |             } else if tag == POSTag::NN && in_nnp {
173 |                 //tokens[position].pos = POSTag::NNP;
174 |             } else {
175 |                 in_nnp = false;
176 |             }
177 |             is_initial = tokens[position].pos == POSTag::SS;
178 |         }
179 |     }
180 |
181 |     // Calculate the viterbi for a single token.
182 |     fn calculate_viterbi(
183 |         &self,
184 |         position: usize,
185 |         results: &[Probability],
186 |         tokens: &[Token],
187 |     ) -> Probability {
188 |         // Instantiate probability
189 |         let token = &tokens[position];
190 |         let deterministic_tag_idx = if token.potential_pos.len() > 1 || token.pos == POSTag::FW {
191 |             0
192 |         } else {
193 |             token.pos.to_u8() as usize
194 |         };
195 |         let mut probs = Probability::new(deterministic_tag_idx);
196 |
197 |         // Initialize
198 |         let prev_probs = results.last().unwrap();
199 |
200 |         // Get tag indices
201 |         let tag_indices = if token.potential_pos.len() > 1 {
202 |             token
203 |                 .potential_pos
204 |                 .iter()
205 |                 .filter(|&tag| *tag != POSTag::FW)
206 |                 .map(|tag| tag.to_u8() as usize)
207 |                 .collect::<Vec<usize>>()
208 |         } else if token.pos == POSTag::FW {
209 |             (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
210 |         } else {
211 |             vec![token.pos.to_u8() as usize]
212 |         };
213 |
214 |         // Get previous tag indices
215 |         let prev_tag_indices = if tokens[position - 1].potential_pos.len() > 1 {
216 |             tokens[position - 1]
217 |                 .potential_pos
218 |                 .iter()
219 |                 .filter(|&tag| *tag != POSTag::FW)
220 |                 .map(|tag| tag.to_u8() as usize)
221 |                 .collect::<Vec<usize>>()
222 |         } else if tokens[position - 1].pos == POSTag::FW || prev_probs.deterministic_tag_idx == 6 {
223 |             (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
224 |         } else {
225 |             vec![prev_probs.deterministic_tag_idx]
226 |         };
227 |
228 |         // Calculate scores
229 |         for tag_idx in tag_indices.iter() {
230 |             let emission_prob = self.get_emission_prob(*tag_idx, token);
231 |
232 |             for prev_tag_idx in prev_tag_indices.iter() {
233 |                 let score = prev_probs.viterbi[*prev_tag_idx]
234 |                     + self.transmition_probs[*prev_tag_idx][*tag_idx]
235 |                     + emission_prob;
236 |                 if score > probs.viterbi[*tag_idx] {
237 |                     probs.viterbi[*tag_idx] = score;
238 |                     probs.backpointer[*tag_idx] = *prev_tag_idx;
239 |                 }
240 |             }
241 |         }
242 |
243 |         probs
244 |     }
245 |
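    // Note on the smoothing in `get_emission_prob` below (illustrative numbers):
    // with `smoothing = 1.0`, a tag whose emission table holds 500 entries, and
    // `vocab_size = 50_000`, an unseen word scores
    // ln(1.0 / (500.0 + 50_000.0 * 1.0)) = ln(1.0 / 50_500.0) ~= -10.8 -- a large
    // but finite penalty, so unseen words never zero out a Viterbi path.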
246 |     /// Get emission probabilities
247 |     fn get_emission_prob(&self, tag_idx: usize, token: &Token) -> f32 {
248 |         match self.emission_probs[tag_idx].get(&token.get_key()) {
249 |             Some(&prob) => prob,
250 |             None => {
251 |                 let tag_vocab_size = self.emission_probs[tag_idx].len() as f32;
252 |                 (self.smoothing as f32 / (tag_vocab_size + self.vocab_size * self.smoothing as f32))
253 |                     .ln()
254 |             }
255 |         }
256 |     }
257 |
258 |     /// Returns a value between 0.0 and 1.0, where 1.0 means completely certain
259 |     fn get_confidence_score(
260 |         &self,
261 |         position: usize,
262 |         offset: usize,
263 |         results: &[Probability],
264 |         tokens: &[Token],
265 |     ) -> f32 {
266 |         let token = &tokens[position];
267 |         let prob_result = &results[offset];
268 |
269 |         let tag_indices: Vec<usize> = token
270 |             .potential_pos
271 |             .iter()
272 |             .filter(|&tag| *tag != POSTag::FW)
273 |             .map(|tag| tag.to_u8() as usize)
274 |             .collect();
275 |
276 |         // Get scores for all possible tags at this position
277 |         let mut scores: Vec<f32> = tag_indices
278 |             .iter()
279 |             .map(|&idx| prob_result.viterbi[idx])
280 |             .filter(|&score| score != f32::NEG_INFINITY)
281 |             .collect();
282 |
283 |         if scores.len() <= 1 {
284 |             return 1.0;
285 |         }
286 |
287 |         // Sort scores in descending order
288 |         scores.sort_by(|a, b| b.partial_cmp(a).unwrap());
289 |
290 |         // Convert log probabilities to actual probabilities
291 |         let max_score = scores[0];
292 |         let prob_scores: Vec<f32> = scores.iter().map(|&score| (score - max_score).exp()).collect();
293 |
294 |         let total_prob: f32 = prob_scores.iter().sum();
295 |         let normalized_probs: Vec<f32> =
296 |             prob_scores.iter().map(|&prob| prob / total_prob).collect();
297 |
298 |         // Confidence is the probability of the best choice
299 |         normalized_probs[0]
300 |     }
301 | }
302 |
303 | impl Probability {
304 |     pub fn new(deterministic_tag_idx: usize) -> Self {
305 |         Self {
306 |             deterministic_tag_idx,
307 |             viterbi: vec![f32::NEG_INFINITY; TOTAL_TAGS],
308 |             backpointer: vec![0; TOTAL_TAGS],
309 |         }
310 |     }
311 | }
312 |
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/token.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use crate::pos_tagger::{POSPrediction, POSTag};
8 | use crate::vocab::{
9 |     f8::f8,
10 |     {Pronoun, VocabDatabase},
11 | };
12 | use serde::{Deserialize, Serialize};
13 | use std::collections::HashMap;
14 | use std::fmt;
15 | use std::ops::Range;
16 |
17 | /// Represents a token with linguistic properties, including word, part-of-speech, categories, pronoun details, and scoring information.
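///
/// # Example
///
/// A minimal sketch using the one constructor that needs no `VocabDatabase`:
///
/// ```no_run
/// use sophia::tokenizer::Token;
///
/// // Words missing from the vocabulary become "unknown" tokens with default
/// // properties; the POS tag defaults to FW (unclassified).
/// let token = Token::unknown("zorblax");
/// assert_eq!(token.word, "zorblax");
/// assert!(!token.is_noun() && !token.is_verb());
/// ```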
18 | #[derive(Default, Clone, Debug, Serialize, Deserialize)]
19 | pub struct Token {
20 |     pub word: String,
21 |     #[serde(skip)]
22 |     pub index: i32,
23 |     pub stem: i32,
24 |     pub potential_stem: Vec<i32>,
25 |     pub is_name: bool,
26 |     #[serde(skip)]
27 |     pub token_type: TokenType,
28 |     #[serde(skip)]
29 |     pub is_possessive: bool,
30 |     #[serde(skip)]
31 |     pub is_negative: bool,
32 |     pub pos: POSTag,
33 |     #[serde(skip)]
34 |     pub pos_prediction: POSPrediction,
35 |     #[serde(skip)]
36 |     pub potential_pos: Vec<POSTag>,
37 |     pub categories: Vec<i16>,
38 |     pub ner: Vec<i16>,
39 |     pub synonyms: Vec<i32>,
40 |     pub hypernyms: Vec<i32>,
41 |     pub hyponyms: Vec<i32>,
42 |     pub classification_scores: HashMap<String, f8>,
43 |     pub pronoun: Option<Pronoun>,
44 |     #[serde(skip)]
45 |     pub antecedent: Option<String>,
46 |     #[serde(skip)]
47 |     pub inner_word: String,
48 |     #[serde(skip)]
49 |     pub inner_value: String,
50 |     #[serde(skip)]
51 |     pub inner_unit: String,
52 | }
53 |
54 | /// Defines the type of a token, which can be a word, prefix, or suffix.
55 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Clone, Debug)]
56 | pub enum TokenType {
57 |     #[default]
58 |     word,
59 |     prefix,
60 |     suffix,
61 | }
62 |
63 | impl Token {
64 |     /// Creates a new Token from a word using the vocabulary database, initializing its properties.
65 |     pub fn new(query_word: &str, vocab: &VocabDatabase) -> Token {
66 |         if query_word.is_empty() {
67 |             return Self::default();
68 |         }
69 |
70 |         // Get word lookup table
71 |         let (word, lookup) = match vocab.lookup_word(query_word) {
72 |             Some(r) => r,
73 |             None => return Self::unknown(query_word),
74 |         };
75 |         let (_, token_id) = lookup.iter().next().unwrap();
76 |
77 |         // Get token by id
78 |         let mut token = Self::from_id(*token_id, vocab);
79 |         token.word = word;
80 |         token.token_type = TokenType::word;
81 |         token.potential_pos = lookup.keys().copied().collect();
82 |
83 |         token
84 |     }
85 |
86 |     /// Creates a prefix Token from a word using the vocabulary database.
87 |     pub fn prefix(word: &str, vocab: &VocabDatabase) -> Token {
88 |         let mut token = Self::new(word, vocab);
89 |         token.token_type = TokenType::prefix;
90 |         token
91 |     }
92 |
93 |     /// Creates a suffix Token from a word using the vocabulary database.
94 |     pub fn suffix(word: &str, vocab: &VocabDatabase) -> Token {
95 |         let mut token = Self::new(word, vocab);
96 |         token.token_type = TokenType::suffix;
97 |         token
98 |     }
99 |
100 |     /// Creates a numeric Token with the specified word, setting inner word and value.
101 |     pub fn numeric(word: &str, vocab: &VocabDatabase) -> Token {
102 |         let mut token = Self::new("|num|", vocab);
103 |         token.inner_word = word.to_string();
104 |         token.inner_value = word.to_string();
105 |         token
106 |     }
107 |
108 |     /// Creates a special Token for system tags, with specified word, tag, value, and unit.
109 |     pub fn special(word: &str, tag: &str, value: &str, unit: &str, vocab: &VocabDatabase) -> Token {
110 |         let mut token = Self::new(tag, vocab);
111 |         token.inner_word = word.to_string();
112 |         token.inner_value = value.to_string();
113 |         token.inner_unit = unit.to_string();
114 |         token
115 |     }
116 |
117 |     /// Creates an unknown Token with the specified word and default properties.
118 |     pub fn unknown(word: &str) -> Token {
119 |         Self {
120 |             word: word.to_string(),
121 |             ..Default::default()
122 |         }
123 |     }
124 |
125 |     /// Creates a Token from a token ID using the vocabulary database, setting its index.
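    ///
    /// Sketch (assumes a loaded `VocabDatabase`; id 42 is hypothetical):
    /// ```ignore
    /// let token = Token::from_id(42, &vocab);
    /// assert_eq!(token.index, 42); // the id is recorded even if it is unknown
    /// ```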
126 |     pub fn from_id(token_id: i32, vocab: &VocabDatabase) -> Token {
127 |         let mut token = match vocab.words.id2token.get(&token_id) {
128 |             Some(r) => r.clone(),
129 |             None => Self::default(),
130 |         };
131 |         token.index = token_id;
132 |
133 |         token
134 |     }
135 |
136 |     /// Updates the POS tag of the Token, returning a new Token if the tag is valid in the vocabulary.
137 |     pub fn update_pos(&self, pos_code: POSTag, vocab: &VocabDatabase) -> Option<Token> {
138 |         // Get map
139 |         let index_map = vocab.words.wordlist.get(&self.word)?;
140 |
141 |         // Get token id
142 |         let index = index_map.get(&pos_code)?;
143 |
144 |         // Return token
145 |         let token = Self::from_id(*index, vocab);
146 |         Some(token)
147 |     }
148 |
149 |     /// Forces the Token to a verb POS tag if possible, returning a new Token or None if no verb tag is available.
150 |     pub fn force_verb(&self, vocab: &VocabDatabase) -> Option<Token> {
151 |         if self.is_verb() {
152 |             return None;
153 |         }
154 |
155 |         for code in self.potential_pos.iter() {
156 |             if !code.to_str().starts_with("V") {
157 |                 continue;
158 |             }
159 |             return self.update_pos(*code, vocab);
160 |         }
161 |
162 |         None
163 |     }
164 |
165 |     /// Checks if the Token has a category within the specified range.
166 |     pub fn has_category(&self, category_range: &Range<i16>) -> bool {
167 |         self.categories.iter().any(|&x| category_range.contains(&x))
168 |     }
169 |
170 |     /// Checks if the Token has a named entity recognition (NER) category within the specified range.
171 |     pub fn has_ner(&self, category_range: &Range<i16>) -> bool {
172 |         self.ner.iter().any(|&x| category_range.contains(&x))
173 |     }
174 |
175 |     /// Checks if the Token is a noun (starts with 'N' or is SYS).
176 |     pub fn is_noun(&self) -> bool {
177 |         self.pos.to_str().starts_with("N") || self.pos == POSTag::SYS
178 |     }
179 |
180 |     /// Checks if the Token is a verb (starts with 'V').
181 |     pub fn is_verb(&self) -> bool {
182 |         self.pos.to_str().starts_with("V")
183 |     }
184 |
185 |     /// Checks if the Token is a base verb (VB or VBG).
186 |     pub fn is_base_verb(&self) -> bool {
187 |         ["VB", "VBG"].contains(&self.pos.to_str().as_str())
188 |     }
189 |
190 |     /// Checks if the Token is a past verb (VBD, VBN, or VHP).
191 |     pub fn is_past_verb(&self) -> bool {
192 |         ["VBD", "VBN", "VHP"].contains(&self.pos.to_str().as_str())
193 |     }
194 |
195 |     /// Checks if the Token is a present verb (VB, VBG, VBZ, VH, or VHZ).
196 |     pub fn is_present_verb(&self) -> bool {
197 |         ["VB", "VBG", "VBZ", "VH", "VHZ"].contains(&self.pos.to_str().as_str())
198 |     }
199 |
200 |     /// Checks if the Token is a future verb (VF, VFG, or VHF).
201 |     pub fn is_future_verb(&self) -> bool {
202 |         ["VF", "VFG", "VHF"].contains(&self.pos.to_str().as_str())
203 |     }
204 |
205 |     /// Checks if the Token is an adjective (starts with 'JJ').
206 |     pub fn is_adjective(&self) -> bool {
207 |         self.pos.to_str().starts_with("JJ")
208 |     }
209 |
210 |     /// Checks if the Token is an adverb (starts with 'RB').
211 |     pub fn is_adverb(&self) -> bool {
212 |         self.pos.to_str().starts_with("RB")
213 |     }
214 |
215 |     /// Checks if the Token is a named entity (starts with 'NNP').
216 |     pub fn is_named_entity(&self) -> bool {
217 |         self.pos.to_str().starts_with("NNP")
218 |     }
219 |
220 |     /// Checks if the Token is an n-gram (MWE).
221 |     pub fn is_ngram(&self) -> bool {
222 |         self.pos == POSTag::MWE
223 |     }
224 |
225 |     /// Checks if the Token is a conjunction (starts with 'C').
226 |     pub fn is_conjunction(&self) -> bool {
227 |         self.pos.to_str().starts_with("C")
228 |     }
229 |
230 |     /// Checks if the Token is a determiner (DT).
231 |     pub fn is_determiner(&self) -> bool {
232 |         self.pos == POSTag::DT
233 |     }
234 |
235 |     /// Checks if the Token is a pronoun (PR or PRP).
236 |     pub fn is_pronoun(&self) -> bool {
237 |         self.pos == POSTag::PR || self.pos == POSTag::PRP
238 |     }
239 |
240 |     /// Checks if the Token is a modal verb (MD).
241 |     pub fn is_modal_verb(&self) -> bool {
242 |         self.pos == POSTag::MD
243 |     }
244 |
245 |     /// Checks if the Token is a preposition (IN).
246 |     pub fn is_preposition(&self) -> bool {
247 |         self.pos == POSTag::IN
248 |     }
249 |
250 |     /// Checks if the Token is a sentence stopper (SS).
251 |     pub fn is_sentence_stopper(&self) -> bool {
252 |         self.pos == POSTag::SS
253 |     }
254 |
255 |     /// Check if the token is a punctuation mark
256 |     pub fn is_punctuation(&self) -> bool {
257 |         self.pos == POSTag::SS || self.pos == POSTag::PUNC
258 |     }
259 |
260 |     /// Checks if the Token is a potential phrase splitter (PUNC).
261 |     pub fn is_phrase_splitter(&self) -> bool {
262 |         self.pos == POSTag::PUNC
263 |     }
264 |
265 |     /// Retrieves the category vectors for the Token from the vocabulary database.
266 |     pub fn get_category_vec(&self, vocab: &VocabDatabase) -> Vec<Vec<i16>> {
267 |         let mut res: Vec<Vec<i16>> = Vec::new();
268 |         for category_id in self.categories.iter() {
269 |             let cat = match vocab.categories.get(category_id) {
270 |                 Some(r) => r,
271 |                 None => continue,
272 |             };
273 |             res.push(cat.fqn.clone());
274 |         }
275 |
276 |         res
277 |     }
278 |
279 |     /// Calculates the semantic distance between two Tokens based on their category vectors.
280 |     pub fn get_distance(&self, token2: &Token, vocab: &VocabDatabase) -> f32 {
281 |         // Get category vectors
282 |         let token1_categories = self.get_category_vec(vocab);
283 |         let token2_categories = token2.get_category_vec(vocab);
284 |
285 |         // Initialize
286 |         let mut total_score = 0.0;
287 |         let mut comparisons = 0;
288 |
289 |         // Go through categories, calculate distance / score
290 |         for cat1 in token1_categories.iter() {
291 |             for cat2 in token2_categories.iter() {
292 |                 let depth = self.get_common_category_depth(cat1, cat2);
293 |                 //let depth = 1;
294 |                 total_score += depth as f32;
295 |                 comparisons += 1;
296 |             }
297 |         }
298 |
299 |         if comparisons > 0 {
300 |             total_score / comparisons as f32
301 |         } else {
302 |             0.0
303 |         }
304 |     }
305 |
306 |     /// Calculates the common depth between two category paths.
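    ///
    /// For example, paths `[3, 7, 2, 9]` and `[3, 7, 5]` share a depth of 2; the
    /// walk stops at the first mismatch:
    /// ```ignore
    /// assert_eq!(token.get_common_category_depth(&[3, 7, 2, 9], &[3, 7, 5]), 2);
    /// ```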
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(antecedent) = &self.antecedent {
            write!(
                f,
                "{} ({}), antecedent: {}",
                self.word, self.pos, antecedent
            )
        } else if self.pos == POSTag::SYS && !self.inner_word.is_empty() {
            write!(
                f,
                "{} ({}), inner word: {}, value: {}, unit: {}",
                self.word, self.pos, self.inner_word, self.inner_value, self.inner_unit
            )
        } else {
            write!(f, "{} ({})", self.word, self.pos)
        }
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenType::word => write!(f, "word"),
            TokenType::prefix => write!(f, "prefix"),
            TokenType::suffix => write!(f, "suffix"),
        }
    }
}
--------------------------------------------------------------------------------
/src/sophia/src/vocab/spell_check.rs:
--------------------------------------------------------------------------------
// Copyright 2025 Aquila Labs of Alberta, Canada
// Licensed under the PolyForm Noncommercial License 1.0.0
// Commercial use requires a separate license: https://cicero.sh/sophia/
// License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
// Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.

use super::VocabDatabase;
use crate::pos_tagger::{POSPrefix, POSSuffix, POSTag};
use crate::tokenizer::Token;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;

const MAX_FREQUENCY: usize = 3;
const FREQUENCY_WEIGHT: f32 = 0.40;
const DISTANCE_WEIGHT: f32 = 0.85;
const TAG_BEFORE_WEIGHT: f32 = 0.55;
const WORD_BEFORE_WEIGHT: f32 = 0.65;
const SUFFIX_BONUS: f32 = 0.75;
const PREFIX_BONUS: f32 = 0.75;
const DOUBLE_LETTER_BONUS: f32 = 0.35;

/// Represents the automated spell checker, namely the various cohorts,
/// keyed by POS and word length, that are used to minimize the search space of possible corrections.
#[derive(Default, Clone, Serialize, Deserialize)]
pub struct SpellChecker {
    pub cohorts: HashMap<SpellCheckerCohort, Vec<SpellCheckerEntry>>,
}
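// Illustrative cohort lookup: a 6-letter word whose POS prediction leans
// noun is searched under the (noun, short_medium) and (noun, medium)
// cohorts only, rather than against the whole vocabulary.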
/// Individual entry for a candidate; stores the
/// preceding tag / word frequencies for weighted scoring.
#[derive(Default, Clone, Serialize, Deserialize)]
pub struct SpellCheckerEntry {
    pub word_index: i32,
    pub tag_before: Vec<POSTag>,
    pub word_before: Vec<i32>,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct SpellCheckerCohort {
    pub pos: SpellCheckerCohortPOS,
    pub length: SpellCheckerCohortSize,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum SpellCheckerCohortPOS {
    noun,
    verb,
    adverb,
    adjective,
    entity,
    other,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum SpellCheckerCohortSize {
    short,        // <= 4 chars
    short_medium, // 5 or 6 chars
    medium,       // 7 or 8 chars
    medium_long,  // 9 or 10 chars
    long,         // 11+ chars
}

/// Candidate spelling correction, used to rank and
/// score possible corrections.
#[derive(Default, Clone)]
struct Candidate {
    pub token: Token,
    pub item: SpellCheckerEntry,
    pub score: f32,
    pub frequency: usize,
    pub distance: usize,
    pub tag_before: usize,
    pub word_before: usize,
    pub same_suffix: bool,
    pub same_prefix: bool,
    pub has_double_letter: bool,
}

impl SpellChecker {
    /// Check word for corrected spelling
    pub fn try_correct(
        &self,
        position: usize,
        tokens: &[Token],
        vocab: &VocabDatabase,
    ) -> Option<Token> {
        // Get candidates
        let mut candidates = self.get_candidates(&tokens[position], vocab);

        // Look for spelling correction
        for queue in &mut candidates {
            if queue.is_empty() {
                continue;
            }

            // Score candidates
            self.score_candidates(queue, position, tokens);

            // Sort candidates and return the best match
            queue.sort_unstable_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
            return Some(queue[0].token.clone());
        }

        None
    }

    /// Get cohorts based on POS tag and word length
    fn get_cohorts(&self, token: &Token) -> Vec<SpellCheckerCohort> {
        // Get tags with a minimum probability of 0.2
        let tags = token
            .pos_prediction
            .probabilities
            .iter()
            .filter(|(_, score)| **score >= 0.2)
            .map(|(tag, _)| *tag)
            .collect::<Vec<POSTag>>();

        // Go through tags
        let cohorts: Vec<SpellCheckerCohort> = tags
            .iter()
            .flat_map(|tag| {
                let pos = SpellCheckerCohortPOS::from(*tag);
                let sizes = SpellCheckerCohortSize::get_sizes(token.word.len());

                sizes
                    .iter()
                    .map(|length| SpellCheckerCohort {
                        pos: pos.clone(),
                        length: length.clone(),
                    })
                    .collect::<Vec<SpellCheckerCohort>>()
            })
            .collect();

        cohorts
    }

    /// Get initial candidates, bucketed by Levenshtein distance
    fn get_candidates(&self, token: &Token, vocab: &VocabDatabase) -> Vec<Vec<Candidate>> {
        // Get cohorts
        let cohorts = self.get_cohorts(token);
        let mut candidates: Vec<Vec<Candidate>> = vec![vec![]; 4];
        let word = token.word.to_lowercase();

        // Go through cohorts
        for cohort in cohorts.iter() {
            let search = match self.cohorts.get(cohort) {
                Some(r) => r,
                None => continue,
            };

            // Initialize variables
            let mut frequency = MAX_FREQUENCY;
            let freq_interval = search.len() / 3;

            // Gather candidates
            for (x, item) in search.iter().enumerate() {
                if freq_interval > 0 && x > 0 && x % freq_interval == 0 {
                    frequency -= 1;
                }
                let s_token = vocab.from_int(item.word_index);

                // Get distance
                let lev_distance = self.levenshtein(&word, &s_token.word);
                let distance = candidates.len().saturating_sub(lev_distance);
                if distance > 0 && lev_distance > 0 {
                    candidates[lev_distance - 1]
                        .push(Candidate::new(frequency, distance, &s_token, item));
                }
            }
        }

        candidates
    }
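    // Worked example (hypothetical input): for the misspelling "freind"
    // (6 letters, predicted as a noun), get_cohorts yields the noun
    // cohorts for short_medium and medium lengths. "friend" sits at
    // Levenshtein distance 2, so it lands in bucket candidates[1];
    // try_correct only scores that bucket if the distance-1 bucket is
    // empty.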
    /// Score candidates
    fn score_candidates(&self, candidates: &mut [Candidate], position: usize, tokens: &[Token]) {
        // Iterate through candidates
        for cand in candidates.iter_mut() {
            // Get preceding tag and word score
            if position > 0 {
                if let Some(idx) =
                    cand.item.tag_before.iter().position(|&tag| tag == tokens[position - 1].pos)
                {
                    cand.tag_before = self.get_frequency_idx(idx, cand.item.tag_before.len());
                }

                if let Some(idx) = cand
                    .item
                    .word_before
                    .iter()
                    .position(|&w_idx| w_idx == tokens[position - 1].index)
                {
                    cand.word_before = self.get_frequency_idx(idx, cand.item.word_before.len());
                }
            }

            // Check suffix
            if let Ok(suffix) = POSSuffix::try_from(&tokens[position])
                && let Ok(chk_suffix) = POSSuffix::try_from(&cand.token)
            {
                cand.same_suffix = suffix == chk_suffix;
            }

            // Check prefix
            if let Ok(prefix) = POSPrefix::try_from(&tokens[position])
                && let Ok(chk_prefix) = POSPrefix::try_from(&cand.token)
            {
                cand.same_prefix = prefix == chk_prefix;
            }

            // Check double letter
            cand.has_double_letter =
                self.check_double_letter(&tokens[position].word.to_lowercase(), &cand.token.word);

            // Score candidate
            cand.score = cand.calculate_score();
        }
    }

    /// Check whether the word has a doubled letter that the candidate lacks (a double-letter typo)
    fn check_double_letter(&self, word: &str, candidate_word: &str) -> bool {
        let letters: Vec<char> = word.chars().collect();
        for (x, char) in letters[1..].iter().enumerate() {
            if *char == letters[x] {
                // Check if candidate has the same double letter
                let chk = format!("{}{}", char, char);
                if !candidate_word.contains(&chk) {
                    return true;
                }
            }
        }

        false
    }

    /// Get frequency-based score
    fn get_frequency_idx(&self, idx: usize, total: usize) -> usize {
        let interval = total / MAX_FREQUENCY;
        if idx == 0 || interval == 0 {
            MAX_FREQUENCY
        } else {
            MAX_FREQUENCY.saturating_sub(idx / interval)
        }
    }

    /// Calculate Levenshtein distance (measured in chars, not bytes)
    fn levenshtein(&self, s1: &str, s2: &str) -> usize {
        let len1 = s1.chars().count();
        let len2 = s2.chars().count();

        if len1 == 0 {
            return len2;
        }
        if len2 == 0 {
            return len1;
        }

        let mut prev_row = vec![0; len2 + 1];
        let mut curr_row = vec![0; len2 + 1];

        for j in 0..=len2 {
            prev_row[j] = j;
        }

        for (i, c1) in s1.chars().enumerate() {
            curr_row[0] = i + 1;

            for (j, c2) in s2.chars().enumerate() {
                let cost = if c1 == c2 { 0 } else { 1 };
                curr_row[j + 1] = std::cmp::min(
                    std::cmp::min(prev_row[j + 1] + 1, curr_row[j] + 1),
                    prev_row[j] + cost,
                );
            }

            std::mem::swap(&mut prev_row, &mut curr_row);
        }

        prev_row[len2]
    }
}
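// Worked scoring example (illustrative values): a candidate with
// frequency 3, distance 2, tag_before 1, word_before 0, and a matching
// suffix scores 3 * 0.40 + 2 * 0.85 + 1 * 0.55 + 0.75 = 4.20 under
// calculate_score below.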
impl Candidate {
    pub fn new(frequency: usize, distance: usize, token: &Token, item: &SpellCheckerEntry) -> Self {
        Self {
            token: token.clone(),
            item: item.clone(),
            frequency,
            distance,
            ..Default::default()
        }
    }

    /// Score the candidate
    pub fn calculate_score(&mut self) -> f32 {
        let mut score =
            (self.frequency as f32 * FREQUENCY_WEIGHT) + (self.distance as f32 * DISTANCE_WEIGHT);
        score += (self.tag_before as f32 * TAG_BEFORE_WEIGHT)
            + (self.word_before as f32 * WORD_BEFORE_WEIGHT);

        if self.same_suffix {
            score += SUFFIX_BONUS;
        }
        if self.same_prefix {
            score += PREFIX_BONUS;
        }
        if self.has_double_letter {
            score += DOUBLE_LETTER_BONUS;
        }

        score
    }
}

impl fmt::Debug for Candidate {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "word {} pos {} score {:.2} frequency {} distance {} tag_before {} word_before {} suffix {} prefix {}",
            self.token.word,
            self.token.pos,
            self.score,
            self.frequency,
            self.distance,
            self.tag_before,
            self.word_before,
            self.same_suffix,
            self.same_prefix
        )
    }
}

impl From<POSTag> for SpellCheckerCohortPOS {
    fn from(tag: POSTag) -> Self {
        match tag {
            t if t.is_named_entity() => Self::entity,
            t if t.is_noun() => Self::noun,
            t if t.is_verb() => Self::verb,
            t if t.is_adverb() => Self::adverb,
            t if t.is_adjective() => Self::adjective,
            _ => Self::other,
        }
    }
}

impl From<usize> for SpellCheckerCohortSize {
    fn from(length: usize) -> Self {
        match length {
            len if len <= 4 => Self::short,
            len if len <= 6 => Self::short_medium,
            len if len <= 8 => Self::medium,
            len if len <= 10 => Self::medium_long,
            _ => Self::long,
        }
    }
}

impl SpellCheckerCohortSize {
    pub fn get_sizes(length: usize) -> Vec<Self> {
        match length {
            len if len <= 3 => vec![Self::short],
            len if len <= 5 => vec![Self::short, Self::short_medium],
            len if len <= 7 => vec![Self::short_medium, Self::medium],
            len if len <= 11 => vec![Self::medium, Self::medium_long, Self::long],
            _ => vec![Self::medium_long, Self::long],
        }
    }
}
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/model.rs:
--------------------------------------------------------------------------------
// Copyright 2025 Aquila Labs of Alberta, Canada
// Licensed under the PolyForm Noncommercial License 1.0.0
// Commercial use requires a separate license: https://cicero.sh/sophia/
// License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
// Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.

use super::{
    POSContext, POSFeature, POSPrediction, POSPredictionMethod, POSPrefix, POSSuffix, POSTag,
    SIBLING_TAGS_AFTER, SIBLING_TAGS_BEFORE, TokenKey,
};
use crate::tokenizer::Token;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::Hash;

pub trait POSModelInterface {
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction>;
}

#[derive(Default, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSModel<S> {
    pub word: String,
    pub target_tags: Vec<POSTag>,
    pub tag_freq: HashMap<POSTag, f32>,
    pub features: HashMap<POSFeature<S>, POSWeight>,
    pub conjunctions: HashMap<POSFeature<S>, Vec<POSConjunction<S>>>,
}

#[derive(Default, Debug, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSConjunction<S> {
    pub weight: POSWeight,
    pub deterministic_tag: Option<POSTag>,
    pub siblings: Vec<POSFeature<S>>,
    pub exceptions: Vec<(POSFeature<S>, Option<POSTag>)>,
}

#[derive(Default, Debug, Clone, Serialize, Deserialize)]
pub struct POSWeight {
    pub tags: HashMap<POSTag, f32>,
    pub weight: f32,
    pub mi_score: f32,
}

struct POSPositionTracker<S> {
    primary: Vec<Option<POSScore<S>>>,
    secondary: Vec<HashMap<POSTag, f32>>,
}

#[derive(Clone)]
struct POSScore<S> {
    feature: POSFeature<S>,
    tags: HashMap<POSTag, f32>,
}

#[derive(Default, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSTagModel<S> {
    pub target_tags: Vec<POSTag>,
    pub global: POSModel<S>,
    pub words: HashMap<S, POSModel<S>>,
}

#[derive(Default, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSTagModelRepo<S> {
    pub tags: HashMap<S, Vec<POSTag>>,
    pub models: HashMap<S, POSTagModel<S>>,
}

impl<S> POSModelInterface for POSModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Resolve an ambiguous word
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Get context
        let context = POSContext::from_tokens(position, tokens);
        let context_vec: HashSet<POSFeature<S>> = context.iter_ft().collect();

        // Check conjunctions
        if let Some(pred) = self.check_conjunctions(position, &context, &context_vec, tokens) {
            return Some(pred);
        }

        // Check confidence score

        // Fallback to individual features
        let mut tracker = POSPositionTracker::new();
        for feature in context.iter_ft() {
            if let Some(weight) = self.features.get(&feature) {
                tracker.add_feature(&feature, weight);
            }
        }

        // Combine scores
        let scores = tracker.combine(&self.tag_freq);
        if scores.is_empty() {
            return None;
        }

        // Get highest score
        let mut scores_vec = scores.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        Some(POSPrediction::new(
            POSPredictionMethod::standard,
            &tokens[position].word,
            tokens[position].pos,
            *scores_vec[0].0,
            *scores_vec[0].1,
            &scores,
            &[],
        ))
    }
}
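// Prediction order, summarizing the flow above: conjunction rules fire
// first and may short-circuit with an exception or deterministic tag;
// otherwise each matching context feature contributes weighted tag scores
// through POSPositionTracker, and the highest combined score wins.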
impl<S> POSModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Check conjunctions
    fn check_conjunctions(
        &self,
        position: usize,
        context: &POSContext<S>,
        context_vec: &HashSet<POSFeature<S>>,
        tokens: &[Token],
    ) -> Option<POSPrediction> {
        let mut scores: HashMap<POSTag, f32> = HashMap::new();

        // Go through context
        for feature in context_vec.iter() {
            let conjunction_set = match self.conjunctions.get(feature) {
                Some(r) => r,
                None => continue,
            };

            // Find strongest matching conjunction, if any
            for conjunction in conjunction_set.iter() {
                // Check exceptions
                if let Some(tag) = self.check_exceptions(conjunction, context_vec) {
                    if tag.is_none() {
                        continue;
                    }
                    return Some(POSPrediction::new(
                        POSPredictionMethod::exception,
                        &tokens[position].word,
                        tokens[position].pos,
                        tag.unwrap(),
                        1.0,
                        &HashMap::new(),
                        &[],
                    ));
                }

                // Check siblings
                if !conjunction.siblings.iter().all(|sib| {
                    let offset =
                        ((feature.offset + sib.offset) + (SIBLING_TAGS_BEFORE as i8)) as usize;
                    context.0[offset].contains(&sib.feature_token)
                }) {
                    continue;
                }

                // Check for deterministic tag
                if let Some(tag) = conjunction.deterministic_tag {
                    return Some(POSPrediction::new(
                        POSPredictionMethod::deterministic_rule,
                        &tokens[position].word,
                        tokens[position].pos,
                        tag,
                        1.0,
                        &HashMap::new(),
                        &[],
                    ));
                }

                // Add to results
                for (tag, score) in conjunction.weight.tags.iter() {
                    *scores.entry(*tag).or_insert(0.0) += *score * conjunction.weight.weight;
                }
                break;
            }
        }

        // Check for none
        if scores.is_empty() {
            return None;
        }

        // Get highest score
        let mut scores_vec = scores.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        Some(POSPrediction::new(
            POSPredictionMethod::conjunction,
            &tokens[position].word,
            tokens[position].pos,
            *scores_vec[0].0,
            *scores_vec[0].1,
            &scores,
            &[],
        ))
    }

    /// Check exceptions
    fn check_exceptions(
        &self,
        conjunction: &POSConjunction<S>,
        context_vec: &HashSet<POSFeature<S>>,
    ) -> Option<Option<POSTag>> {
        for (exception, opt_tag) in conjunction.exceptions.iter() {
            if !context_vec.contains(exception) {
                continue;
            }

            if let Some(_tag) = opt_tag {
                //return Some(Some(*tag));
                return Some(None);
            } else {
                return Some(None);
            }
        }

        None
    }
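    // Worked scaling example for predict_cohort below (illustrative
    // numbers): a raw NN score of 0.5 with an overall tag frequency of
    // 0.40 is normalized and then multiplied by sqrt(0.10 / 0.40) = 0.5,
    // damping tags that are already common overall.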
    /// Predict cohort, used for automated spelling corrections to narrow the search space of candidates
    pub fn predict_cohort(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Get initial prediction
        let mut pred = self.predict(position, tokens)?;
        let max_value = pred.probabilities.values().sum::<f32>();

        // Scale and normalize probabilities
        for (tag, score) in pred.probabilities.iter_mut() {
            let overall_score = self.tag_freq.get(tag).unwrap_or(&0.0);
            *score /= max_value;
            *score *= (0.10 / overall_score.max(1e-6)).sqrt();
        }

        // Add suffix bonus
        if let Ok(suffix) = POSSuffix::try_from(&tokens[position]) {
            let (suffix_tag, suffix_bonus) = match suffix {
                POSSuffix::ed | POSSuffix::ing => (POSTag::VB, 0.10),
                POSSuffix::day
                | POSSuffix::ion
                | POSSuffix::tion
                | POSSuffix::ness
                | POSSuffix::ment
                | POSSuffix::ity
                | POSSuffix::ty
                | POSSuffix::ance
                | POSSuffix::ence
                | POSSuffix::age
                | POSSuffix::ship
                | POSSuffix::hood => (POSTag::NN, 0.15),
                POSSuffix::wise => (POSTag::RB, 0.15),
                POSSuffix::ly | POSSuffix::ward => (POSTag::RB, 0.15),
                POSSuffix::er
                | POSSuffix::est
                | POSSuffix::ous
                | POSSuffix::less
                | POSSuffix::ful
                | POSSuffix::able
                | POSSuffix::ible => (POSTag::JJ, 0.15),
                POSSuffix::al | POSSuffix::ive => (POSTag::JJ, 0.10),
                _ => (POSTag::FW, 0.0),
            };

            if pred.probabilities.contains_key(&suffix_tag) {
                *pred.probabilities.get_mut(&suffix_tag).unwrap() += suffix_bonus;
            }
        }

        // Add prefix bonus
        if let Ok(prefix) = POSPrefix::try_from(&tokens[position]) {
            let (prefix_tag, prefix_bonus) = match prefix {
                POSPrefix::non
                | POSPrefix::anti
                | POSPrefix::semi
                | POSPrefix::uni
                | POSPrefix::bi
                | POSPrefix::tri
                | POSPrefix::quad
                | POSPrefix::mono
                | POSPrefix::pseudo
                | POSPrefix::quasi => (POSTag::JJ, 0.075),
                POSPrefix::en | POSPrefix::em | POSPrefix::mis => (POSTag::VB, 0.075),
                POSPrefix::sub | POSPrefix::inter | POSPrefix::intra | POSPrefix::trans => {
                    (POSTag::NN, 0.075)
                }
                POSPrefix::un
                | POSPrefix::pre
                | POSPrefix::over
                | POSPrefix::micro
                | POSPrefix::mega
                | POSPrefix::extra
                | POSPrefix::poly => (POSTag::JJ, 0.05),
                POSPrefix::re | POSPrefix::dis | POSPrefix::de => (POSTag::VB, 0.05),
                POSPrefix::co | POSPrefix::com | POSPrefix::post | POSPrefix::fore => {
                    (POSTag::NN, 0.05)
                }
                _ => (POSTag::FW, 0.0),
            };

            if pred.probabilities.contains_key(&prefix_tag) {
                *pred.probabilities.get_mut(&prefix_tag).unwrap() += prefix_bonus;
            }
        }

        // Normalize
        let total = pred.probabilities.values().sum::<f32>();
        for (_, score) in pred.probabilities.iter_mut() {
            *score /= total;
        }

        // Get highest ranking probability
        let mut scores_vec = pred.probabilities.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        // Set new tag
        pred.tag = *scores_vec[0].0;
        pred.confidence = *scores_vec[0].1;

        Some(pred)
    }
}

impl<S> POSModelInterface for POSTagModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Predict tag for an ambiguous word
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Check per-word models
        if let Some(model) = self.words.get(&tokens[position].get_key())
            && let Some(pred) = model.predict(position, tokens)
            && pred.confidence >= 0.85
        {
            return Some(pred);
        }

        self.global.predict(position, tokens)
    }
}

impl<S> POSPositionTracker<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    pub fn new() -> Self {
        let length = SIBLING_TAGS_BEFORE + SIBLING_TAGS_AFTER + 1;

        Self {
            primary: vec![None; length],
            secondary: vec![HashMap::new(); length],
        }
    }
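    // Primary vs. secondary features (see add_feature below): each context
    // position keeps only its single strongest primary feature, while
    // secondary features accumulate additively; combine() then merges both
    // per position and blends in prior tag frequencies at an 80/20 ratio.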
    /// Update the tracker with a new feature
    pub fn add_feature(&mut self, feature: &POSFeature<S>, weight: &POSWeight) {
        let index = (SIBLING_TAGS_BEFORE as i8 + feature.offset) as usize;

        // Secondary feature
        if !feature.feature_token.is_primary() {
            for (tag, score) in weight.tags.iter() {
                *self.secondary[index].entry(*tag).or_insert(0.0) += weight.weight * *score;
            }

        // Primary feature; keep only the strongest feature per position
        } else if self.primary[index].is_none()
            || feature.get_score() > self.primary[index].as_ref().unwrap().feature.get_score()
        {
            let tags: HashMap<POSTag, f32> =
                weight.tags.iter().map(|(tag, score)| (*tag, (weight.weight * *score))).collect();

            self.primary[index] = Some(POSScore {
                feature: feature.clone(),
                tags,
            });
        }
    }

    /// Combine scores
    pub fn combine(&self, tag_freq: &HashMap<POSTag, f32>) -> HashMap<POSTag, f32> {
        let mut scores: HashMap<POSTag, f32> = HashMap::new();
        for (x, score_opt) in self.primary.iter().enumerate() {
            let score = match score_opt {
                Some(r) => r,
                None => continue,
            };

            for (tag, tag_score) in score.tags.iter() {
                *scores.entry(*tag).or_insert(0.0) += *tag_score;
            }

            // Add secondary
            for (tag, tag_score) in self.secondary[x].iter() {
                *scores.entry(*tag).or_insert(0.0) += *tag_score;
            }
        }

        // Blend in overall tag frequency (80% feature score, 20% prior)
        for (tag, score) in scores.iter_mut() {
            if let Some(freq_score) = tag_freq.get(tag) {
                *score = (*score * 0.8) + (freq_score * 0.2);
            }
        }

        scores
    }
}
--------------------------------------------------------------------------------