├── src ├── sophia │ ├── .ignore │ ├── build.rs │ ├── rustfmt.toml │ ├── src │ │ ├── lib.rs │ │ ├── tokenizer │ │ │ ├── mod.rs │ │ │ ├── input.rs │ │ │ ├── cleaner.rs │ │ │ └── token.rs │ │ ├── interpreter │ │ │ ├── mod.rs │ │ │ ├── interpretation.rs │ │ │ ├── coref_categories.rs │ │ │ ├── interpreter.rs │ │ │ ├── antecedent_buffer.rs │ │ │ └── buffer.rs │ │ ├── interpret │ │ │ ├── mod.rs │ │ │ ├── interpretation.rs │ │ │ ├── coref_categories.rs │ │ │ ├── interpreter.rs │ │ │ ├── antecedent_buffer.rs │ │ │ └── buffer.rs │ │ ├── error.rs │ │ ├── vocab │ │ │ ├── mod.rs │ │ │ ├── future_verbs.rs │ │ │ ├── pronoun.rs │ │ │ ├── cache.rs │ │ │ ├── stats.rs │ │ │ ├── f8.rs │ │ │ ├── phrase_intents.rs │ │ │ ├── mwe.rs │ │ │ ├── database.rs │ │ │ ├── category.rs │ │ │ └── spell_check.rs │ │ ├── pos_tagger │ │ │ ├── mod.rs │ │ │ ├── schema.rs │ │ │ ├── tagger.rs │ │ │ ├── pos_tag.rs │ │ │ ├── hmm.rs │ │ │ └── model.rs │ │ └── sophia.rs │ ├── Cargo.toml │ ├── LICENSE │ ├── examples │ │ ├── tokenize.rs │ │ └── interpret.rs │ └── README.md ├── rustfmt.toml └── Cargo.toml ├── .gitignore ├── contribute.md ├── README.md └── LICENSE /src/sophia/.ignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | Cargo.lock 3 | 4 | -------------------------------------------------------------------------------- /src/rustfmt.toml: -------------------------------------------------------------------------------- 1 | 2 | # Keep lines longer for screen reader accessibility 3 | max_width = 100 4 | chain_width = 100 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/sophia/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | println!("cargo:rustc-link-lib=bz2"); 3 | println!("cargo:rustc-link-lib=zstd"); 4 | } 5 | -------------------------------------------------------------------------------- /src/sophia/rustfmt.toml: -------------------------------------------------------------------------------- 1 | 2 | # Keep lines longer for screen reader accessibility 3 | max_width = 100 4 | chain_width = 100 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /contribute.md: -------------------------------------------------------------------------------- 1 | 2 | # Code of Conduct 3 | 4 | 5 | ## rustfmt.toml 6 | 7 | The rustfmt.toml has been configured to retain longer lines, instead of the standard practice of splitting every chain link onto a new line. 8 | 9 | PRs submitted with that configuration removed will be rejected. I know this will be a pet peeve for some, but I'm blind and the maintainer of this project, so that configuration stays; working with tiny lines via a screen reader is a nightmare. 
10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [workspace] 3 | members = [ 4 | #"cicero/client", 5 | #"cicero/lib", 6 | #"cicero/server", 7 | #"evolve", 8 | #"quadris", 9 | #"sdk", 10 | "sophia", 11 | # "verax/lib", 12 | #"plugins/core", 13 | #"deps/atlas", 14 | #"deps/falcon-cli", 15 | #"deps/http", 16 | #"deps/opus", 17 | #"deps/parsex", 18 | #"deps/sermo" 19 | ] 20 | resolver = "2" 21 | 22 | [profile.release] 23 | strip="symbols" 24 | debug=true 25 | lto = true 26 | opt-level = 3 27 | codegen-units = 1 28 | panic = "abort" 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/sophia/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | #![allow(non_camel_case_types)] 8 | 9 | pub use self::error::Error; 10 | pub use self::sophia::Sophia; 11 | 12 | pub mod error; 13 | pub mod interpret; 14 | pub mod pos_tagger; 15 | pub mod sophia; 16 | pub mod tokenizer; 17 | pub mod vocab; 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Cicero is your self-hosted AI, built in Rust to understand, remember, and handle your digital tasks. Open-source and fiercely private, it locks big tech out, keeping your data and life yours. 3 | 4 | General release coming shortly. For full details and to stay updated, please visit [https://cicero.sh/](https://cicero.sh/). 5 | 6 | Sophia NLU Engine is now open source and available within the /src/sophia/ directory of this repository. Details at: [https://cicero.sh/sophia/](https://cicero.sh/sophia/). 7 | 8 | 9 | ## Contact 10 | 11 | Join the community discussion at: [https://cicero.sh/forums/](https://cicero.sh/forums/). 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/sophia/src/tokenizer/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | pub use self::input::{MWE, TokenizedInput}; 8 | pub use self::token::{Token, TokenType}; 9 | pub use self::tokenizer::{Buffer, Tokenizer}; 10 | pub use cleaner::TokenCleaner; 11 | 12 | mod cleaner; 13 | mod input; 14 | pub mod token; 15 | mod tokenizer; 16 | -------------------------------------------------------------------------------- /src/sophia/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cicero-sophia" 3 | description = "High-performance NLU (natural language understanding) engine built in Rust for speed, accuracy, and privacy. 
" 4 | version = "0.6.5" 5 | edition = "2024" 6 | authors = ["Aquila Labs"] 7 | homepage = "https://cicero.sh/sophia/" 8 | keywords = ["nlu", "natural-language", "chat", "nlp"] 9 | categories = ["text-processing"] 10 | readme = "README.md" 11 | license="PolyForm-Noncommercial-1.0.0" 12 | repository = "https://github.com/cicero-ai/cicero" 13 | 14 | [lib] 15 | name="sophia" 16 | 17 | [dependencies] 18 | bincode = "1.3.3" 19 | indexmap = { version = "2.11.0", features = ["serde"] } 20 | regex = "1.11.2" 21 | serde = { version = "1.0.219", features = ["derive"] } 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/sophia/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Sophia NLU Engine 3 | Copyright (c) 2025 Aquila Labs of Alberta, Canada 4 | 5 | Licensed under the PolyForm Noncommercial License 1.0.0. 6 | 7 | You may use, copy, modify, and distribute this software for noncommercial purposes, 8 | including personal projects, research, educational use, hobby projects, and internal 9 | use by nonprofit organizations. 10 | 11 | Commercial use — including use in a for-profit company, in a product or service 12 | offered for sale or subscription, or in any revenue-generating application — 13 | requires a separate commercial license. 14 | 15 | For commercial licensing information, visit: 16 | https://cicero.sh/sophia/ 17 | 18 | Full license text: 19 | https://polyformproject.org/licenses/noncommercial/1.0.0/ 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | pub use self::antecedent_buffer::AntecedentBuffer; 7 | pub use self::buffer::Buffer; 8 | pub use self::coref_categories::CoreferenceCategories; 9 | pub use self::interpretation::Interpretation; 10 | pub use self::interpreter::Interpreter; 11 | pub use self::phrase::{ 12 | Adjective, Adverb, Noun, NounModifier, NounOwner, NounSibling, Phrase, PhraseTense, Verb, 13 | VerbModifier, VerbSibling, 14 | }; 15 | pub use self::phrase_buffer::PhraseBuffer; 16 | 17 | mod antecedent_buffer; 18 | mod buffer; 19 | mod coref_categories; 20 | mod interpretation; 21 | mod interpreter; 22 | mod phrase; 23 | mod phrase_buffer; 24 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
6 | 7 | pub use self::antecedent_buffer::AntecedentBuffer; 8 | pub use self::buffer::Buffer; 9 | pub use self::coref_categories::CoreferenceCategories; 10 | pub use self::interpretation::Interpretation; 11 | pub use self::interpreter::Interpreter; 12 | pub use self::phrase::{ 13 | Adjective, Adverb, Noun, NounModifier, NounOwner, NounSibling, Phrase, PhraseTense, Verb, 14 | VerbModifier, VerbSibling, 15 | }; 16 | pub use self::phrase_buffer::PhraseBuffer; 17 | 18 | mod antecedent_buffer; 19 | mod buffer; 20 | mod coref_categories; 21 | mod interpretation; 22 | mod interpreter; 23 | mod phrase; 24 | mod phrase_buffer; 25 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/interpretation.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::{Buffer, Phrase}; 7 | use crate::tokenizer::Token; 8 | use crate::vocab::f8::f8; use std::collections::HashMap; 9 | 10 | /// Represents the result of interpreting input, containing classification scores, tokens, multi-word expressions (MWE), and phrases. 11 | pub struct Interpretation { 12 | pub scores: HashMap<String, f8>, 13 | pub tokens: Vec<Token>, 14 | pub mwe: Vec<Token>, 15 | pub phrases: Vec<Phrase>, 16 | } 17 | 18 | impl Interpretation { 19 | /// Adds a phrase to the interpretation, checking for enclosed character phrases in the buffer before appending. 20 | pub fn push_phrase(&mut self, phrase: Phrase, buffer: &mut Buffer) { 21 | // Combine enclosed phrases, if needed 22 | if !buffer.enclosed_chars.is_empty() { /* TODO: merge enclosed-character phrases before appending */ } 23 | 24 | self.phrases.push(phrase); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/interpretation.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::{Buffer, Phrase}; 8 | use crate::tokenizer::Token; 9 | use crate::vocab::f8::f8; use std::collections::HashMap; 10 | 11 | /// Represents the result of interpreting input, containing classification scores, tokens, multi-word expressions (MWE), and phrases. 12 | pub struct Interpretation { 13 | pub scores: HashMap<String, f8>, 14 | pub tokens: Vec<Token>, 15 | pub mwe: Vec<Token>, 16 | pub phrases: Vec<Phrase>, 17 | } 18 | 19 | impl Interpretation { 20 | /// Adds a phrase to the interpretation, checking for enclosed character phrases in the buffer before appending. 
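/// /// # Example /// A minimal sketch; building a `Phrase` and `Buffer` is omitted here, and `interpretation.phrases` is assumed to start empty: /// ```ignore /// interpretation.push_phrase(phrase, &mut buffer); /// assert_eq!(interpretation.phrases.len(), 1); /// ```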
21 | pub fn push_phrase(&mut self, phrase: Phrase, buffer: &mut Buffer) { 22 | // Combine enclosed phrases, if needed 23 | if !buffer.enclosed_chars.is_empty() { /* TODO: merge enclosed-character phrases before appending */ } 24 | 25 | self.phrases.push(phrase); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/sophia/src/error.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use std::fmt; 8 | 9 | #[derive(Debug)] 10 | pub enum Error { 11 | Save(String), 12 | Load(String), 13 | POSPrediction(String), 14 | Generic(String), 15 | } 16 | 17 | impl std::error::Error for Error {} 18 | 19 | impl fmt::Display for Error { 20 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 21 | match self { 22 | Error::Save(err) => write!(f, "Save error: {}", err), 23 | Error::Load(err) => write!(f, "Load error: {}", err), 24 | Error::POSPrediction(err) => write!(f, "POS tagger logistic regression error: {}", err), 25 | Error::Generic(msg) => write!(f, "{}", msg), 26 | } 27 | } 28 | } 29 | 30 | impl From<std::io::Error> for Error { 31 | fn from(err: std::io::Error) -> Self { 32 | Error::Generic(err.to_string()) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | pub use self::cache::VocabCache; 8 | pub use self::category::{VocabCategory, VocabCategoryDatabase, VocabCategoryIndex}; 9 | pub use self::database::{ 10 | VocabDatabase, VocabDatabaseMeta, VocabPreProcessDatabase, VocabWordDatabase, 11 | }; 12 | pub use self::future_verbs::FutureVerbPhrases; 13 | pub use self::mwe::{Capitalization, MWEType, VocabMWE}; 14 | pub use self::phrase_intents::{PhraseIntent, PhraseIntents}; 15 | pub use self::pronoun::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson}; 16 | pub use self::spell_check::{ 17 | SpellChecker, SpellCheckerCohort, SpellCheckerCohortPOS, SpellCheckerCohortSize, 18 | SpellCheckerEntry, 19 | }; 20 | pub use self::stats::VocabStats; 21 | 22 | mod cache; 23 | mod category; 24 | mod database; 25 | pub mod f8; 26 | mod future_verbs; 27 | pub mod mwe; 28 | mod phrase_intents; 29 | mod pronoun; 30 | mod spell_check; 31 | mod stats; 32 | -------------------------------------------------------------------------------- /src/sophia/src/pos_tagger/mod.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
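//! //! The `TokenKey` trait defined below lets the tagger key a lookup either by a token's numeric index or by its word; a sketch (assumes a `Token` value in hand): //! ```ignore //! let id: i32 = token.get_key(); //! let word: String = token.get_key(); //! ```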
6 | 7 | pub use self::context::{ 8 | AUXILLARY_VERBS, COMMON_ADVERBS, PASSIVE_INDICATORS, POSContext, POSFeature, POSFeatureToken, 9 | POSPrefix, POSSuffix, POSTagGroup, POSWordGroup, SIBLING_TAGS_AFTER, SIBLING_TAGS_BEFORE, 10 | TEMPORAL_ADVERBS, 11 | }; 12 | pub use self::hmm::{HMM, TOTAL_TAGS}; 13 | pub use self::model::{ 14 | POSConjunction, POSModel, POSModelInterface, POSTagModel, POSTagModelRepo, POSWeight, 15 | }; 16 | pub use self::pos_tag::POSTag; 17 | pub use self::tagger::{POSPrediction, POSPredictionMethod, POSTagger}; 18 | use crate::tokenizer::Token; 19 | 20 | mod context; 21 | mod hmm; 22 | mod model; 23 | mod pos_tag; 24 | mod tagger; 25 | 26 | pub trait TokenKey<S> { 27 | fn get_key(&self) -> S; 28 | } 29 | 30 | impl TokenKey<i32> for Token { 31 | fn get_key(&self) -> i32 { 32 | self.index 33 | } 34 | } 35 | 36 | impl TokenKey<String> for Token { 37 | fn get_key(&self) -> String { 38 | self.word.to_string() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | 3 | Cicero Monorepo License Overview 4 | Copyright (c) 2025 Aquila Labs of Alberta, Canada 5 | 6 | This repository contains multiple crates. Each crate may specify its own license in its `Cargo.toml` or its own `LICENSE` file. 7 | 8 | **General Rule:** 9 | 10 | * Unless otherwise noted in a crate's own `Cargo.toml` or `LICENSE` file, all source code in this repository (except code under `/src/deps/`) is licensed under the **PolyForm Noncommercial License 1.0.0**. 11 | * Code under `/src/deps/` is third-party code and is licensed under its original license, which may be MIT, Apache 2.0, GPL, or a similar open-source license. See individual files for details. 12 | 13 | **PolyForm Noncommercial License Summary:** 14 | You may use, copy, modify, and distribute this software for **noncommercial purposes**, including personal projects, research, educational use, hobby projects, and internal use by nonprofit organizations. 15 | 16 | **Commercial use** — including use in a for-profit company, in a product or service offered for sale or subscription, or in any revenue-generating application — requires a separate commercial license. 17 | 18 | For commercial licensing information, visit: 19 | 🔗 **[https://cicero.sh/sophia/](https://cicero.sh/sophia/)** 20 | 21 | Full license text: 22 | 🔗 **[https://polyformproject.org/licenses/noncommercial/1.0.0/](https://polyformproject.org/licenses/noncommercial/1.0.0/)** 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/future_verbs.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | use std::collections::HashMap; 9 | 10 | /// A trie-like structure for storing future verb phrases, tracking completion status and expected verb POS tags. 
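/// /// # Example /// A sketch of building the trie; the phrase and the `VB` verb-placeholder tag are illustrative: /// ```ignore /// let mut phrases = FutureVerbPhrases::new(None); /// phrases.insert("is going to VB"); /// assert!(phrases.children.contains_key("is")); /// ```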
11 | #[derive(Serialize, Deserialize)] 12 | pub struct FutureVerbPhrases { 13 | pub is_complete: bool, 14 | pub expected_verb_pos: Option<String>, 15 | pub children: HashMap<String, Box<FutureVerbPhrases>>, 16 | } 17 | 18 | impl FutureVerbPhrases { 19 | /// Creates a new FutureVerbPhrases node with an optional expected verb POS tag and empty children. 20 | pub fn new(expected_verb_pos: Option<String>) -> Self { 21 | Self { 22 | is_complete: false, 23 | expected_verb_pos, 24 | children: HashMap::new(), 25 | } 26 | } 27 | 28 | /// Inserts a phrase into the trie, marking the final node as complete and handling verb placeholders. 29 | pub fn insert(&mut self, phrase: &str) { 30 | let mut current = self; 31 | for word in phrase.split(" ").collect::<Vec<&str>>().iter() { 32 | let child = if word.starts_with("V") { 33 | "[verb]".to_string() 34 | } else { 35 | word.to_string() 36 | }; 37 | let expected_verb = if word.starts_with("V") { 38 | Some(word.to_string()) 39 | } else { 40 | None 41 | }; 42 | 43 | current = current 44 | .children 45 | .entry(child.to_lowercase()) 46 | .or_insert(Box::new(FutureVerbPhrases::new(expected_verb))); 47 | } 48 | 49 | current.is_complete = true; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/pronoun.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | 9 | /// Represents a pronoun with its linguistic properties, including category, gender, person, and number. 10 | #[derive(Serialize, Deserialize, Clone, Debug)] 11 | pub struct Pronoun { 12 | pub category: PronounCategory, 13 | pub sub_category: PronounCategory, 14 | pub gender: PronounGender, 15 | pub person: PronounPerson, 16 | pub number: PronounNumber, 17 | } 18 | 19 | /// Defines the category of a pronoun, such as personal, possessive, or indefinite. 20 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 21 | pub enum PronounCategory { 22 | none, 23 | personal, 24 | possessive, 25 | indefinite, 26 | reflexive, 27 | demonstrative, 28 | interrogative, 29 | relative, 30 | } 31 | 32 | /// Defines the gender of a pronoun, which can be neutral, male, or female. 33 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 34 | pub enum PronounGender { 35 | neutral, 36 | male, 37 | female, 38 | } 39 | 40 | /// Defines the person of a pronoun, which can be neutral, first, second, or third. 41 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 42 | pub enum PronounPerson { 43 | neutral, 44 | first, 45 | second, 46 | third, 47 | } 48 | 49 | /// Defines the number of a pronoun, which can be neutral, singular, or plural. 50 | #[derive(Serialize, Deserialize, Clone, PartialEq, Eq, Debug, Hash)] 51 | pub enum PronounNumber { 52 | neutral, 53 | singular, 54 | plural, 55 | } 56 | 57 | impl Pronoun { 58 | /// Checks if the pronoun requires anaphora resolution, based on its category and person. 
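/// /// # Example /// A sketch: a third-person personal pronoun (e.g. "he") requires an antecedent: /// ```ignore /// let p = Pronoun { /// category: PronounCategory::personal, /// sub_category: PronounCategory::none, /// gender: PronounGender::male, /// person: PronounPerson::third, /// number: PronounNumber::singular, /// }; /// assert!(p.is_anaphora()); /// ```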
59 | pub fn is_anaphora(&self) -> bool { 60 | if ![ 61 | PronounCategory::personal, 62 | PronounCategory::possessive, 63 | PronounCategory::reflexive, 64 | ] 65 | .contains(&self.category) 66 | { 67 | return false; 68 | } 69 | 70 | if self.person == PronounPerson::first { 71 | return false; 72 | } 73 | 74 | true 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/cache.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use crate::error::Error; 8 | use bincode; 9 | use serde::{Deserialize, Serialize}; 10 | use std::collections::HashMap; 11 | use std::fs; 12 | use std::path::Path; 13 | 14 | /// A cache for storing vocabulary-related data, such as typos, to improve processing efficiency. 15 | #[derive(Serialize, Deserialize, Default)] 16 | pub struct VocabCache { 17 | pub typos: HashMap<String, String>, 18 | } 19 | 20 | impl VocabCache { 21 | /// Loads the vocabulary cache from a file in the specified directory, returning a default cache if the file does not exist. 22 | pub fn load(vocab_dir: &str) -> Result<Self, Error> { 23 | let cache_file = format!("{}/cache.dat", vocab_dir); 24 | if !Path::new(&cache_file).exists() { 25 | return Ok(Self::default()); 26 | } 27 | 28 | let encoded = fs::read(&cache_file)?; 29 | let cache: VocabCache = match bincode::deserialize(&encoded[..]) { 30 | Ok(r) => r, 31 | Err(e) => { 32 | return Err(Error::Load(format!( 33 | "Unable to load vocabulary cache, {}", 34 | e 35 | ))); 36 | } 37 | }; 38 | 39 | Ok(cache) 40 | } 41 | 42 | /// Saves the vocabulary cache to a file in the specified directory using bincode serialization. 43 | pub fn save(&self, vocab_dir: &str) -> Result<(), Error> { 44 | let cache_file = format!("{}/cache.dat", vocab_dir); 45 | let encoded = match bincode::serialize(&self) { 46 | Ok(r) => r, 47 | Err(e) => { 48 | return Err(Error::Save(format!( 49 | "Unable to serialize vocabulary cache, {}", 50 | e 51 | ))); 52 | } 53 | }; 54 | fs::write(&cache_file, &encoded)?; 55 | Ok(()) 56 | } 57 | 58 | /// Adds a typo mapping to the cache, associating the original word with its correct form. 59 | pub fn add_typo(&mut self, original: &str, correct: &str) { 60 | self.typos.insert(original.to_string(), correct.to_string()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/stats.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::VocabDatabase; 8 | use crate::pos_tagger::POSTag; 9 | use std::collections::HashMap; 10 | 11 | /// Populated with basic statistical and meta information regarding the vocabulary data store 12 | /// including number of words, MWEs, ambiguous words, named entities, categories, and so on. 
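/// /// A sketch of typical use (assumes a loaded `VocabDatabase`): /// ```ignore /// let stats = VocabStats::compile(&vocab); /// println!("{} nouns, {} MWEs", stats.nouns, stats.mwes); /// ```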
13 | #[derive(Debug, Default)] 14 | pub struct VocabStats { 15 | pub singular_words: usize, 16 | pub ambiguous_words: usize, 17 | pub mwes: usize, 18 | pub nouns: usize, 19 | pub verbs: usize, 20 | pub adverbs: usize, 21 | pub adjectives: usize, 22 | pub named_entities: usize, 23 | pub synonyms: usize, 24 | pub hypernyms: usize, 25 | pub hyponyms: usize, 26 | pub pos_tags: HashMap<POSTag, usize>, 27 | } 28 | 29 | impl VocabStats { 30 | pub fn compile(vocab: &VocabDatabase) -> Self { 31 | let mut stats = Self::default(); 32 | 33 | // Go through the wordlist 34 | for (_, pos_map) in vocab.words.wordlist.iter() { 35 | // Singular or ambiguous? 36 | if pos_map.len() > 1 { 37 | stats.ambiguous_words += 1; 38 | } 39 | 40 | // POS tags 41 | for (tag, _) in pos_map.iter() { 42 | *stats.pos_tags.entry(*tag).or_insert(0) += 1; 43 | } 44 | } 45 | 46 | // Go through all tokens 47 | for (_, token) in vocab.words.id2token.iter() { 48 | // MWE? 49 | if token.word.contains(" ") { 50 | stats.mwes += 1; 51 | } else { 52 | stats.singular_words += 1; 53 | } 54 | 55 | // Counts 56 | stats.synonyms += token.synonyms.len(); 57 | stats.hypernyms += token.hypernyms.len(); 58 | stats.hyponyms += token.hyponyms.len(); 59 | 60 | // Part of speech 61 | if token.is_noun() { 62 | stats.nouns += 1; 63 | } else if token.is_verb() { 64 | stats.verbs += 1; 65 | } else if token.is_adverb() { 66 | stats.adverbs += 1; 67 | } else if token.is_adjective() { 68 | stats.adjectives += 1; 69 | } 70 | 71 | if token.is_named_entity() { 72 | stats.named_entities += 1; 73 | } 74 | } 75 | 76 | stats 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/sophia/examples/tokenize.rs: -------------------------------------------------------------------------------- 1 | use sophia::{Error, Sophia}; 2 | use std::env; 3 | use std::process; 4 | 5 | /// Tokenizes a sample sentence using the Sophia NLP library, demonstrating token and MWE iteration. 6 | /// 7 | /// This example reads the vocabulary data directory from the first command-line argument. 8 | /// If no argument is provided, it exits with an error message. 9 | /// 10 | /// # Usage 11 | /// 12 | /// ```bash 13 | /// cargo run --example tokenize -- ./vocab_data 14 | /// ``` 15 | /// 16 | /// The example processes the sentence "The quick brown fox jumps over the lazy dog" and prints: 17 | /// - Individual tokens with their indices, words, and part-of-speech (POS) tags. 18 | /// - Multi-word entities (MWEs) with their indices, words, and POS tags. 19 | /// - Tokens with stopwords removed, showing only content-bearing words. 20 | fn main() { 21 | // Retrieve the data directory from the first command-line argument 22 | let datadir = env::args().nth(1).unwrap_or_else(|| { 23 | eprintln!("Error: No data directory provided."); 24 | eprintln!("Usage: cargo run --example tokenize -- <data_dir>"); 25 | process::exit(1); 26 | }); 27 | 28 | // Run the tokenization example, handling errors 29 | if let Err(e) = run(&datadir) { 30 | eprintln!("Error: {}", e); 31 | process::exit(1); 32 | } 33 | } 34 | 35 | /// Runs the tokenization example with the provided data directory. 
36 | fn run(datadir: &str) -> Result<(), Error> { 37 | // Initialize Sophia with the vocabulary directory and language 38 | let sophia = Sophia::new(datadir, "en")?; 39 | 40 | // Tokenize the input text 41 | let input = "The quick brown fox jumps over the lazy dog"; 42 | let output = sophia.tokenize(input)?; 43 | 44 | // Print individual tokens 45 | println!("\nIndividual Tokens:"); 46 | println!("{:-<50}", ""); 47 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 48 | println!("{:-<50}", ""); 49 | for token in output.iter() { 50 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 51 | } 52 | 53 | // Print multi-word entities (MWEs) 54 | println!("\nMulti-Word Entities (MWEs):"); 55 | println!("{:-<50}", ""); 56 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 57 | println!("{:-<50}", ""); 58 | for token in output.mwe() { 59 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 60 | } 61 | 62 | // Print tokens with stopwords removed 63 | println!("\nTokens with Stopwords Removed:"); 64 | println!("{:-<50}", ""); 65 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 66 | println!("{:-<50}", ""); 67 | for token in output.remove_stop_words().iter() { 68 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 69 | } 70 | 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /src/sophia/examples/interpret.rs: -------------------------------------------------------------------------------- 1 | use sophia::{Error, Sophia}; 2 | use std::env; 3 | use std::process; 4 | 5 | /// Interprets a sample sentence using the Sophia NLP library, demonstrating phrase, token, and score analysis. 6 | /// 7 | /// This example reads the vocabulary data directory from the first command-line argument. 8 | /// If no argument is provided, it exits with an error message. 9 | /// 10 | /// # Usage 11 | /// 12 | /// ```bash 13 | /// cargo run --example interpret -- ./vocab_data 14 | /// ``` 15 | /// 16 | /// The example processes the sentence "The quick brown fox jumps over the lazy dog" and prints: 17 | /// - Phrases with their token contents (debug format). 18 | /// - Individual tokens with their indices, words, and part-of-speech (POS) tags. 19 | /// - Multi-word entities (MWEs) with their indices, words, and POS tags. 20 | /// - Classification scores with their labels and floating-point values. 21 | fn main() { 22 | // Retrieve the data directory from the first command-line argument 23 | let datadir = env::args().nth(1).unwrap_or_else(|| { 24 | eprintln!("Error: No data directory provided."); 25 | eprintln!("Usage: cargo run --example interpret -- <data_dir>"); 26 | process::exit(1); 27 | }); 28 | 29 | // Run the interpretation example, handling errors 30 | if let Err(e) = run(&datadir) { 31 | eprintln!("Error: {}", e); 32 | process::exit(1); 33 | } 34 | } 35 | 36 | /// Runs the interpretation example with the provided data directory. 
37 | fn run(datadir: &str) -> Result<(), Error> { 38 | // Initialize Sophia with the vocabulary directory and language 39 | let sophia = Sophia::new(datadir, "en")?; 40 | 41 | // Interpret the input text 42 | let input = "The quick brown fox jumps over the lazy dog"; 43 | let output = sophia.interpret(input)?; 44 | 45 | // Print phrases 46 | println!("\nPhrases:"); 47 | println!("{:-<50}", ""); 48 | for (i, phrase) in output.phrases.iter().enumerate() { 49 | println!("Phrase {}: {:?}", i + 1, phrase); 50 | } 51 | 52 | // Print individual tokens 53 | println!("\nIndividual Tokens:"); 54 | println!("{:-<50}", ""); 55 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 56 | println!("{:-<50}", ""); 57 | for token in output.tokens.iter() { 58 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 59 | } 60 | 61 | // Print multi-word entities (MWEs) 62 | println!("\nMulti-Word Entities (MWEs):"); 63 | println!("{:-<50}", ""); 64 | println!("{:>6} {:<15} {}", "Index", "Word", "POS"); 65 | println!("{:-<50}", ""); 66 | for token in output.mwe() { 67 | println!("{:>6} {:<15} {}", token.index, token.word, token.pos); 68 | } 69 | 70 | // Print classification scores 71 | println!("\nClassification Scores:"); 72 | println!("{:-<50}", ""); 73 | println!("{:>6} {:<15}", "Label", "Score"); 74 | println!("{:-<50}", ""); 75 | for (label, score) in output.scores.iter() { 76 | println!("{:>6} {:<15.4}", label, score.to_f32()); 77 | } 78 | 79 | Ok(()) 80 | } 81 | -------------------------------------------------------------------------------- /src/sophia/src/pos_tagger/schema.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::POSTag; 7 | use crate::vocab::f8::f8; 8 | use serde::{Deserialize, Serialize}; 9 | use std::collections::HashMap; 10 | use std::hash::Hash; 11 | use std::ops::Add; 12 | 13 | /// A trait for types that can be used as scores in the POS tagger, requiring default, addition, and serialization capabilities. 14 | pub trait Score: Default + Add<Output = Self> + Serialize + for<'de> Deserialize<'de> {} 15 | 16 | /// A part-of-speech tagger structure that maps tags to words and tracks tag, initial, before, and after scores. 17 | #[derive(Default, Serialize, Deserialize)] 18 | #[serde( 19 | bound = "T: Score, S: Default + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>" 20 | )] 21 | pub struct POSTagger<T, S> { 22 | pub tag2tag: POSTaggerLayer<T>, 23 | pub tag2word: HashMap<S, POSTaggerLayer<T>>, 24 | pub word2word: HashMap<S, S>, 25 | } 26 | 27 | /// A layer of the POS tagger, containing tag scores and initial, before, and after scoring structures. 28 | #[derive(Serialize, Deserialize, Clone)] 29 | #[serde(bound = "T: Score")] 30 | pub struct POSTaggerLayer<T> { 31 | pub tags: HashMap<POSTag, T>, 32 | pub initial: POSTaggerScores<T>, 33 | pub before: POSTaggerScores<T>, 34 | pub after: POSTaggerScores<T>, 35 | } 36 | 37 | /// Stores exact match trie and bigram scores for POS tagging. 38 | #[derive(Default, Serialize, Deserialize, Clone)] 39 | #[serde(bound = "T: Score")] 40 | pub struct POSTaggerScores<T> { 41 | pub exact_matches: HashMap<String, T>, 42 | pub bigrams: Vec<POSTaggerBigramScores<T>>, 43 | } 44 | 45 | /// Stores bigram scores as a mapping from bigram identifiers to lists of tag-score pairs. 
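/// /// # Example /// A sketch using the `usize`-scored variant; the bigram id is arbitrary and `POSTag::NN` is assumed to exist: /// ```ignore /// let mut scores = POSTaggerBigramScores::<usize>::default(); /// scores.incr(42, POSTag::NN); /// scores.incr(42, POSTag::NN); /// assert_eq!(scores.0[&42][0].1, 2); /// ```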
46 | #[derive(Default, Clone, Serialize, Deserialize)] 47 | #[serde(bound = "T: Score")] 48 | pub struct POSTaggerBigramScores<T>(pub HashMap<u16, Vec<(POSTag, T)>>); 49 | 50 | impl<T: Score, S: Default + Eq + PartialEq + Hash + Serialize + for<'de> Deserialize<'de>> POSTagger<T, S> { 51 | /// Creates a new POSTagger instance with default values. 52 | pub fn new() -> Self { 53 | Self::default() 54 | } 55 | } 56 | 57 | impl Score for usize {} 58 | impl Score for f32 {} 59 | impl Score for f8 {} 60 | 61 | impl<T: Score> POSTaggerScores<T> { 62 | pub fn new(size: usize) -> Self { 63 | Self { 64 | exact_matches: HashMap::new(), 65 | bigrams: (0..size) 66 | .map(|_| POSTaggerBigramScores::<T>::default()) 67 | .collect::<Vec<POSTaggerBigramScores<T>>>(), 68 | } 69 | } 70 | } 71 | 72 | impl POSTaggerBigramScores<usize> { 73 | /// Increments the score for a given bigram and tag, adding a new entry if the tag is not present. 74 | pub fn incr(&mut self, bigram: u16, tag: POSTag) { 75 | let scores = self.0.entry(bigram).or_default(); 76 | let index = match scores.iter().position(|score| score.0 == tag) { 77 | Some(r) => r, 78 | None => { 79 | scores.push((tag, 0)); 80 | scores.len() - 1 81 | } 82 | }; 83 | scores[index].1 += 1; 84 | } 85 | } 86 | 87 | impl<T: Score> Default for POSTaggerLayer<T> { 88 | fn default() -> Self { 89 | Self { 90 | tags: HashMap::new(), 91 | initial: POSTaggerScores::new(2), 92 | before: POSTaggerScores::new(4), 93 | after: POSTaggerScores::new(2), 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/f8.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use serde::{Deserialize, Serialize}; 8 | use std::convert::From; 9 | use std::fmt; 10 | use std::ops::{Add, AddAssign, Mul}; 11 | 12 | /// A fixed-point 8-bit representation for floating-point values in the range [0.0, 1.0], storing a sum and value history. 13 | #[derive(Clone, Default, Debug, PartialEq, Eq)] 14 | pub struct f8 { 15 | values: Vec<u8>, 16 | pub sum: u8, 17 | } 18 | 19 | impl f8 { 20 | /// Creates a new f8 instance from a u8 value, initializing the sum and value history. 21 | pub fn new(value: u8) -> Self { 22 | f8 { 23 | values: vec![value], 24 | sum: value, 25 | } 26 | } 27 | 28 | /// Converts the f8 value to a f32 in the range [0.0, 1.0]. 29 | pub fn to_f32(&self) -> f32 { 30 | self.sum as f32 / 255.0 31 | } 32 | 33 | /// Calculates the sum of all values in the f8 instance as a u16. 34 | fn calculate_sum(&self) -> u16 { 35 | self.values.iter().map(|&v| v as u16).sum() 36 | } 37 | 38 | /// Returns the sum as a u8, representing the quantized value. 39 | fn to_u8(&self) -> u8 { 40 | self.sum 41 | } 42 | } 43 | 44 | impl From<f32> for f8 { 45 | /// Converts a f32 value in [0.0, 1.0] to an f8 instance, quantizing to a u8. 46 | fn from(value: f32) -> Self { 47 | // Map the range [0.0, 1.0] to [0, 255] 48 | let quantized = (value * 255.0).round() as u8; 49 | f8::new(quantized) 50 | } 51 | } 52 | 53 | impl From<f8> for f32 { 54 | /// Converts an f8 instance to a f32 value in [0.0, 1.0]. 55 | fn from(val: f8) -> Self { 56 | (val.to_u8() as f32) / 255.0 57 | } 58 | } 59 | 60 | impl Add for f8 { 61 | /// Adds two f8 instances, combining their value histories and capping the sum at 255. 
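/// /// A sketch of the quantized, saturating arithmetic: /// ```ignore /// let a = f8::from(0.5f32); // sum = 128 /// let b = f8::from(0.75f32); // sum = 191 /// assert_eq!((a + b).sum, 255); // 128 + 191 = 319, capped to 255 /// ```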
62 | type Output = Self; 63 | 64 | fn add(mut self, other: Self) -> Self { 65 | self.values.extend(other.values); 66 | let new_sum = self.calculate_sum().min(255) as u8; 67 | f8 { 68 | values: self.values, 69 | sum: new_sum, 70 | } 71 | } 72 | } 73 | 74 | impl AddAssign<usize> for f8 { 75 | fn add_assign(&mut self, rhs: usize) { 76 | self.values.push(rhs as u8); 77 | self.sum = self.sum.saturating_add(rhs as u8); // saturate rather than overflow, matching the cap in Add 78 | } 79 | } 80 | 81 | impl Mul<f32> for f8 { 82 | type Output = Self; 83 | 84 | fn mul(self, rhs: f32) -> Self { 85 | // Convert the F8 value to f32, multiply, then convert back to F8 86 | let result_f32 = (self.to_u8() as f32 / 255.0) * rhs; 87 | f8::from(result_f32) 88 | } 89 | } 90 | 91 | impl Serialize for f8 { 92 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 93 | where 94 | S: serde::Serializer, 95 | { 96 | serializer.serialize_u8(self.sum) 97 | } 98 | } 99 | 100 | impl<'de> Deserialize<'de> for f8 { 101 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 102 | where 103 | D: serde::Deserializer<'de>, 104 | { 105 | let value = u8::deserialize(deserializer)?; 106 | Ok(f8::new(value)) 107 | } 108 | } 109 | 110 | impl PartialOrd for f8 { 111 | fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { 112 | Some(self.calculate_sum().cmp(&other.calculate_sum())) 113 | } 114 | } 115 | 116 | impl Ord for f8 { 117 | fn cmp(&self, other: &Self) -> std::cmp::Ordering { 118 | self.calculate_sum().cmp(&other.calculate_sum()) 119 | } 120 | } 121 | 122 | impl fmt::Display for f8 { 123 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 124 | write!(f, "{}", self.to_f32()) 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/coref_categories.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use crate::pos_tagger::POSTag; 7 | use crate::tokenizer::Token; 8 | use crate::vocab::VocabDatabase; 9 | use std::ops::Range; 10 | 11 | /// Stores category ranges for coreference resolution, including named entity recognition (NER) and noun-based person and entity classifications. 12 | #[derive(Clone, Default)] 13 | pub struct CoreferenceCategories { 14 | ner_person: Range<usize>, 15 | noun_person: Vec<Range<usize>>, 16 | ner_entity: Vec<Range<usize>>, 17 | noun_entity: Vec<Range<usize>>, 18 | } 19 | 20 | impl CoreferenceCategories { 21 | /// Creates a new CoreferenceCategories instance, initializing ranges from the provided vocabulary database. 22 | pub fn new(vocab: &VocabDatabase) -> Self { 23 | Self { 24 | ner_person: vocab.categories.ner.path2range("person").unwrap_or(0..0), 25 | noun_person: Self::compile_noun_person(vocab), 26 | ner_entity: Self::compile_ner_entity(vocab), 27 | noun_entity: Self::compile_noun_entity(vocab), 28 | } 29 | } 30 | 31 | /// Compiles a list of NER entity category ranges for facilities, organizations, and businesses from the vocabulary database. 32 | pub fn compile_ner_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 33 | let mut res: Vec<Range<usize>> = Vec::new(); 34 | for label in ["facility", "organization", "business"].iter() { 35 | if let Some(r) = vocab.categories.ner.path2range(label) { 36 | res.push(r); 37 | } 38 | } 39 | 40 | res 41 | } 42 | 43 | /// Compiles a list of noun person category ranges for military ranks, family relations, occupations, corporate jobs, and individuals. 
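/// /// A sketch (assumes a loaded `VocabDatabase`); at most one range is returned per configured category path: /// ```ignore /// let ranges = CoreferenceCategories::compile_noun_person(&vocab); /// assert!(ranges.len() <= 5); /// ```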
44 | pub fn compile_noun_person(vocab: &VocabDatabase) -> Vec<Range<usize>> { 45 | // Set paths 46 | let paths = [ 47 | "military/military_rank", 48 | "health_and_human/family_relation", 49 | "education/occupation", 50 | "business_and_finance/corporate_job", 51 | "personnel/individual", 52 | ]; 53 | 54 | let mut res: Vec<Range<usize>> = Vec::new(); 55 | for path in paths.iter() { 56 | if let Some(r) = vocab.categories.nouns.path2range(path) { 57 | res.push(r); 58 | } 59 | } 60 | 61 | res 62 | } 63 | 64 | /// Compiles a list of noun entity category ranges for transportation, military vehicles, landforms, infrastructure, and groups. 65 | pub fn compile_noun_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 66 | // Set paths 67 | let paths = [ 68 | "transportation/aircraft", 69 | "transportation/automobile", 70 | "transportation/bycycle", 71 | "transportation/_public_transportation", 72 | "transportation/ship", 73 | "military/vehicle", 74 | "environment/landform", 75 | "architecture_and_construction/infrastructure", 76 | "personnel/group", 77 | ]; 78 | 79 | let mut res: Vec<Range<usize>> = Vec::new(); 80 | for path in paths.iter() { 81 | if let Some(r) = vocab.categories.nouns.path2range(path) { 82 | res.push(r); 83 | } 84 | } 85 | 86 | res 87 | } 88 | 89 | /// Checks if a token represents a person, based on NER person range or noun person categories. 90 | pub fn is_person(&self, token: &Token) -> bool { 91 | if token.is_named_entity() { 92 | return token.has_ner(&self.ner_person); 93 | } 94 | 95 | self.noun_person.iter().any(|r| token.has_category(r)) 96 | } 97 | 98 | /// Checks if a token represents an entity, based on NER entity ranges, plural noun person categories, or noun entity categories. 99 | pub fn is_entity(&self, token: &Token) -> bool { 100 | if token.is_named_entity() { 101 | return self.ner_entity.iter().any(|r| token.has_ner(r)); 102 | } 103 | 104 | if token.pos == POSTag::NNS && self.noun_person.iter().any(|r| token.has_category(r)) { 105 | return true; 106 | } 107 | 108 | self.noun_entity.iter().any(|r| token.has_category(r)) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/phrase_intents.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
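//! //! A sketch of intended use; the `tokens` slice and tokenized `output` are assumed to come from the tokenizer: //! ```ignore //! let mut intents = PhraseIntents::new(); //! intents.insert(PhraseIntent::affirmation, &tokens); //! if let Some((intent, len)) = intents.check(0, &output) { //! println!("matched {} across {} MWE positions", intent, len); //! } //! ```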
6 | 7 | use crate::error::Error; 8 | use crate::tokenizer::{Token, TokenizedInput}; 9 | use serde::{Deserialize, Serialize}; 10 | use std::collections::HashMap; 11 | use std::fmt; 12 | 13 | /// A trie-like structure for storing phrase intents 14 | #[derive(Serialize, Deserialize)] 15 | pub struct PhraseIntents { 16 | pub intent: Option<PhraseIntent>, 17 | pub children: HashMap<i32, Box<PhraseIntents>>, 18 | } 19 | 20 | #[derive(Default, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] 21 | pub enum PhraseIntent { 22 | acknowledgment, 23 | affirmation, 24 | emphasis, 25 | hesitation, 26 | negation, 27 | #[default] 28 | neutral, 29 | rejection, 30 | request, 31 | } 32 | 33 | impl Default for PhraseIntents { 34 | fn default() -> Self { 35 | Self::new() 36 | } 37 | } 38 | 39 | impl PhraseIntents { 40 | /// Creates a new phrase intents node 41 | pub fn new() -> Self { 42 | Self { 43 | intent: None, 44 | children: HashMap::new(), 45 | } 46 | } 47 | 48 | /// Inserts a phrase into the trie 49 | pub fn insert(&mut self, intent: PhraseIntent, tokens: &[Token]) { 50 | let mut current = self; 51 | for token in tokens.iter() { 52 | current = current.children.entry(token.index).or_insert(Box::new(PhraseIntents::new())); 53 | } 54 | current.intent = Some(intent); 55 | } 56 | 57 | /// Checks a sequence of tokens for a matching phrase intent entry 58 | pub fn check( 59 | &self, 60 | mut position: usize, 61 | output: &TokenizedInput, 62 | ) -> Option<(PhraseIntent, usize)> { 63 | let mut length = 0; 64 | let mut index = if let Some(token) = &output.mwe[position].token { 65 | token.index 66 | } else { 67 | output.tokens[output.mwe[position].position].index 68 | }; 69 | let mut current = self; 70 | while let Some(node) = current.children.get(&index) { 71 | length += 1; 72 | if let Some(intent) = node.intent { 73 | return Some((intent, length)); 74 | } 75 | 76 | position += 1; 77 | if position >= output.mwe.len() { 78 | return None; 79 | } 80 | index = if let Some(child_token) = &output.mwe[position].token { 81 | child_token.index 82 | } else { 83 | output.tokens[output.mwe[position].position].index 84 | }; 85 | current = node; 86 | } 87 | 88 | None 89 | } 90 | } 91 | 92 | impl PhraseIntent { 93 | pub fn from_str(value: &str) -> Result<Self, Error> { 94 | match value { 95 | "acknowledgment" => Ok(PhraseIntent::acknowledgment), 96 | "affirmation" => Ok(PhraseIntent::affirmation), 97 | "emphasis" => Ok(PhraseIntent::emphasis), 98 | "hesitation" => Ok(PhraseIntent::hesitation), 99 | "negation" => Ok(PhraseIntent::negation), 100 | "neutral" => Ok(PhraseIntent::neutral), 101 | "rejection" => Ok(PhraseIntent::rejection), 102 | "request" => Ok(PhraseIntent::request), 103 | _ => Err(Error::Generic(format!( 104 | "Invalid phrase intent value, {}", 105 | value 106 | ))), 107 | } 108 | } 109 | } 110 | 111 | impl fmt::Display for PhraseIntent { 112 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 113 | let value = match self { 114 | PhraseIntent::acknowledgment => "acknowledgment".to_string(), 115 | PhraseIntent::affirmation => "affirmation".to_string(), 116 | PhraseIntent::emphasis => "emphasis".to_string(), 117 | PhraseIntent::hesitation => "hesitation".to_string(), 118 | PhraseIntent::negation => "negation".to_string(), 119 | PhraseIntent::neutral => "neutral".to_string(), 120 | PhraseIntent::rejection => "rejection".to_string(), 121 | PhraseIntent::request => "request".to_string(), 122 | }; 123 | write!(f, "{}", value) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- 
/src/sophia/src/interpret/coref_categories.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use crate::pos_tagger::POSTag; 8 | use crate::tokenizer::Token; 9 | use crate::vocab::VocabDatabase; 10 | use std::ops::Range; 11 | 12 | /// Stores category ranges for coreference resolution, including named entity recognition (NER) and noun-based person and entity classifications. 13 | #[derive(Clone, Default)] 14 | pub struct CoreferenceCategories { 15 | ner_person: Range<usize>, 16 | noun_person: Vec<Range<usize>>, 17 | ner_entity: Vec<Range<usize>>, 18 | noun_entity: Vec<Range<usize>>, 19 | } 20 | 21 | impl CoreferenceCategories { 22 | /// Creates a new CoreferenceCategories instance, initializing ranges from the provided vocabulary database. 23 | pub fn new(vocab: &VocabDatabase) -> Self { 24 | Self { 25 | ner_person: vocab.categories.ner.path2range("person").unwrap_or(0..0), 26 | noun_person: Self::compile_noun_person(vocab), 27 | ner_entity: Self::compile_ner_entity(vocab), 28 | noun_entity: Self::compile_noun_entity(vocab), 29 | } 30 | } 31 | 32 | /// Compiles a list of NER entity category ranges for facilities, organizations, and businesses from the vocabulary database. 33 | pub fn compile_ner_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 34 | let mut res: Vec<Range<usize>> = Vec::new(); 35 | for label in ["facility", "organization", "business"].iter() { 36 | if let Some(r) = vocab.categories.ner.path2range(label) { 37 | res.push(r); 38 | } 39 | } 40 | 41 | res 42 | } 43 | 44 | /// Compiles a list of noun person category ranges for military ranks, family relations, occupations, corporate jobs, and individuals. 45 | pub fn compile_noun_person(vocab: &VocabDatabase) -> Vec<Range<usize>> { 46 | // Set paths 47 | let paths = [ 48 | "military/military_rank", 49 | "health_and_human/family_relation", 50 | "education/occupation", 51 | "business_and_finance/corporate_job", 52 | "personnel/individual", 53 | ]; 54 | 55 | let mut res: Vec<Range<usize>> = Vec::new(); 56 | for path in paths.iter() { 57 | if let Some(r) = vocab.categories.nouns.path2range(path) { 58 | res.push(r); 59 | } 60 | } 61 | 62 | res 63 | } 64 | 65 | /// Compiles a list of noun entity category ranges for transportation, military vehicles, landforms, infrastructure, and groups. 66 | pub fn compile_noun_entity(vocab: &VocabDatabase) -> Vec<Range<usize>> { 67 | // Set paths 68 | let paths = [ 69 | "transportation/aircraft", 70 | "transportation/automobile", 71 | "transportation/bycycle", 72 | "transportation/_public_transportation", 73 | "transportation/ship", 74 | "military/vehicle", 75 | "environment/landform", 76 | "architecture_and_construction/infrastructure", 77 | "personnel/group", 78 | ]; 79 | 80 | let mut res: Vec<Range<usize>> = Vec::new(); 81 | for path in paths.iter() { 82 | if let Some(r) = vocab.categories.nouns.path2range(path) { 83 | res.push(r); 84 | } 85 | } 86 | 87 | res 88 | } 89 | 90 | /// Checks if a token represents a person, based on NER person range or noun person categories. 
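/// /// A sketch (assumes a loaded `VocabDatabase` and a tagged `Token`): /// ```ignore /// let coref = CoreferenceCategories::new(&vocab); /// if coref.is_person(&token) { /* candidate antecedent for "he"/"she" */ } /// ```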
91 | pub fn is_person(&self, token: &Token) -> bool { 92 | if token.is_named_entity() { 93 | return token.has_ner(&self.ner_person); 94 | } 95 | 96 | self.noun_person.iter().any(|r| token.has_category(r)) 97 | } 98 | 99 | /// Checks if a token represents an entity, based on NER entity ranges, plural noun person categories, or noun entity categories. 100 | pub fn is_entity(&self, token: &Token) -> bool { 101 | if token.is_named_entity() { 102 | return self.ner_entity.iter().any(|r| token.has_ner(r)); 103 | } 104 | 105 | if token.pos == POSTag::NNS && self.noun_person.iter().any(|r| token.has_category(r)) { 106 | return true; 107 | } 108 | 109 | self.noun_entity.iter().any(|r| token.has_category(r)) 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/sophia/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Sophia NLU Engine (cicero-sophia) 3 | 4 | High-performance NLU (natural language understanding) engine built in Rust for speed, accuracy, and privacy. 5 | 6 | ![Crates.io](https://img.shields.io/crates/v/cicero-sophia.svg) 7 | ![Docs.rs](https://docs.rs/cicero-sophia/badge.svg) 8 | [![License: PolyForm Noncommercial 1.0.0](https://img.shields.io/badge/license-PolyForm--Noncommercial--1.0.0-blue.svg)](LICENSE) 9 | 10 | ## Features 11 | 12 | **Core Capabilities** 13 | 14 | * Industry-leading vocabulary with 914,000 (full) or 145,000 (lite) words 15 | * Sophisticated categorization system spanning 8,700+ hierarchical categories, allowing for easy word to action mapping 16 | * Advanced language processing including POS tagging, anaphora resolution, and named entity recognition 17 | * Intelligent phrase parsing with automated spelling correction 18 | 19 | **Performance** 20 | 21 | * Process ~25,000 words per second on a single thread 22 | * Lightweight deployment: Single 79MB (lite) or 177MB (full) data store 23 | * Zero external dependencies or API calls required 24 | * Privacy-focused with all processing done locally 25 | 26 | GitHub: https://github.com/cicero-ai/cicero/ 27 | 28 | ## License 29 | 30 | Dual license model: free for noncommercial use under the PolyForm Noncommercial License 1.0.0, with a premium license required for commercial use. For full details including an online demo, please visit: [https://cicero.sh/sophia/](https://cicero.sh/sophia/). 31 | 32 | 33 | ## Installation 34 | 35 | Add cicero-sophia to your project by including it in your Cargo.toml: 36 | 37 | ```toml 38 | [dependencies] 39 | cicero-sophia = "0.6.5" 40 | ``` 41 | 42 | 43 | ## Vocabulary Data Store 44 | 45 | To use Sophia, you must obtain the vocabulary data store, which is available free of charge. Simply visit [https://cicero.sh/](https://cicero.sh/), register for a free account, and download the vocabulary data store from within the member's area. 
46 | 47 | ## Usage 48 | 49 | **Example 1: Tokenizing Text** 50 | 51 | ```rust 52 | use sophia::{Sophia, Error}; 53 | 54 | fn main() -> Result<(), Error> { 55 | // Initialize Sophia 56 | let datadir = "./vocab_data"; 57 | let sophia = Sophia::new(datadir, "en")?; 58 | 59 | // Tokenize the input text 60 | let output = sophia.tokenize("The quick brown fox jumps over the lazy dog")?; 61 | 62 | // Print individual tokens 63 | println!("Individual Tokens:"); 64 | for token in output.iter() { 65 | println!(" Word: {} POS: {}", token.word, token.pos); 66 | } 67 | 68 | // Print MWEs 69 | println!("\nMulti-Word Entities (MWEs):"); 70 | for token in output.mwe() { 71 | println!(" Word: {} POS: {}", token.word, token.pos); 72 | } 73 | 74 | Ok(()) 75 | } 76 | ``` 77 | 78 | **Example 2: Interpreting Text** 79 | 80 | ```rust 81 | 82 | use sophia::{Sophia, Error}; 83 | 84 | fn main() -> Result<(), Error> { 85 | // Initialize Sophia 86 | let datadir = "./vocab_data"; 87 | let sophia = Sophia::new(datadir, "en")?; 88 | 89 | // Interpret the input text 90 | let output = sophia.interpret("The quick brown fox jumps over the lazy dog")?; 91 | 92 | // Print phrases 93 | println!("Phrases:"); 94 | for phrase in output.phrases.iter() { 95 | println!(" {:?}", phrase); 96 | } 97 | 98 | // Print individual tokens 99 | println!("\nIndividual Tokens:"); 100 | for token in output.tokens.iter() { 101 | println!(" Word: {} POS: {}", token.word, token.pos); 102 | } 103 | 104 | Ok(()) 105 | } 106 | ``` 107 | 108 | 109 | **Example 3: Retrieve individual word / token** 110 | 111 | ```rust 112 | 113 | use sophia::{Sophia, Error}; 114 | 115 | fn main() -> Result<(), Error> { 116 | // Initialize Sophia 117 | let datadir = "./vocab_data"; 118 | let sophia = Sophia::new(datadir, "en")?; 119 | 120 | // Get word 121 | let token = sophia.get_word("future").unwrap(); 122 | println!("Got word {}, id {}, pos {}", token.word, token.index, token.pos); 123 | 124 | // Get specific token 125 | let token = sophia.get_token(82251).unwrap(); 126 | println!("Got word {}, id {}, pos {}", token.word, token.index, token.pos); 127 | 128 | Ok(()) 129 | } 130 | ``` 131 | 132 | **Example 4: Retrieve Category** 133 | 134 | ```rust 135 | 136 | use sophia::{Sophia, Error}; 137 | 138 | fn main() -> Result<(), Error> { 139 | // Initialize Sophia 140 | let datadir = "./vocab_data"; 141 | let sophia = Sophia::new(datadir, "en")?; 142 | 143 | // Get category 144 | let cat = sophia.get_category("verbs/action/travel/depart").unwrap(); 145 | println!("name {}", cat.name); 146 | println!("fqn: {}", cat.fqn); 147 | println!("word ids: {:?}", cat.words); 148 | 149 | Ok(()) 150 | } 151 | ``` 152 | 153 | ## Contact 154 | 155 | For all inquiries, please complete the contact form at: https://cicero.sh/contact 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /src/sophia/src/vocab/mwe.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 
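//! //! A sketch of trie usage; the phrase and index value are illustrative: //! ```ignore //! let mut root = VocabMWE::new("", MWEType::standard); //! root.insert("New York City", 42, MWEType::standard); //! assert_eq!(root.get("new york city"), 42); //! ```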
6 | 
7 | use serde::{Deserialize, Serialize};
8 | use std::collections::HashMap;
9 | 
10 | /// Represents a multi-word entity (MWE) node in a trie-like structure, with index, type, capitalization, and child nodes.
11 | #[derive(Default, Serialize, Deserialize)]
12 | pub struct VocabMWE {
13 |     pub index: i32,
14 |     pub mwe_type: MWEType,
15 |     pub capitalization: Capitalization,
16 |     pub orig_word: String,
17 |     pub children: HashMap<String, Box<VocabMWE>>,
18 | }
19 | 
20 | /// Defines the type of a multi-word entity, which can be standard, scoring, or both.
21 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Clone)]
22 | pub enum MWEType {
23 |     #[default]
24 |     standard,
25 |     scoring,
26 |     both,
27 | }
28 | 
29 | /// Defines the capitalization style of a word, which can be lowercase, uppercase, title case, or other.
30 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Hash, Clone)]
31 | pub enum Capitalization {
32 |     #[default]
33 |     lower,
34 |     upper,
35 |     title,
36 |     other(String),
37 | }
38 | 
39 | impl VocabMWE {
40 |     /// Creates a new VocabMWE node for a word with the specified MWE type and inferred capitalization.
41 |     pub fn new(word: &str, mwe_type: MWEType) -> VocabMWE {
42 |         let capitalization = Self::classify_capitalization(word);
43 | 
44 |         VocabMWE {
45 |             index: 0,
46 |             mwe_type,
47 |             capitalization,
48 |             orig_word: String::new(),
49 |             children: HashMap::new(),
50 |         }
51 |     }
52 | 
53 |     /// Inserts a phrase into the MWE trie, assigning the given index and MWE type, and returns the index.
54 |     pub fn insert(&mut self, phrase: &str, index: i32, mwe_type: MWEType) -> i32 {
55 |         let mut current = self;
56 |         for word in phrase.split(" ").collect::<Vec<&str>>().iter() {
57 |             current = current
58 |                 .children
59 |                 .entry(word.to_lowercase().to_string())
60 |                 .or_insert(Box::new(VocabMWE::new(word, mwe_type.clone())));
61 | 
62 |             if current.mwe_type == MWEType::standard && mwe_type == MWEType::scoring {
63 |                 current.mwe_type = MWEType::both;
64 |             } else if current.mwe_type == MWEType::scoring && mwe_type == MWEType::standard {
65 |                 current.mwe_type = MWEType::both;
66 |             }
67 |         }
68 | 
69 |         current.index = index;
70 |         index
71 |     }
72 | 
73 |     /// Retrieves the index of a multi-word entity phrase from the trie, returning 0 if not found.
74 |     pub fn get(&self, phrase: &str) -> i32 {
75 |         let mut current = self;
76 |         for word in phrase.to_lowercase().split(" ").collect::<Vec<&str>>().iter() {
77 |             match current.children.get(*word) {
78 |                 Some(next) => current = next.as_ref(),
79 |                 None => return 0,
80 |             }
81 |         }
82 |         current.index
83 |     }
84 | 
85 |     /// Classifies the capitalization style of a string (lowercase, uppercase, title case, or other).
86 |     pub fn classify_capitalization(s: &str) -> Capitalization {
87 |         if s.to_lowercase() == s {
88 |             Capitalization::lower
89 |         } else if s.to_uppercase() == s {
90 |             Capitalization::upper
91 |         } else if s.chars().next().map_or(false, |c| c.is_uppercase()) && s.chars().skip(1).all(|c| c.is_lowercase()) {
92 |             Capitalization::title
93 |         } else {
94 |             Capitalization::other(s.to_string())
95 |         }
96 |     }
97 | 
98 |     /// Formats a word according to the node's capitalization style (lowercase, uppercase, title case, or original).
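    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a node built via
    /// `VocabMWE::new`, which infers `Capitalization::title` for "Alberta":
    ///
    /// ```ignore
    /// let node = VocabMWE::new("Alberta", MWEType::standard);
    /// assert_eq!(node.format("alberta"), "Alberta");
    /// ```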
99 |     pub fn format(&self, word: &str) -> String {
100 |         match self.capitalization {
101 |             Capitalization::lower => word.to_lowercase(),
102 |             Capitalization::upper => word.to_uppercase(),
103 |             Capitalization::title => format!(
104 |                 "{}{}",
105 |                 word.chars().next().unwrap().to_uppercase(),
106 |                 &word[1..].to_lowercase()
107 |             ),
108 |             _ => self.orig_word.to_string(),
109 |         }
110 |     }
111 | }
112 | 
113 | impl Capitalization {
114 |     /// Creates a Capitalization variant from a string value, using the original string for 'other' cases.
115 |     pub fn from_str(value: &str, orig: &str) -> Self {
116 |         match value {
117 |             "lower" => Self::lower,
118 |             "upper" => Self::upper,
119 |             "title" => Self::title,
120 |             "other" => Self::other(orig.to_string()),
121 |             _ => panic!("Invalid capitalization value, {}", value),
122 |         }
123 |     }
124 | }
125 | 
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/input.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::Token;
8 | 
9 | /// Represents the tokenized output of input text, including tokens, multi-word entities (MWEs), and iteration state.
10 | #[derive(Default, Clone)]
11 | pub struct TokenizedInput {
12 |     pub original: String,
13 |     pub tokens: Vec<Token>,
14 |     pub mwe: Vec<MWE>,
15 |     pub mwe_scoring: Vec<MWE>,
16 |     position: usize,
17 |     filter_mwe: bool,
18 |     filter_mwe_scoring: bool,
19 |     filter_stopwords: bool,
20 | }
21 | 
22 | /// Represents a multi-word entity (MWE) with a position and an optional associated token.
23 | #[derive(Clone)]
24 | pub struct MWE {
25 |     pub position: usize,
26 |     pub token: Option<Token>,
27 | }
28 | 
29 | impl TokenizedInput {
30 |     /// Creates a new TokenizedInput instance with the provided original text and empty token/MWE lists.
31 |     pub fn new(original: &str) -> Self {
32 |         Self {
33 |             original: original.to_string(),
34 |             tokens: Vec::new(),
35 |             mwe: Vec::new(),
36 |             mwe_scoring: Vec::new(),
37 |             position: 0,
38 |             filter_mwe: false,
39 |             filter_mwe_scoring: false,
40 |             filter_stopwords: false,
41 |         }
42 |     }
43 | 
44 |     /// Returns a new TokenizedInput configured to iterate over individual tokens.
45 |     pub fn iter(&self) -> Self {
46 |         let mut c = self.clone();
47 |         c.filter_mwe = false;
48 |         c.position = 0;
49 |         c
50 |     }
51 | 
52 |     /// Returns a new TokenizedInput configured to iterate over MWEs.
53 |     pub fn mwe(&self) -> Self {
54 |         let mut c = self.clone();
55 |         c.filter_mwe = true;
56 |         c.position = 0;
57 |         c
58 |     }
59 | 
60 |     /// Returns a new TokenizedInput configured to iterate over MWE scoring tokens.
61 |     pub fn mwe_scoring(&self) -> Self {
62 |         let mut c = self.clone();
63 |         c.filter_mwe_scoring = true;
64 |         c.position = 0;
65 |         c
66 |     }
67 | 
68 |     /// Configures the TokenizedInput to filter out stopwords during iteration.
69 |     pub fn remove_stop_words(mut self) -> Self {
70 |         self.filter_stopwords = true;
71 |         self
72 |     }
73 | 
74 |     /// Configures the TokenizedInput to include stopwords during iteration.
75 |     pub fn add_stop_words(mut self) -> Self {
76 |         self.filter_stopwords = false;
77 |         self
78 |     }
79 | 
80 |     /// Retrieves the next MWE token, either from the MWE's token or the token at the MWE's position.
81 |     fn next_mwe(&mut self) -> Option<Token> {
82 |         if self.position >= self.mwe.len() {
83 |             return None;
84 |         }
85 |         let mwe = self.mwe.get(self.position).unwrap();
86 |         self.position += 1;
87 | 
88 |         let token = match mwe.token.clone() {
89 |             Some(r) => r,
90 |             None => self.tokens.get(mwe.position).unwrap().clone(),
91 |         };
92 | 
93 |         Some(token)
94 |     }
95 | 
96 |     /// Retrieves the next MWE scoring token, either from the MWE scoring's token or the token at its position.
97 |     fn next_mwe_scoring(&mut self) -> Option<Token> {
98 |         if self.position >= self.mwe_scoring.len() {
99 |             return None;
100 |         }
101 |         let mwe = self.mwe_scoring.get(self.position).unwrap();
102 |         self.position += 1;
103 | 
104 |         let token = match mwe.token.clone() {
105 |             Some(r) => r,
106 |             None => self.tokens.get(mwe.position).unwrap().clone(),
107 |         };
108 | 
109 |         Some(token)
110 |     }
111 | }
112 | 
113 | impl std::ops::Index<usize> for TokenizedInput {
114 |     type Output = Token;
115 | 
116 |     /// Provides read-only indexing into the token vector by position.
117 |     fn index(&self, index: usize) -> &Self::Output {
118 |         &self.tokens[index]
119 |     }
120 | }
121 | 
122 | impl std::ops::IndexMut<usize> for TokenizedInput {
123 |     /// Provides mutable indexing into the token vector by position.
124 |     fn index_mut(&mut self, index: usize) -> &mut Self::Output {
125 |         &mut self.tokens[index]
126 |     }
127 | }
128 | 
129 | impl Iterator for TokenizedInput {
130 |     /// Advances the iterator, returning the next token based on the current filter (MWE, MWE scoring, or individual tokens).
131 |     type Item = Token;
132 | 
133 |     fn next(&mut self) -> Option<Self::Item> {
134 |         // MWE
135 |         if self.filter_mwe {
136 |             return self.next_mwe();
137 |         } else if self.filter_mwe_scoring {
138 |             return self.next_mwe_scoring();
139 |         }
140 | 
141 |         if self.position >= self.tokens.len() {
142 |             return None;
143 |         }
144 |         let token = self.tokens.get(self.position).unwrap();
145 |         self.position += 1;
146 | 
147 |         Some(token.clone())
148 |     }
149 | }
150 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpreter/interpreter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1)
3 | // See the full license at: https://cicero.sh/license.txt
4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
5 | 
6 | use super::{CoreferenceCategories, Interpretation, PhraseBuffer};
7 | use crate::interpreter::phrase::{Adjective, Adverb};
8 | use crate::pos_tagger::POSTag;
9 | use crate::tokenizer::{TokenizedInput, Tokenizer};
10 | use crate::vocab::VocabDatabase;
11 | use std::collections::HashMap;
12 | 
13 | pub struct Interpreter {
14 |     coref_categories: CoreferenceCategories,
15 | }
16 | 
17 | impl Interpreter {
18 |     /// Creates a new Interpreter instance from the provided vocabulary database.
19 |     pub fn new(vocab: &VocabDatabase) -> Self {
20 |         Self {
21 |             coref_categories: CoreferenceCategories::new(vocab),
22 |         }
23 |     }
24 | 
25 |     /// Interprets user input by tokenizing, processing, and categorizing tokens into an Interpretation struct.
26 |     /// Returns the constructed Interpretation with scores, tokens, multi-word expressions, and phrases.
27 |     pub fn interpret(
28 |         &self,
29 |         input: &str,
30 |         tokenizer: &Tokenizer,
31 |         vocab: &VocabDatabase,
32 |     ) -> Interpretation {
33 |         // Tokenize input
34 |         let mut tokens = tokenizer.encode(input, vocab);
35 |         let mut buffer = PhraseBuffer::new(&self.coref_categories, vocab);
36 | 
37 |         // Go through tokens
38 |         for (x, token) in tokens.mwe().enumerate() {
39 |             buffer.tokens.push(token.clone());
40 | 
41 |             // Check for phrase intent
42 |             if let Some((intent, length)) = vocab.words.phrase_intents.check(x, &tokens) {
43 |                 buffer.add_intent(intent, length);
44 |             }
45 | 
46 |             if token.is_sentence_stopper() {
47 |                 buffer.hard_split(x);
48 |             } else if token.is_noun()
49 |                 && buffer.last_pos == POSTag::VBG
50 |                 && !buffer.current_verbs.is_empty()
51 |             {
52 |                 buffer.current_verbs.last_mut().unwrap().objects.push(x);
53 |             } else if token.is_noun() {
54 |                 buffer.add_noun(x);
55 |             } else if vocab.preprocess.auxillary_verbs.contains(&token.index)
56 |                 || vocab.preprocess.predicative_verbs.contains(&token.index)
57 |             {
58 |                 if vocab.preprocess.auxillary_verbs.contains(&token.index) {
59 |                     buffer.auxillary_verbs.push(x);
60 |                 }
61 |                 if vocab.preprocess.predicative_verbs.contains(&token.index) {
62 |                     buffer.predicative_verbs.push(x);
63 |                 }
64 |             } else if token.is_verb() {
65 |                 buffer.add_verb(x);
66 |             } else if token.is_adverb() {
67 |                 buffer.adverbs.push(Adverb::new(x, &token, vocab));
68 |             } else if token.is_adjective() {
69 |                 buffer.adjectives.push(Adjective::new(x, &token, vocab));
70 |             } else if token.is_pronoun() {
71 |                 buffer.add_pronoun(x);
72 |             } else if token.is_preposition() {
73 |                 buffer.prepositions.push(x);
74 |             } else if token.is_determiner() {
75 |                 buffer.determiners.push(x);
76 |             } else if token.pos == POSTag::CC || [",", ";", "-", "+"].contains(&token.word.as_str())
77 |             {
78 |                 buffer.noun_seperators.push(x);
79 |             } else if !token.is_conjunction() {
80 |                 buffer.noise.push(x);
81 |             }
82 | 
83 |             // Linker
84 |             if token.is_conjunction() {
85 |                 buffer.linkers.push(x);
86 |             }
87 | 
88 |             // Splitter
89 |             if token.is_preposition() || token.is_conjunction() || token.word.as_str() == "," {
90 |                 buffer.splitters.push(x);
91 |             }
92 | 
93 |             // Add non-pronoun to antecedent buffer
94 |             if !token.is_pronoun() {
95 |                 buffer.antecedents.add_non_noun(&token);
96 |             }
97 |             buffer.last_pos = token.pos;
98 |         }
99 | 
100 |         // Finish buffer
101 |         buffer.hard_split(buffer.tokens.len() - 1);
102 | 
103 |         // Instantiate interpretation
104 |         Interpretation {
105 |             scores: self.get_scores(&tokens),
106 |             tokens: std::mem::take(&mut tokens.tokens),
107 |             mwe: std::mem::take(&mut buffer.tokens),
108 |             phrases: std::mem::take(&mut buffer.phrases),
109 |         }
110 |     }
111 | 
112 |     /// Computes classification scores for tokens by averaging scores per code from multi-word expression scoring.
113 |     /// Returns a HashMap mapping classification codes to their average scores.
114 |     fn get_scores(&self, tokens: &TokenizedInput) -> HashMap<i16, f32> {
115 |         let mut res: HashMap<i16, Vec<f32>> = HashMap::new();
116 |         for token in tokens.mwe_scoring() {
117 |             for (code, score) in token.classification_scores.iter() {
118 |                 res.entry(*code).or_default().push(score.to_f32());
119 |             }
120 |         }
121 | 
122 |         // Average scores
123 |         let mut scores: HashMap<i16, f32> = HashMap::new();
124 |         for (code, vec_scores) in res.iter() {
125 |             let avg = vec_scores.iter().sum::<f32>() / (vec_scores.len() as f32);
126 |             scores.insert(*code, avg);
127 |         }
128 | 
129 |         scores
130 |     }
131 | }
132 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpret/interpreter.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{CoreferenceCategories, Interpretation, PhraseBuffer};
8 | use crate::interpret::phrase::{Adjective, Adverb};
9 | use crate::pos_tagger::POSTag;
10 | use crate::tokenizer::{TokenizedInput, Tokenizer};
11 | use crate::vocab::VocabDatabase;
12 | use std::collections::HashMap;
13 | 
14 | pub struct Interpreter {
15 |     coref_categories: CoreferenceCategories,
16 | }
17 | 
18 | impl Interpreter {
19 |     /// Creates a new Interpreter instance from the provided vocabulary database.
20 |     pub fn new(vocab: &VocabDatabase) -> Self {
21 |         Self {
22 |             coref_categories: CoreferenceCategories::new(vocab),
23 |         }
24 |     }
25 | 
26 |     /// Interprets user input by tokenizing, processing, and categorizing tokens into an Interpretation struct.
27 |     /// Returns the constructed Interpretation with scores, tokens, multi-word expressions, and phrases.
28 |     pub fn interpret(
29 |         &self,
30 |         input: &str,
31 |         tokenizer: &Tokenizer,
32 |         vocab: &VocabDatabase,
33 |     ) -> Interpretation {
34 |         // Tokenize input
35 |         let mut tokens = tokenizer.encode(input, vocab);
36 |         let mut buffer = PhraseBuffer::new(&self.coref_categories, vocab);
37 | 
38 |         // Go through tokens
39 |         for (x, token) in tokens.mwe().enumerate() {
40 |             buffer.tokens.push(token.clone());
41 | 
42 |             // Check for phrase intent
43 |             if let Some((intent, length)) = vocab.words.phrase_intents.check(x, &tokens) {
44 |                 buffer.add_intent(intent, length);
45 |             }
46 | 
47 |             if token.is_sentence_stopper() {
48 |                 buffer.hard_split(x);
49 |             } else if token.is_noun()
50 |                 && buffer.last_pos == POSTag::VBG
51 |                 && !buffer.current_verbs.is_empty()
52 |             {
53 |                 buffer.current_verbs.last_mut().unwrap().objects.push(x);
54 |             } else if token.is_noun() {
55 |                 buffer.add_noun(x);
56 |             } else if vocab.preprocess.auxillary_verbs.contains(&token.index)
57 |                 || vocab.preprocess.predicative_verbs.contains(&token.index)
58 |             {
59 |                 if vocab.preprocess.auxillary_verbs.contains(&token.index) {
60 |                     buffer.auxillary_verbs.push(x);
61 |                 }
62 |                 if vocab.preprocess.predicative_verbs.contains(&token.index) {
63 |                     buffer.predicative_verbs.push(x);
64 |                 }
65 |             } else if token.is_verb() {
66 |                 buffer.add_verb(x);
67 |             } else if token.is_adverb() {
68 |                 buffer.adverbs.push(Adverb::new(x, &token, vocab));
69 |             } else if token.is_adjective() {
70 |                 buffer.adjectives.push(Adjective::new(x, &token, vocab));
71 |             } else if token.is_pronoun() {
72 |                 buffer.add_pronoun(x);
73 |             } else if token.is_preposition() {
74 |                 buffer.prepositions.push(x);
75 |             } else if token.is_determiner() {
76 |                 buffer.determiners.push(x);
77 |             } else if token.pos == POSTag::CC || [",", ";", "-", "+"].contains(&token.word.as_str())
78 |             {
79 |                 buffer.noun_seperators.push(x);
80 |             } else if !token.is_conjunction() {
81 |                 buffer.noise.push(x);
82 |             }
83 | 
84 |             // Linker
85 |             if token.is_conjunction() {
86 |                 buffer.linkers.push(x);
87 |             }
88 | 
89 |             // Splitter
90 |             if token.is_preposition() || token.is_conjunction() || token.word.as_str() == "," {
91 |                 buffer.splitters.push(x);
92 |             }
93 | 
94 |             // Add non-pronoun to antecedent buffer
95 |             if !token.is_pronoun() {
96 |                 buffer.antecedents.add_non_noun(&token);
97 |             }
98 |             buffer.last_pos = token.pos;
99 |         }
100 | 
101 |         // Finish buffer
102 |         buffer.hard_split(buffer.tokens.len() - 1);
103 | 
104 |         // Instantiate interpretation
105 |         Interpretation {
106 |             scores: self.get_scores(&tokens),
107 |             tokens: std::mem::take(&mut tokens.tokens),
108 |             mwe: std::mem::take(&mut buffer.tokens),
109 |             phrases: std::mem::take(&mut buffer.phrases),
110 |         }
111 |     }
112 | 
113 |     /// Computes classification scores for tokens by averaging scores per code from multi-word expression scoring.
114 |     /// Returns a HashMap mapping classification codes to their average scores.
115 |     fn get_scores(&self, tokens: &TokenizedInput) -> HashMap<i16, f32> {
116 |         let mut res: HashMap<i16, Vec<f32>> = HashMap::new();
117 |         for token in tokens.mwe_scoring() {
118 |             for (code, score) in token.classification_scores.iter() {
119 |                 res.entry(*code).or_default().push(score.to_f32());
120 |             }
121 |         }
122 | 
123 |         // Average scores
124 |         let mut scores: HashMap<i16, f32> = HashMap::new();
125 |         for (code, vec_scores) in res.iter() {
126 |             let avg = vec_scores.iter().sum::<f32>() / (vec_scores.len() as f32);
127 |             scores.insert(*code, avg);
128 |         }
129 | 
130 |         scores
131 |     }
132 | }
133 | 
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/tagger.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{HMM, POSModel, POSModelInterface, POSTag, POSTagModelRepo};
8 | use crate::tokenizer::{Token, TokenizedInput};
9 | use crate::vocab::VocabDatabase;
10 | use serde::{Deserialize, Serialize};
11 | use std::collections::HashMap;
12 | 
13 | /// The POS tagger itself including the base HMM,
14 | /// along with tag and word based post processing models
15 | #[derive(Default, Serialize, Deserialize)]
16 | pub struct POSTagger {
17 |     pub hmm: HMM,
18 |     pub cohort: POSModel,
19 |     pub tags: POSTagModelRepo,
20 |     pub words: HashMap<i32, POSModel>,
21 | }
22 | 
23 | #[derive(Default, Debug, Clone, Serialize, Deserialize)]
24 | pub struct POSPrediction {
25 |     pub method: POSPredictionMethod,
26 |     pub word: String,
27 |     pub prev_tag: POSTag,
28 |     pub tag: POSTag,
29 |     pub confidence: f32,
30 |     pub probabilities: HashMap<POSTag, f32>,
31 |     pub conjunctions: Vec<String>,
32 | }
33 | 
34 | #[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize)]
35 | pub enum POSPredictionMethod {
36 |     #[default]
37 |     non_ambiguous,
38 |     hmm,
39 |     standard,
40 |     conjunction,
41 |     deterministic_rule,
42 |     exception,
43 | }
44 | 
45 | impl POSTagger {
46 |     pub fn new() -> Self {
47 |         Self::default()
48 |     }
49 | 
50 |     /// Applies part-of-speech tagging to the tokenized input, resolving ambiguous words
51 |     pub fn apply(&self, output: &mut TokenizedInput, vocab: &VocabDatabase) {
52 |         // Fix spelling typos
53 |         self.fix_spelling_typos(output, vocab);
54 | 
55 |         // Resolve via HMM model
56 |         self.hmm.apply(&mut output.tokens);
57 | 
58 |         // Iterate through words
59 |         for position in 0..output.tokens.len() {
60 |             if output.tokens[position].potential_pos.len() < 2 {
61 |                 continue;
62 |             }
63 | 
64 |             // Resolve ambiguity
65 |             if let Some(pred) = self.resolve(position, output) {
66 |                 output.tokens[position].pos_prediction = pred.clone();
67 |                 if output.tokens[position].pos != pred.tag
68 |                     && let Some(new_token) = output.tokens[position].update_pos(pred.tag, vocab)
69 |                 {
70 |                     output.tokens[position] = new_token;
71 |                 }
72 |             }
73 |         }
74 |     }
75 | 
76 |     /// Fix spelling typos
77 |     fn fix_spelling_typos(&self, output: &mut TokenizedInput, vocab: &VocabDatabase) {
78 |         for position in 0..output.tokens.len() {
79 |             if output.tokens[position].pos != POSTag::FW {
80 |                 continue;
81 |             }
82 | 
83 |             // Get initial prediction
84 |             if let Some(pred) = self.cohort.predict_cohort(position, &output.tokens) {
85 |                 output.tokens[position].pos_prediction = pred;
86 | 
87 |                 // Get spelling correction
88 |                 if let Some(correction) =
89 |                     vocab.preprocess.spellchecker.try_correct(position, &output.tokens, vocab)
90 |                 {
91 |                     output.tokens[position] = correction;
92 |                 }
93 |             }
94 |         }
95 |     }
96 | 
97 |     // Resolve ambiguity
98 |     fn resolve(&self, position: usize, output: &TokenizedInput) -> Option<POSPrediction> {
99 |         // Check word models
100 |         if let Some(model) = self.words.get(&output.tokens[position].index)
101 |             && let Some(pred) = model.predict(position, &output.tokens)
102 |         {
103 |             return Some(pred);
104 |         }
105 | 
106 |         // Check tag models
107 |         if let Some(pred) = self.check_tag_models(position, &output.tokens) {
108 |             return Some(pred);
109 |         }
110 | 
111 |         None
112 |     }
113 | 
114 |     /// Check the tag models
115 |     fn check_tag_models(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
116 |         let tag = tokens[position].pos;
117 | 
118 |         // Check tag models
119 |         if let Some(model_names) = self.tags.tags.get(&tag) {
120 |             for name in model_names.iter() {
121 |                 let model = self.tags.models.get(&name.to_string()).unwrap();
122 | 
123 |                 // Ensure token is valid for model
124 |                 if !model.target_tags.contains(&tag) {
125 |                     continue;
126 |                 }
127 |                 if !tokens[position]
128 |                     .potential_pos
129 |                     .iter()
130 |                     .filter(|&p_tag| *p_tag != tag)
131 |                     .any(|p_tag| model.target_tags.contains(p_tag))
132 |                 {
133 |                     continue;
134 |                 }
135 | 
136 |                 if let Some(pred) = model.predict(position, tokens) {
137 |                     return Some(pred);
138 |                 }
139 |             }
140 |         }
141 | 
142 |         None
143 |     }
144 | }
145 | 
146 | impl POSPrediction {
147 |     pub fn new(
148 |         method: POSPredictionMethod,
149 |         word: &str,
150 |         prev_tag: POSTag,
151 |         tag: POSTag,
152 |         confidence: f32,
153 |         probabilities: &HashMap<POSTag, f32>,
154 |         conjunctions: &[String],
155 |     ) -> Self {
156 |         Self {
157 |             method,
158 |             word: word.to_string(),
159 |             prev_tag,
160 |             tag,
161 |             confidence,
162 |             probabilities: probabilities.clone(),
163 |             conjunctions: conjunctions.to_vec(),
164 |         }
165 |     }
166 | }
167 | 
--------------------------------------------------------------------------------
/src/sophia/src/vocab/database.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{
8 |     FutureVerbPhrases, PhraseIntents, SpellChecker, VocabCache, VocabCategoryDatabase, VocabMWE,
9 | };
10 | use crate::error::Error;
11 | use crate::pos_tagger::{POSTag, POSTagger};
12 | use crate::tokenizer::Token;
13 | use crate::vocab::mwe::Capitalization;
14 | use bincode;
15 | use indexmap::IndexMap;
16 | use serde::{Deserialize, Serialize};
17 | use std::collections::HashMap;
18 | use std::fs;
19 | use std::path::Path;
20 | use std::sync::Mutex;
21 | 
22 | /// A comprehensive vocabulary database for natural language processing, containing metadata, preprocessing data, words, categories, and a cache.
23 | #[derive(Serialize, Deserialize)]
24 | pub struct VocabDatabase {
25 |     pub meta: VocabDatabaseMeta,
26 |     pub preprocess: VocabPreProcessDatabase,
27 |     pub words: VocabWordDatabase,
28 |     pub categories: VocabCategoryDatabase,
29 |     #[serde(skip_serializing, skip_deserializing)]
30 |     pub cache: Mutex<VocabCache>,
31 | }
32 | 
33 | /// Metadata for the vocabulary database, including version, language, author, and integrity details.
34 | #[derive(Serialize, Deserialize)]
35 | pub struct VocabDatabaseMeta {
36 |     version: (i8, i8, i8),
37 |     language: String,
38 |     author: String,
39 |     creation_time: String,
40 |     sha256_hash: String,
41 |     signature: String,
42 |     comment: String,
43 | }
44 | 
45 | /// Preprocessing data for the vocabulary, including hashes, typos, spellchecker, verb prefixes, and other linguistic resources.
46 | #[derive(Serialize, Deserialize, Clone)]
47 | pub struct VocabPreProcessDatabase {
48 |     pub hashes: HashMap<String, (String, String)>,
49 |     pub spellchecker: SpellChecker,
50 |     pub future_verb_prefixes: Vec<i32>,
51 |     pub stop_words: Vec<i32>,
52 |     pub predicative_verbs: Vec<i32>,
53 |     pub auxillary_verbs: Vec<i32>,
54 |     pub infinitive_prefixes: Vec<i32>,
55 | }
56 | 
57 | /// Word-specific data for the vocabulary, including word lists, POS tagger, MWEs, capitalization, future verbs, and token mappings.
58 | #[derive(Serialize, Deserialize)]
59 | pub struct VocabWordDatabase {
60 |     pub wordlist: HashMap<String, IndexMap<POSTag, i32>>,
61 |     pub pos_tagger: POSTagger,
62 |     pub mwe: VocabMWE,
63 |     pub capitalization: HashMap<i32, Capitalization>,
64 |     pub future_verbs: FutureVerbPhrases,
65 |     pub phrase_intents: PhraseIntents,
66 |     pub id2token: HashMap<i32, Token>,
67 |     pub plural: HashMap<i32, i32>,
68 | }
69 | 
70 | impl VocabDatabase {
71 |     /// Saves the vocabulary database to a file using bincode serialization.
72 |     pub fn save(&mut self, filename: &str) -> Result<(), Error> {
73 |         let encoded: Vec<u8> = match bincode::serialize(&self) {
74 |             Ok(r) => r,
75 |             Err(e) => {
76 |                 return Err(Error::Save(format!(
77 |                     "Unable to serialize vocabulary data store, {}",
78 |                     e
79 |                 )));
80 |             }
81 |         };
82 |         fs::write(filename, &encoded)?;
83 |         Ok(())
84 |     }
85 | 
86 |     /// Loads a vocabulary database from a file in the specified directory, initializing the cache.
87 |     pub fn load(datadir: &str, language: &str) -> Result<Self, Error> {
88 |         let filename = format!("{}/{}.dat", datadir, language);
89 |         if !Path::new(&filename).exists() {
90 |             return Err(Error::Load(format!(
91 |                 "No vocabulary file exists at, {}",
92 |                 filename
93 |             )));
94 |         }
95 |         let contents = fs::read(&filename)?;
96 | 
97 |         let mut vocab: VocabDatabase = match bincode::deserialize(&contents[..]) {
98 |             Ok(r) => r,
99 |             Err(e) => {
100 |                 return Err(Error::Load(format!(
101 |                     "Unable to load the vocabulary file. Please ensure the correct file is in place, and re-download from the secure client area if necessary. Contact customer support if the problem persists. Error: {}",
102 |                     e
103 |                 )));
104 |             }
105 |         };
106 | 
107 |         vocab.cache = Mutex::new(VocabCache::load(datadir)?);
108 |         Ok(vocab)
109 |     }
110 | 
111 |     /// Looks up a word by string, returning a Token based on its vocabulary entry.
112 |     pub fn from_str(&self, word: &str) -> Token {
113 |         let (_, lookup) = match self.lookup_word(word) {
114 |             Some(r) => r,
115 |             None => return Token::default(),
116 |         };
117 | 
118 |         // get token
119 |         let token_id = lookup.values().next().unwrap();
120 |         self.words.id2token.get(&token_id.clone()).unwrap().clone()
121 |     }
122 | 
123 |     /// Converts a word to its corresponding token ID.
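    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a loaded
    /// `VocabDatabase` named `vocab`:
    ///
    /// ```ignore
    /// let id = vocab.to_int("future");
    /// println!("token id: {}", id);
    /// ```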
124 |     pub fn to_int(&self, word: &str) -> i32 {
125 |         let token = self.from_str(word);
126 |         token.index
127 |     }
128 | 
129 |     /// Looks up a word in the vocabulary, returning its string and POS-to-ID mapping if found.
130 |     pub fn lookup_word(&self, word: &str) -> Option<(String, IndexMap<POSTag, i32>)> {
131 |         // Check mwe
132 |         if word.contains(" ") {
133 |             return None;
134 |         }
135 | 
136 |         // Straight lookup
137 |         if let Some(pos_map) = self.words.wordlist.get(word) {
138 |             return Some((word.to_string(), pos_map.clone()));
139 |         }
140 | 
141 |         // Lowercase lookup
142 |         if let Some(index) = self.words.wordlist.get(&word.to_lowercase()) {
143 |             return Some((word.to_string(), index.clone()));
144 |         }
145 | 
146 |         None
147 |     }
148 | 
149 |     /// Creates a Token from a given token ID using the vocabulary database.
150 |     pub fn from_int(&self, token_id: i32) -> Token {
151 |         Token::from_id(token_id, self)
152 |     }
153 | }
154 | 
155 | impl Default for VocabDatabaseMeta {
156 |     fn default() -> VocabDatabaseMeta {
157 |         VocabDatabaseMeta {
158 |             version: (1, 0, 0),
159 |             language: "en".to_string(),
160 |             author: "Aquila Labs".to_string(),
161 |             creation_time: String::new(),
162 |             sha256_hash: String::new(),
163 |             signature: String::new(),
164 |             comment: String::new(),
165 |         }
166 |     }
167 | }
168 | 
--------------------------------------------------------------------------------
/src/sophia/src/vocab/category.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::VocabDatabase;
8 | use indexmap::IndexMap;
9 | use serde::{Deserialize, Serialize};
10 | use std::collections::HashMap;
11 | use std::fmt;
12 | use std::ops::Range;
13 | 
14 | /// A database for storing vocabulary categories, including nouns, verbs, adverbs, adjectives, and named entity recognition (NER) indices.
15 | #[derive(Default, Serialize, Deserialize, Clone)]
16 | pub struct VocabCategoryDatabase {
17 |     pub counter: i16,
18 |     pub nodes: HashMap<i16, VocabCategory>,
19 |     pub nouns: VocabCategoryIndex,
20 |     pub verbs: VocabCategoryIndex,
21 |     pub adverbs: VocabCategoryIndex,
22 |     pub adjectives: VocabCategoryIndex,
23 |     pub ner: VocabCategoryIndex,
24 | }
25 | 
26 | /// A trie-like index for vocabulary categories, mapping paths to category indices and their children.
27 | #[derive(Serialize, Deserialize, Clone)]
28 | pub struct VocabCategoryIndex {
29 |     pub index: i16,
30 |     pub children: IndexMap<String, Box<VocabCategoryIndex>>,
31 | }
32 | 
33 | /// Represents a single category with its fully qualified name (FQN), depth, name, and child categories.
34 | #[derive(Serialize, Deserialize, Clone)]
35 | pub struct VocabCategory {
36 |     pub fqn: Vec<i16>,
37 |     pub depth: i8,
38 |     pub name: String,
39 |     pub children: IndexMap<String, i16>,
40 |     #[serde(skip)]
41 |     pub pos: String,
42 |     #[serde(skip)]
43 |     pub words: Vec<i32>,
44 | }
45 | 
46 | impl VocabCategoryDatabase {
47 |     /// Get category by path name
48 |     pub fn get_category_by_path(&self, path: &str, vocab: &VocabDatabase) -> Option<VocabCategory> {
49 |         // Split path
50 |         let parts: Vec<&str> = path.split("/").collect::<Vec<&str>>();
51 |         if parts.len() < 2 {
52 |             return None;
53 |         }
54 |         let remaining_path = parts[1..].join("/").to_string();
55 | 
56 |         // Get index
57 |         let cat_index = match parts[0] {
58 |             "nouns" => self.nouns.index_by_path(&remaining_path)?,
59 |             "verbs" => self.verbs.index_by_path(&remaining_path)?,
60 |             "adverbs" => self.adverbs.index_by_path(&remaining_path)?,
61 |             "adjectives" => self.adjectives.index_by_path(&remaining_path)?,
62 |             "ner" => self.ner.index_by_path(&remaining_path)?,
63 |             _ => return None,
64 |         };
65 |         let index = cat_index.index;
66 | 
67 |         // Get category
68 |         let mut cat: VocabCategory = self.nodes.get(&index)?.clone();
69 |         cat.pos = parts[0].to_string();
70 |         cat.words = vocab
71 |             .words
72 |             .id2token
73 |             .iter()
74 |             .filter(|(_, token)| token.categories.contains(&index))
75 |             .map(|(id, _)| *id)
76 |             .collect();
77 | 
78 |         Some(cat)
79 |     }
80 | }
81 | 
82 | impl VocabCategoryIndex {
83 |     /// Creates a new VocabCategoryIndex with default values.
84 |     pub fn new() -> Self {
85 |         Self {
86 |             index: 0,
87 |             children: IndexMap::new(),
88 |         }
89 |     }
90 | 
91 |     /// Inserts a category path into the index, assigning the given index and returning it.
92 |     pub fn insert(&mut self, path: &str, index: i16) -> i16 {
93 |         let mut current = self;
94 |         for word in path.split("/").collect::<Vec<&str>>().iter() {
95 |             current = current
96 |                 .children
97 |                 .entry(word.to_lowercase().to_string())
98 |                 .or_insert(Box::new(VocabCategoryIndex::new()));
99 |         }
100 | 
101 |         current.index = index;
102 |         index
103 |     }
104 | 
105 |     /// Retrieves the category index for a given path, if it exists.
106 |     pub fn by_path(&self, path: &str) -> Option<i16> {
107 |         if let Some(cat) = self.index_by_path(path) {
108 |             return Some(cat.index);
109 |         }
110 |         None
111 |     }
112 | 
113 |     /// Retrieves the VocabCategoryIndex object for a given path, if it exists.
114 |     pub fn index_by_path(&self, path: &str) -> Option<VocabCategoryIndex> {
115 |         let mut current = self;
116 |         for word in path.to_lowercase().split("/").collect::<Vec<&str>>().iter() {
117 |             match current.children.get(&word.to_string()) {
118 |                 Some(next) => current = next.as_ref(),
119 |                 None => return None,
120 |             }
121 |         }
122 |         Some(current.clone())
123 |     }
124 | 
125 |     /// Returns the range of category IDs for a path, including its children, if the path exists.
126 |     pub fn path2range(&self, path: &str) -> Option<Range<i16>> {
127 |         if let Some(r) = self.index_by_path(path) {
128 |             return Some(r.index..(r.index + (r.count_children() + 1) as i16));
129 |         }
130 | 
131 |         None
132 |     }
133 | 
134 |     /// Counts the total number of children under this index, including nested children.
135 |     pub fn count_children(&self) -> usize {
136 |         let mut count = self.children.len();
137 | 
138 |         for (_, child) in self.children.iter() {
139 |             count += child.count_children();
140 |         }
141 |         count
142 |     }
143 | }
144 | 
145 | impl VocabCategoryDatabase {
146 |     /// Retrieves a category by its ID, if it exists.
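    ///
    /// # Example
    ///
    /// Illustrative sketch, not from the original sources; it assumes a populated
    /// `VocabCategoryDatabase` named `categories` and a known category id:
    ///
    /// ```ignore
    /// if let Some(cat) = categories.get(&42) {
    ///     println!("category: {}", cat.name);
    /// }
    /// ```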
147 |     pub fn get(&self, category_id: &i16) -> Option<VocabCategory> {
148 |         if let Some(cat) = self.nodes.get(&category_id.clone()) {
149 |             return Some(cat.clone());
150 |         }
151 |         None
152 |     }
153 | 
154 |     /// Retrieves a noun category by its path, if it exists.
155 |     pub fn nouns_by_path(&self, path: &str) -> Option<VocabCategory> {
156 |         match self.nouns.by_path(path) {
157 |             Some(r) => self.get(&r),
158 |             None => None,
159 |         }
160 |     }
161 | 
162 |     /// Returns the number of children for a given category ID.
163 |     pub fn get_children_count(&self, category_id: &i16) -> usize {
164 |         let node = self.nodes.get(&category_id.clone()).unwrap();
165 |         node.children.len()
166 |     }
167 | 
168 |     /// Retrieves the fully qualified names of a category's parent categories.
169 |     pub fn get_fqn(&self, category: &VocabCategory) -> Vec<String> {
170 |         let mut names: Vec<String> = Vec::new();
171 |         for parent_id in category.fqn.iter() {
172 |             let parent_name: String = match self.nodes.get(parent_id) {
173 |                 Some(r) => r.name.to_string(),
174 |                 None => String::from("Unknown"),
175 |             };
176 |             names.push(parent_name.to_string());
177 |         }
178 | 
179 |         names
180 |     }
181 | }
182 | 
183 | impl VocabCategory {}
184 | 
185 | impl fmt::Display for VocabCategory {
186 |     /// Formats the VocabCategory for display, showing its name.
187 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
188 |         write!(f, "{}", self.name)
189 |     }
190 | }
191 | 
192 | impl fmt::Debug for VocabCategory {
193 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
194 |         let fqn = self.fqn.iter().map(|id| id.to_string()).collect::<Vec<String>>();
195 |         write!(f, "{} -> {}", fqn.join("/"), self.name)
196 |     }
197 | }
198 | 
199 | impl Default for VocabCategoryIndex {
200 |     fn default() -> VocabCategoryIndex {
201 |         VocabCategoryIndex {
202 |             index: 0,
203 |             children: IndexMap::new(),
204 |         }
205 |     }
206 | }
207 | 
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/cleaner.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use super::{Buffer, Token};
8 | use crate::vocab::VocabDatabase;
9 | 
10 | static SPECIAL_CHARS: &[char] = &[
11 |     '~', '`', '!', '@', '#', '$', '%', '&', '*', '(', ')', '-', '_', '+', '[', ']', '{', '}', '\\',
12 |     '|', ';', ':', '\'', '"', ',', '.', '<', '>',
13 | ];
14 | static NUMERIC_CHARS: &[char] = &['.', ',', '^', '*', '/', ':'];
15 | 
16 | /// A utility for cleaning and classifying tokens, tracking character properties like numeric status and special characters.
17 | #[derive(Default)]
18 | pub struct TokenCleaner {
19 |     chars: Vec<char>,
20 |     word_len: usize,
21 |     numeric_len: usize,
22 |     pub is_numeric: bool,
23 |     pub has_decimal: bool,
24 |     pub has_special: bool,
25 | }
26 | 
27 | impl TokenCleaner {
28 |     /// Creates a new TokenCleaner instance with default values, marking it as numeric.
29 |     pub fn new() -> Self {
30 |         Self {
31 |             is_numeric: true,
32 |             ..Default::default()
33 |         }
34 |     }
35 | 
36 |     /// Resets the TokenCleaner to its initial state, equivalent to calling `new`.
37 |     pub fn reset(&mut self) {
38 |         *self = Self::new();
39 |     }
40 | 
41 |     /// Cleans and classifies a word, updating the buffer with tokens for prefixes, suffixes, or special cases, returning the cleaned word if applicable.
42 |     pub fn clean(
43 |         &mut self,
44 |         mut word: String,
45 |         vocab: &VocabDatabase,
46 |         buffer: &mut Buffer,
47 |     ) -> Option<String> {
48 |         // Scan characters
49 |         self.scan_chars(&mut word, vocab, buffer);
50 | 
51 |         // Classify numeric
52 |         if self.is_numeric {
53 |             self.classify_numeric(&word, vocab, buffer);
54 |             return None;
55 |         }
56 | 
57 |         self.classify_token(&word, vocab, buffer)
58 |     }
59 | 
60 |     /// Scans characters in a word, stripping prefixes/suffixes, updating possessive status, and tracking numeric/special properties.
61 |     fn scan_chars(&mut self, word: &mut String, vocab: &VocabDatabase, buffer: &mut Buffer) {
62 |         // Check for possession
63 |         if word.ends_with("'s") {
64 |             buffer.is_possessive = true;
65 |             *word = word[..word.len() - 2].to_string();
66 |         }
67 | 
68 |         // Iterate through chars
69 |         let mut in_prefix = true;
70 |         for (x, c) in word.chars().enumerate() {
71 |             // Check for currency
72 |             if x == 0 && (c == '-' || c == '+' || c.is_currency_symbol()) {
73 |                 self.chars.push(c);
74 | 
75 |             // Prefix symbol
76 |             } else if in_prefix && SPECIAL_CHARS.contains(&c) {
77 |                 buffer.push_token(Token::prefix(&c.to_string(), vocab));
78 |             } else {
79 |                 in_prefix = false;
80 | 
81 |                 // Update word ending position
82 |                 if c.is_alphanumeric() {
83 |                     self.word_len = self.chars.len() + 1;
84 |                 }
85 | 
86 |                 // Check if it's numeric
87 |                 if self.is_numeric && !self.check_numeric(c) {
88 |                     self.is_numeric = false;
89 |                     self.numeric_len = self.chars.len();
90 |                 }
91 |                 self.chars.push(c);
92 |             }
93 |         }
94 |     }
95 | 
96 |     /// Classifies a numeric word as a time or general numeric token, pushing it to the buffer.
97 |     fn classify_numeric(&mut self, word: &str, vocab: &VocabDatabase, buffer: &mut Buffer) {
98 |         if self.is_time() {
99 |             buffer.push_token(Token::special(word, "|time|", "", "", vocab));
100 |         } else {
101 |             buffer.push_token(Token::numeric(word, vocab));
102 |         }
103 |     }
104 | 
105 |     /// Classifies a non-numeric token, handling numeric suffixes (e.g., decades, ordinals) and returning the cleaned word or None if fully processed.
106 |     fn classify_token(
107 |         &mut self,
108 |         word: &str,
109 |         vocab: &VocabDatabase,
110 |         buffer: &mut Buffer,
111 |     ) -> Option<String> {
112 |         // Check for numeric with suffix (eg. 3rd, 90s).
113 |         if self.numeric_len > 0 {
114 |             let suffix: String = self.chars[self.numeric_len..].iter().collect();
115 | 
116 |             if self.is_decade(&suffix) {
117 |                 let value = self.chars[..self.numeric_len].iter().collect::<String>();
118 |                 buffer.push_token(Token::special(
119 |                     word,
120 |                     "|date_period|",
121 |                     &value,
122 |                     &suffix,
123 |                     vocab,
124 |                 ));
125 |                 return None;
126 |             } else if let Some((suffix_tag, _)) = vocab.preprocess.hashes.get(&suffix) {
127 |                 let value = self.chars[..self.numeric_len].iter().collect::<String>();
128 |                 buffer.push_token(Token::special(word, suffix_tag, &value, &suffix, vocab));
129 |                 return None;
130 |             }
131 |         }
132 | 
133 |         // Add suffix to buffer
134 |         for c in self.chars[self.word_len..].iter() {
135 |             buffer.prepend_suffix(&Token::suffix(&c.to_string(), vocab));
136 |         }
137 | 
138 |         if self.word_len > 0 {
139 |             Some(self.chars[..self.word_len].iter().collect())
140 |         } else {
141 |             None
142 |         }
143 |     }
144 | 
145 |     /// Checks if a character maintains the numeric status of a word, updating decimal and special character flags.
146 |     fn check_numeric(&mut self, c: char) -> bool {
147 |         let mut ok = false;
148 | 
149 |         // Digit
150 |         if c.is_ascii_digit() {
151 |             ok = true;
152 | 
153 |         // Special numeric character (. , /, etc.)
154 |         } else if NUMERIC_CHARS.contains(&c) {
155 |             // Only one non-comma character allowed
156 |             if (self.has_decimal || self.has_special) && c != ',' {
157 |                 self.is_numeric = false;
158 |                 return false;
159 |             }
160 | 
161 |             // Decimal or other character?
162 |             if c == '.' {
163 |                 self.has_decimal = true;
164 |             } else if c != ',' {
165 |                 self.has_special = true;
166 |             }
167 |             ok = true;
168 |         }
169 | 
170 |         ok
171 |     }
172 | 
173 |     /// Determines if the character sequence represents a time format (e.g., H:MM or HH:MM).
174 |     pub fn is_time(&self) -> bool {
175 |         let res = &self.chars;
176 | 
177 |         // Check for H:MM
178 |         if res.len() == 4 {
179 |             if res[1] != ':' || res[0] == '0' {
180 |                 return false;
181 |             }
182 | 
183 |             // Ensure valid minutes
184 |             match format!("{}{}", res[2], res[3]).parse::<u32>() {
185 |                 Ok(mins) => {
186 |                     if mins > 59 {
187 |                         return false;
188 |                     }
189 |                 }
190 |                 Err(_) => return false,
191 |             };
192 | 
193 |         // Check for HH:MM
194 |         } else if res.len() == 5 {
195 |             if res[2] != ':' || !['0', '1', '2'].contains(&res[0]) {
196 |                 return false;
197 |             }
198 | 
199 |             // Ensure valid minutes
200 |             match format!("{}{}", res[3], res[4]).parse::<u32>() {
201 |                 Ok(mins) => {
202 |                     if mins > 59 {
203 |                         return false;
204 |                     }
205 |                 }
206 |                 Err(_) => return false,
207 |             };
208 |         } else {
209 |             return false;
210 |         }
211 | 
212 |         true
213 |     }
214 | 
215 |     /// Checks if the character sequence represents a decade (e.g., 90s or 1990s).
216 |     fn is_decade(&self, suffix: &str) -> bool {
217 |         if suffix != "s" {
218 |             return false;
219 |         }
220 |         let res = &self.chars;
221 | 
222 |         // Check for 2 or 4 digit year
223 | 
224 |         (res.len() == 3 && res[1] == '0')
225 |             || (res.len() == 5 && res[3] == '0' && (res[0] == '1' || res[0] == '2'))
226 |     }
227 | }
228 | 
229 | /// A trait for checking if a character is a currency symbol.
230 | trait IsCurrencySymbol {
231 |     fn is_currency_symbol(self) -> bool;
232 | }
233 | 
234 | impl IsCurrencySymbol for char {
235 |     /// Implements `IsCurrencySymbol` for `char`, checking if the character is a currency symbol ($, €, £, ¥).
236 |     fn is_currency_symbol(self) -> bool {
237 |         matches!(self, '$' | '€' | '£' | '¥')
238 |     }
239 | }
240 | 
--------------------------------------------------------------------------------
/src/sophia/src/sophia.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 | 
7 | use crate::error::Error;
8 | use crate::interpret::{Interpretation, Interpreter};
9 | use crate::tokenizer::{Token, TokenizedInput, Tokenizer};
10 | use crate::vocab::{VocabCategory, VocabDatabase, VocabStats};
11 | 
12 | /// The main entry point for the Sophia natural language processing library, integrating tokenization and interpretation capabilities.
13 | ///
14 | /// Contains everything you need for NLU (natural language understanding) tasks.
15 | /// Supports simple tokenization of input into individual words, or into MWEs (multi-word entities) mixed with individual words, along with
16 | /// interpreting user input and breaking it down into usable phrase, noun, verb, and other constructs.
17 | pub struct Sophia {
18 |     pub datadir: String,
19 |     _language: String,
20 |     pub vocab: VocabDatabase,
21 |     pub tokenizer: Tokenizer,
22 |     pub interpreter: Interpreter,
23 | }
24 | 
25 | impl Sophia {
26 |     /// Creates a new `Sophia` instance, loading the vocabulary database from the specified directory and language.
27 |     ///
28 |     /// # Arguments
29 |     /// - `datadir`: The path to the directory containing the vocabulary database files.
30 |     /// - `language`: The language code and filename of the .dat vocabulary file (eg. 'en' for the 'en.dat' file)
31 |     ///
32 |     /// # Returns
33 |     /// A `Result` containing the initialized `Sophia` instance, or an `Error` if the vocabulary cannot be loaded.
34 |     ///
35 |     pub fn new(datadir: &str, language: &str) -> Result<Self, Error> {
36 |         let vocab = VocabDatabase::load(datadir, language)?;
37 | 
38 |         Ok(Self {
39 |             datadir: datadir.to_string(),
40 |             _language: language.to_string(),
41 |             interpreter: Interpreter::new(&vocab),
42 |             tokenizer: Tokenizer::new(),
43 |             vocab,
44 |         })
45 |     }
46 | 
47 |     /// Tokenizes the input text into a `TokenizedInput` containing tokens and MWEs.
48 |     ///
49 |     /// This method processes the input string using the `Tokenizer`, breaking it into individual tokens and identifying multi-word entities (MWEs).
50 |     /// The resulting `TokenizedInput` can be iterated to access tokens or MWEs, with optional filtering for stopwords.
51 |     ///
52 |     /// # Arguments
53 |     /// - `input`: The text to tokenize.
54 |     ///
55 |     /// # Returns
56 |     /// A `TokenizedInput` with the tokenized representation of the input text.
57 |     ///
58 |     /// # Example
59 |     ///
60 |     /// ```no_run
61 |     ///
62 |     /// use sophia::{Sophia, Error};
63 |     ///
64 |     /// fn main() -> Result<(), Error> {
65 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
66 |     ///     let output = sophia.tokenize("The quick brown fox jumps");
67 |     ///
68 |     ///     // Iterate over individual tokens
69 |     ///     for token in output.iter() {
70 |     ///         println!("Word: {}, POS: {}", token.word, token.pos);
71 |     ///     }
72 |     ///
73 |     ///     // Iterate over MWEs
74 |     ///     for token in output.mwe() {
75 |     ///         println!("MWE: {}, POS: {}", token.word, token.pos);
76 |     ///     }
77 |     ///
78 |     ///     Ok(())
79 |     /// }
80 |     /// ```
81 |     pub fn tokenize(&self, input: &str) -> TokenizedInput {
82 |         self.tokenizer.encode(input, &self.vocab)
83 |     }
84 | 
85 |     /// Interprets the input text, and returns an `Interpretation` with tokens, MWEs and usable phrases.
86 |     ///
87 |     /// This method first tokenizes the input using the `Tokenizer` and then processes the tokens using the `Interpreter` to generate a structured
88 |     /// interpretation. The result includes individual tokens, MWEs, and phrases with associated scores for semantic analysis.
89 |     ///
90 |     /// # Arguments
91 |     /// - `input`: The text to interpret.
92 |     ///
93 |     /// # Returns
94 |     /// An `Interpretation` with the analyzed structure of the input text.
95 |     /// # Example
96 |     ///
97 |     /// ```no_run
98 |     ///
99 |     /// use sophia::{Sophia, Error};
100 |     ///
101 |     /// fn main() -> Result<(), Error> {
102 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
103 |     ///     let output = sophia.interpret("The quick brown fox jumps over the fallen tree while running through the forest with his friends.");
104 |     ///
105 |     ///     // Iterate over phrases
106 |     ///     for phrase in output.phrases.iter() {
107 |     ///         println!("Phrase: {:?}", phrase);
108 |     ///         for noun in phrase.nouns.iter() {
109 |     ///             println!("Noun Head: {}", output.tokens[noun.head].word);
110 |     ///         }
111 |     ///         for verb in phrase.verbs.iter() {
112 |     ///             println!("Verb Head: {}", output.tokens[verb.head].word);
113 |     ///         }
114 |     ///     }
115 |     ///
116 |     ///     // Iterate over individual tokens
117 |     ///     for token in output.tokens.iter() {
118 |     ///         println!("Word: {}, POS: {}", token.word, token.pos);
119 |     ///     }
120 |     ///
121 |     ///     Ok(())
122 |     /// }
123 |     /// ```
124 |     pub fn interpret(&self, input: &str) -> Interpretation {
125 |         self.interpreter.interpret(input, &self.tokenizer, &self.vocab)
126 |     }
127 | 
128 |     /// Gets an individual token by its index id#
129 |     ///
130 |     /// # Arguments
131 |     /// - `index`: The index id# of the token to retrieve
132 |     ///
133 |     /// # Returns
134 |     /// An `Option<Token>` containing the token, or `None` if the index id# does not exist.
135 |     /// # Example
136 |     ///
137 |     /// ```no_run
138 |     ///
139 |     /// use sophia::{Sophia, Error};
140 |     /// fn main() -> Result<(), Error> {
141 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
142 |     ///     let output = sophia.tokenize("She was running down the road");
143 |     ///
144 |     ///     // Get the stem of 'running'
145 |     ///     let index = output.tokens[2].stem; // the index id# of the stem of 'running'.
146 |     ///     if let Some(token) = sophia.get_token(index) {
147 |     ///         println!("Stem of running is {}", token.word);
148 |     ///     }
149 |     ///     Ok(())
150 |     /// }
151 |     /// ```
152 |     pub fn get_token(&self, index: i32) -> Option<Token> {
153 |         let token = self.vocab.words.id2token.get(&index)?;
154 |         let mut res = token.clone();
155 |         res.index = index;
156 |         Some(res)
157 |     }
158 | 
159 |     /// Gets an individual token by word.
160 |     ///
161 |     /// # Arguments
162 |     /// - `word`: The word to lookup and retrieve `Token` for.
163 |     ///
164 |     /// # Returns
165 |     /// An `Option<Token>` containing the token, or `None` if the word does not exist.
166 |     /// # Example
167 |     ///
168 |     /// ```no_run
169 |     ///
170 |     /// use sophia::{Sophia, Error};
171 |     /// fn main() -> Result<(), Error> {
172 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
173 |     ///     if let Some(token) = sophia.get_word("running") {
174 |     ///         println!("got token {}", token);
175 |     ///     }
176 |     ///     Ok(())
177 |     /// }
178 |     /// ```
179 |     pub fn get_word(&self, word: &str) -> Option<Token> {
180 |         // Check wordlist
181 |         let pos_map = self.vocab.words.wordlist.get(word)?;
182 | 
183 |         // Get token
184 |         let (_, index) = pos_map.first().unwrap();
185 |         let token = self.vocab.words.id2token.get(index)?;
186 |         let mut res = token.clone();
187 | 
188 |         res.index = *index;
189 |         res.potential_pos = pos_map.keys().copied().collect();
190 | 
191 |         Some(res)
192 |     }
193 | 
194 |     /// Gets a category by its path.
195 |     ///
196 |     /// # Arguments
197 |     /// - `category_path`: The full category path to lookup (eg. verbs/action/search/retrieve/pursue)
198 |     ///
199 |     /// # Returns
200 |     /// An `Option<VocabCategory>` containing the category, or `None` if the category path does not exist.
201 |     /// # Example
202 |     ///
203 |     /// ```no_run
204 |     ///
205 |     /// use sophia::{Sophia, Error};
206 |     /// fn main() -> Result<(), Error> {
207 |     ///     let sophia = Sophia::new("./vocab_data", "en")?;
208 |     ///     if let Some(cat) = sophia.get_category("verbs/action/search/retrieve/pursue") {
209 |     ///         println!("got category fqn {:?}", cat.fqn);
210 |     ///     }
211 |     ///     Ok(())
212 |     /// }
213 |     /// ```
214 |     pub fn get_category(&self, category_path: &str) -> Option<VocabCategory> {
215 |         self.vocab.categories.get_category_by_path(category_path, &self.vocab)
216 |     }
217 | 
218 |     /// Returns various statistics regarding the loaded vocabulary file such as total singular / ambiguous words, MWEs, POS tags, and more.
219 |     pub fn get_vocab_stats(&self) -> VocabStats {
220 |         VocabStats::compile(&self.vocab)
221 |     }
222 | }
223 | 
--------------------------------------------------------------------------------
/src/sophia/src/interpreter/antecedent_buffer.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1)
3 | // See the full license at: https://cicero.sh/license.txt
4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
5 | 
6 | use super::CoreferenceCategories;
7 | use crate::pos_tagger::POSTag;
8 | use crate::tokenizer::Token;
9 | use crate::vocab::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson};
10 | use std::collections::HashSet;
11 | 
12 | /// Manages antecedents for coreference resolution, tracking primary and secondary antecedents, person counts, and plural references.
13 | #[derive(Default)]
14 | pub struct AntecedentBuffer {
15 |     coref: CoreferenceCategories,
16 |     count: usize,
17 |     primary: Antecedent,
18 |     secondary: Vec<Antecedent>,
19 |     last_person: String,
20 |     plural_person: HashSet<String>,
21 |     primary_object: String,
22 | }
23 | 
24 | /// Represents an antecedent with a name, part-of-speech tag, type (person, entity, object), and gender.
25 | #[derive(Debug, Clone)]
26 | struct Antecedent {
27 |     name: String,
28 |     pos: String,
29 |     antecedent_type: AntecedentType,
30 |     gender: PronounGender,
31 | }
32 | 
33 | /// Defines the type of an antecedent, which can be none, person, entity, or object.
34 | #[derive(Debug, Clone, PartialEq)]
35 | enum AntecedentType {
36 |     none,
37 |     person,
38 |     entity,
39 |     object,
40 | }
41 | 
42 | impl AntecedentBuffer {
43 |     /// Creates a new AntecedentBuffer with the provided coreference categories.
44 |     pub fn new(coref: &CoreferenceCategories) -> Self {
45 |         Self {
46 |             coref: coref.clone(),
47 |             ..Default::default()
48 |         }
49 |     }
50 | 
51 |     /// Adds a noun token to the antecedent buffer, classifying it as person, entity, or object based on coreference rules.
52 |     pub fn add_noun(&mut self, token: &Token) {
53 |         // Check for person / object
54 |         if self.coref.is_person(token) {
55 |             self.add(&token.word, &token.pos, AntecedentType::person);
56 |             self.last_person = token.word.to_string();
57 |             self.plural_person.insert(token.word.to_string());
58 |         } else if self.coref.is_entity(token) {
59 |             self.add(&token.word, &token.pos, AntecedentType::entity);
60 |         } else if token.is_noun() {
61 |             self.add(&token.word, &token.pos, AntecedentType::object);
62 |         }
63 |     }
64 | 
65 |     /// Adds an antecedent to the buffer, updating primary or secondary lists and setting primary object if applicable.
66 |     fn add(&mut self, name: &str, pos: &POSTag, antecedent_type: AntecedentType) {
67 |         let ant = Antecedent {
68 |             name: name.to_string(),
69 |             pos: pos.to_str(),
70 |             antecedent_type,
71 |             gender: PronounGender::neutral,
72 |         };
73 | 
74 |         if ant.antecedent_type == AntecedentType::object && self.primary_object.is_empty() {
75 |             self.primary_object = name.to_string();
76 |         }
77 | 
78 |         if ant.antecedent_type == AntecedentType::person
79 |             && self.primary.antecedent_type == AntecedentType::none
80 |         {
81 |             self.primary = ant;
82 |         } else {
83 |             self.secondary.push(ant);
84 |         }
85 |     }
86 | 
87 |     /// Adds a non-noun token to the buffer, resetting plural person tracking for verbs/prepositions or clearing the buffer if needed.
88 |     pub fn add_non_noun(&mut self, token: &Token) {
89 |         if token.is_verb() || token.is_preposition() {
90 |             self.plural_person = HashSet::new();
91 |         }
92 | 
93 |         // Clear, if needed
94 |         if self.count >= 30 || token.word.as_str() == "|nl|" {
95 |             self.clear();
96 |         } else {
97 |             self.count += 1;
98 |         }
99 |     }
100 | 
101 |     /// Resolves a pronoun in the token by assigning an antecedent based on gender, number, and person, if applicable.
102 |     pub fn resolve_pronoun(&mut self, token: &mut Token) {
103 |         // Get pronoun
104 |         let pronoun = match &token.pronoun {
105 |             Some(r) => r,
106 |             None => return,
107 |         };
108 |         if !pronoun.is_anaphora() {
109 |             return;
110 |         }
111 |         self.count = 0;
112 | 
113 |         // Ensure third person
114 |         if pronoun.person == PronounPerson::first || pronoun.person == PronounPerson::second {
115 |             return;
116 |         }
117 | 
118 |         // Get antecedent
119 |         if pronoun.gender != PronounGender::neutral && pronoun.number == PronounNumber::singular {
120 |             token.antecedent = self.get_singular_person(pronoun);
121 |         } else if pronoun.gender == PronounGender::neutral
122 |             && pronoun.number == PronounNumber::singular
123 |         {
124 |             token.antecedent = if self.primary_object.is_empty() {
125 |                 None
126 |             } else {
127 |                 Some(self.primary_object.to_string())
128 |             };
129 |         } else if pronoun.number == PronounNumber::plural {
130 |             token.antecedent = self.get_plural(pronoun);
131 |         }
132 |     }
133 | 
134 |     /// Resolves a singular person pronoun, matching gender and updating the primary or secondary antecedent.
135 | fn get_singular_person(&mut self, pronoun: &Pronoun) -> Option { 136 | // First person, or no primary 137 | if self.primary.antecedent_type == AntecedentType::none { 138 | return None; 139 | } 140 | 141 | // Possessive 142 | if (pronoun.category == PronounCategory::possessive 143 | || self.last_person != self.primary.name) 144 | && (self.primary.gender == pronoun.gender 145 | || self.primary.gender == PronounGender::neutral) 146 | { 147 | if self.primary.gender == PronounGender::neutral { 148 | self.primary.gender = pronoun.gender.clone(); 149 | } 150 | //self.last_person = self.primary.name.to_string(); 151 | return Some(self.primary.name.to_string()); 152 | } 153 | 154 | // Go through names, identify correct gender 155 | let mut name = String::new(); 156 | for elem in self.secondary.iter_mut().rev() { 157 | if elem.antecedent_type != AntecedentType::person { 158 | continue; 159 | } else if elem.gender == pronoun.gender { 160 | name = elem.name.to_string(); 161 | break; 162 | } else if elem.gender == PronounGender::neutral { 163 | elem.gender = pronoun.gender.clone(); 164 | name = elem.name.to_string(); 165 | break; 166 | } 167 | } 168 | 169 | // No name found 170 | if name.is_empty() { 171 | return None; 172 | } 173 | 174 | self.last_person = name.to_string(); 175 | Some(name) 176 | } 177 | 178 | /// Resolves a third-person plural pronoun, prioritizing plural persons or falling back to entities/objects. 179 | fn get_plural(&mut self, _pronoun: &Pronoun) -> Option { 180 | // Try for person 181 | if let Some(name) = self.get_plural_person() { 182 | return Some(name); 183 | } 184 | 185 | // Look for entity or object 186 | let mut res: Option = None; 187 | for elem in self.secondary.iter().rev() { 188 | if elem.antecedent_type == AntecedentType::person { 189 | continue; 190 | } 191 | if elem.antecedent_type == AntecedentType::entity 192 | || (elem.antecedent_type == AntecedentType::object && elem.pos.as_str() == "NP") 193 | { 194 | res = Some(elem.name.to_string()); 195 | break; 196 | } 197 | } 198 | 199 | res 200 | } 201 | 202 | /// Retrieves a third-person plural person antecedent by combining primary and secondary persons, if available. 203 | fn get_plural_person(&mut self) -> Option { 204 | if self.plural_person.len() >= 2 && self.plural_person.contains(&self.primary.name) { 205 | return Some(self.plural_person.iter().cloned().collect::>().join("|")); 206 | } else if self.primary.name.is_empty() { 207 | return None; 208 | } 209 | 210 | // Look for person in buffer 211 | let mut people = vec![self.primary.name.to_string()]; 212 | for elem in self.secondary.iter().rev() { 213 | if elem.antecedent_type != AntecedentType::person { 214 | continue; 215 | } 216 | if elem.name != self.primary.name { 217 | people.push(elem.name.to_string()); 218 | break; 219 | } 220 | } 221 | 222 | // Return, if we have two 223 | if people.len() > 1 { 224 | return Some(people.join("|")); 225 | } 226 | 227 | None 228 | } 229 | 230 | /// Clears the antecedent buffer, resetting all fields to their default state. 
231 |     fn clear(&mut self) {
232 |         self.count = 0;
233 |         self.primary = Antecedent::default();
234 |         self.secondary = Vec::new();
235 |         self.last_person = String::new();
236 |         self.plural_person = HashSet::new();
237 |         self.primary_object = String::new();
238 |     }
239 | }
240 |
241 | impl Default for Antecedent {
242 |     fn default() -> Antecedent {
243 |         Antecedent {
244 |             name: String::new(),
245 |             pos: String::new(),
246 |             antecedent_type: AntecedentType::none,
247 |             gender: PronounGender::neutral,
248 |         }
249 |     }
250 | }
251 |
--------------------------------------------------------------------------------
/src/sophia/src/interpret/antecedent_buffer.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use super::CoreferenceCategories;
8 | use crate::pos_tagger::POSTag;
9 | use crate::tokenizer::Token;
10 | use crate::vocab::{Pronoun, PronounCategory, PronounGender, PronounNumber, PronounPerson};
11 | use std::collections::HashSet;
12 |
13 | /// Manages antecedents for coreference resolution, tracking primary and secondary antecedents, person counts, and plural references.
14 | #[derive(Default)]
15 | pub struct AntecedentBuffer {
16 |     coref: CoreferenceCategories,
17 |     count: usize,
18 |     primary: Antecedent,
19 |     secondary: Vec<Antecedent>,
20 |     last_person: String,
21 |     plural_person: HashSet<String>,
22 |     primary_object: String,
23 | }
24 |
25 | /// Represents an antecedent with a name, part-of-speech tag, type (person, entity, object), and gender.
26 | #[derive(Debug, Clone)]
27 | struct Antecedent {
28 |     name: String,
29 |     pos: String,
30 |     antecedent_type: AntecedentType,
31 |     gender: PronounGender,
32 | }
33 |
34 | /// Defines the type of an antecedent, which can be none, person, entity, or object.
35 | #[derive(Debug, Clone, PartialEq)]
36 | enum AntecedentType {
37 |     none,
38 |     person,
39 |     entity,
40 |     object,
41 | }
42 |
43 | impl AntecedentBuffer {
44 |     /// Creates a new AntecedentBuffer with the provided coreference categories.
45 |     pub fn new(coref: &CoreferenceCategories) -> Self {
46 |         Self {
47 |             coref: coref.clone(),
48 |             ..Default::default()
49 |         }
50 |     }
51 |
52 |     /// Adds a noun token to the antecedent buffer, classifying it as person, entity, or object based on coreference rules.
53 |     pub fn add_noun(&mut self, token: &Token) {
54 |         // Check for person / object
55 |         if self.coref.is_person(token) {
56 |             self.add(&token.word, &token.pos, AntecedentType::person);
57 |             self.last_person = token.word.to_string();
58 |             self.plural_person.insert(token.word.to_string());
59 |         } else if self.coref.is_entity(token) {
60 |             self.add(&token.word, &token.pos, AntecedentType::entity);
61 |         } else if token.is_noun() {
62 |             self.add(&token.word, &token.pos, AntecedentType::object);
63 |         }
64 |     }
65 |
66 |     /// Adds an antecedent to the buffer, updating primary or secondary lists and setting primary object if applicable.
67 | fn add(&mut self, name: &str, pos: &POSTag, antecedent_type: AntecedentType) { 68 | let ant = Antecedent { 69 | name: name.to_string(), 70 | pos: pos.to_str(), 71 | antecedent_type, 72 | gender: PronounGender::neutral, 73 | }; 74 | 75 | if ant.antecedent_type == AntecedentType::object && self.primary_object.is_empty() { 76 | self.primary_object = name.to_string(); 77 | } 78 | 79 | if ant.antecedent_type == AntecedentType::person 80 | && self.primary.antecedent_type == AntecedentType::none 81 | { 82 | self.primary = ant; 83 | } else { 84 | self.secondary.push(ant); 85 | } 86 | } 87 | 88 | /// Adds a non-noun token to the buffer, resetting plural person tracking for verbs/prepositions or clearing the buffer if needed. 89 | pub fn add_non_noun(&mut self, token: &Token) { 90 | if token.is_verb() || token.is_preposition() { 91 | self.plural_person = HashSet::new(); 92 | } 93 | 94 | // Clear, if needed 95 | if self.count >= 30 || token.word.as_str() == "|nl|" { 96 | self.clear(); 97 | } else { 98 | self.count += 1; 99 | } 100 | } 101 | 102 | /// Resolves a pronoun in the token by assigning an antecedent based on gender, number, and person, if applicable. 103 | pub fn resolve_pronoun(&mut self, token: &mut Token) { 104 | // Get pronoun 105 | let pronoun = match &token.pronoun { 106 | Some(r) => r, 107 | None => return, 108 | }; 109 | if !pronoun.is_anaphora() { 110 | return; 111 | } 112 | self.count = 0; 113 | 114 | // Ensure third person 115 | if pronoun.person == PronounPerson::first || pronoun.person == PronounPerson::second { 116 | return; 117 | } 118 | 119 | // Get antecedent 120 | if pronoun.gender != PronounGender::neutral && pronoun.number == PronounNumber::singular { 121 | token.antecedent = self.get_singular_person(pronoun); 122 | } else if pronoun.gender == PronounGender::neutral 123 | && pronoun.number == PronounNumber::singular 124 | { 125 | token.antecedent = if self.primary_object.is_empty() { 126 | None 127 | } else { 128 | Some(self.primary_object.to_string()) 129 | }; 130 | } else if pronoun.number == PronounNumber::plural { 131 | token.antecedent = self.get_plural(pronoun); 132 | } 133 | } 134 | 135 | /// Resolves a singular person pronoun, matching gender and updating the primary or secondary antecedent. 
136 |     fn get_singular_person(&mut self, pronoun: &Pronoun) -> Option<String> {
137 |         // First person, or no primary
138 |         if self.primary.antecedent_type == AntecedentType::none {
139 |             return None;
140 |         }
141 |
142 |         // Possessive
143 |         if (pronoun.category == PronounCategory::possessive
144 |             || self.last_person != self.primary.name)
145 |             && (self.primary.gender == pronoun.gender
146 |                 || self.primary.gender == PronounGender::neutral)
147 |         {
148 |             if self.primary.gender == PronounGender::neutral {
149 |                 self.primary.gender = pronoun.gender.clone();
150 |             }
151 |             //self.last_person = self.primary.name.to_string();
152 |             return Some(self.primary.name.to_string());
153 |         }
154 |
155 |         // Go through names, identify correct gender
156 |         let mut name = String::new();
157 |         for elem in self.secondary.iter_mut().rev() {
158 |             if elem.antecedent_type != AntecedentType::person {
159 |                 continue;
160 |             } else if elem.gender == pronoun.gender {
161 |                 name = elem.name.to_string();
162 |                 break;
163 |             } else if elem.gender == PronounGender::neutral {
164 |                 elem.gender = pronoun.gender.clone();
165 |                 name = elem.name.to_string();
166 |                 break;
167 |             }
168 |         }
169 |
170 |         // No name found
171 |         if name.is_empty() {
172 |             return None;
173 |         }
174 |
175 |         self.last_person = name.to_string();
176 |         Some(name)
177 |     }
178 |
179 |     /// Resolves a third-person plural pronoun, prioritizing plural persons or falling back to entities/objects.
180 |     fn get_plural(&mut self, _pronoun: &Pronoun) -> Option<String> {
181 |         // Try for person
182 |         if let Some(name) = self.get_plural_person() {
183 |             return Some(name);
184 |         }
185 |
186 |         // Look for entity or object
187 |         let mut res: Option<String> = None;
188 |         for elem in self.secondary.iter().rev() {
189 |             if elem.antecedent_type == AntecedentType::person {
190 |                 continue;
191 |             }
192 |             if elem.antecedent_type == AntecedentType::entity
193 |                 || (elem.antecedent_type == AntecedentType::object && elem.pos.as_str() == "NP")
194 |             {
195 |                 res = Some(elem.name.to_string());
196 |                 break;
197 |             }
198 |         }
199 |
200 |         res
201 |     }
202 |
203 |     /// Retrieves a third-person plural person antecedent by combining primary and secondary persons, if available.
204 |     fn get_plural_person(&mut self) -> Option<String> {
205 |         if self.plural_person.len() >= 2 && self.plural_person.contains(&self.primary.name) {
206 |             return Some(self.plural_person.iter().cloned().collect::<Vec<_>>().join("|"));
207 |         } else if self.primary.name.is_empty() {
208 |             return None;
209 |         }
210 |
211 |         // Look for person in buffer
212 |         let mut people = vec![self.primary.name.to_string()];
213 |         for elem in self.secondary.iter().rev() {
214 |             if elem.antecedent_type != AntecedentType::person {
215 |                 continue;
216 |             }
217 |             if elem.name != self.primary.name {
218 |                 people.push(elem.name.to_string());
219 |                 break;
220 |             }
221 |         }
222 |
223 |         // Return, if we have two
224 |         if people.len() > 1 {
225 |             return Some(people.join("|"));
226 |         }
227 |
228 |         None
229 |     }
230 |
231 |     /// Clears the antecedent buffer, resetting all fields to their default state.
232 | fn clear(&mut self) { 233 | self.count = 0; 234 | self.primary = Antecedent::default(); 235 | self.secondary = Vec::new(); 236 | self.last_person = String::new(); 237 | self.plural_person = HashSet::new(); 238 | self.primary_object = String::new(); 239 | } 240 | } 241 | 242 | impl Default for Antecedent { 243 | fn default() -> Antecedent { 244 | Antecedent { 245 | name: String::new(), 246 | pos: String::new(), 247 | antecedent_type: AntecedentType::none, 248 | gender: PronounGender::neutral, 249 | } 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /src/sophia/src/interpreter/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the Functional Source License, Version 1.1 (FSL-1.1) 3 | // See the full license at: https://cicero.sh/license.txt 4 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 5 | 6 | use super::{AntecedentBuffer, CoreferenceCategories, Phrase}; 7 | use crate::pos_tagger::POSTag; 8 | use crate::tokenizer::Token; 9 | use std::fmt; 10 | 11 | /// A buffer for processing tokens, tracking verbs, nouns, pronouns, and antecedents, with support for phrase splitting and enclosed character handling. 12 | #[derive(Default)] 13 | pub struct Buffer { 14 | pub position: usize, 15 | pub tokens: Vec, 16 | pub is_locked: bool, 17 | pub enclosed_chars: Vec, 18 | pub enclosed_chars_num_phrases: usize, 19 | pub verbs: Vec, 20 | pub nouns: Vec, 21 | pub pronouns: Vec, 22 | pub antecedents: AntecedentBuffer, 23 | } 24 | 25 | static ENCLOSED_START_CHARS: &[char] = &['"', '\'', '(', '[', '<', '|']; 26 | 27 | impl Buffer { 28 | /// Creates a new Buffer instance with an initialized AntecedentBuffer using the provided coreference categories. 29 | pub fn new(coref: &CoreferenceCategories) -> Self { 30 | Self { 31 | antecedents: AntecedentBuffer::new(coref), 32 | ..Default::default() 33 | } 34 | } 35 | 36 | /// Adds a token to the buffer, updating verb, noun, pronoun, and antecedent tracking, and returns the token's index. 37 | pub fn add(&mut self, buf_token: &Token) -> usize { 38 | let mut token = buf_token.clone(); 39 | 40 | // Process token type 41 | if token.is_verb() { 42 | self.verbs.push(self.tokens.len()); 43 | if self.verbs.len() == 1 { 44 | self.position = self.tokens.len(); 45 | } 46 | self.is_locked = false; 47 | 48 | // Add noun 49 | } else if token.is_noun() { 50 | self.nouns.push(self.tokens.len()); 51 | self.is_locked = false; 52 | self.antecedents.add_noun(&token); 53 | 54 | // Add pronoun 55 | } else if token.is_pronoun() { 56 | self.pronouns.push(self.tokens.len()); 57 | self.antecedents.resolve_pronoun(&mut token); 58 | } 59 | 60 | // Add non-pronoun to antecedent buffer 61 | if !token.is_pronoun() { 62 | self.antecedents.add_non_noun(&token); 63 | } 64 | 65 | // Add token 66 | self.tokens.push(token); 67 | self.tokens.len() - 1 68 | } 69 | 70 | /// Checks if the buffer can be split at the given position based on token type, enclosed characters, and buffer state. 
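    ///
    /// Illustrative outcomes (sketch; assumes the buffer was filled via `add`):
    /// ```ignore
    /// // tokens: [PRP "she", VBD "left", CC "and"] -- a CC token with a pronoun
    /// // and a verb already buffered permits a split:
    /// assert!(buffer.can_split(2));
    /// // Otherwise at least two verbs and one noun are required, and a locked
    /// // buffer always refuses.
    /// ```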
71 | pub fn can_split(&self, x: usize) -> bool { 72 | if self.tokens[x].pos == POSTag::SYM 73 | && self.enclosed_chars.is_empty() 74 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 75 | { 76 | return true; 77 | } else if self.tokens[x].pos == POSTag::SYM 78 | && !self.enclosed_chars.is_empty() 79 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 80 | { 81 | return true; 82 | } else if !self.pronouns.is_empty() 83 | && !self.verbs.is_empty() 84 | && self.tokens[x].pos == POSTag::CC 85 | { 86 | return true; 87 | } else if self.verbs.len() < 2 || self.is_locked { 88 | return false; 89 | } else if self.nouns.is_empty() { 90 | return false; 91 | } 92 | 93 | true 94 | } 95 | /// Attempts to split the buffer into a Phrase if conditions are met, determining the split position and handling enclosed characters. 96 | pub fn split(&mut self) -> Option { 97 | // Check minimum requirements 98 | if !self.can_split(self.tokens.len() - 1) { 99 | return None; 100 | } 101 | 102 | // Determine split position 103 | let mut split_pos = None; 104 | for x in self.position..self.tokens.len() { 105 | self.position = x + 1; 106 | 107 | // Check enclosed char 108 | if self.tokens[x].pos == POSTag::SYM 109 | && self.enclosed_chars.is_empty() 110 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 111 | { 112 | self.enclosed_chars = vec![self.tokens[x].word.chars().next().unwrap(), ' ']; 113 | self.enclosed_chars_num_phrases = 0; 114 | self.enclosed_chars[1] = match self.enclosed_chars[0] { 115 | '\'' => '\'', 116 | '(' => ')', 117 | '[' => ']', 118 | '{' => '}', 119 | '<' => '>', 120 | '|' => '|', 121 | _ => '"', 122 | }; 123 | split_pos = Some(x); 124 | break; 125 | } else if self.tokens[x].pos == POSTag::SYM 126 | && self.enclosed_chars.len() == 2 127 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 128 | { 129 | split_pos = Some(x); 130 | break; 131 | } 132 | 133 | // Unlock, if needed 134 | if self.is_locked && (self.tokens[x].is_noun() || self.tokens[x].is_verb()) { 135 | self.is_locked = false; 136 | } 137 | 138 | if (!self.nouns.is_empty() && self.nouns[0] >= x) || self.is_locked { 139 | continue; 140 | } else if self.tokens[x].is_sentence_stopper() { 141 | split_pos = Some(x); 142 | break; 143 | } 144 | 145 | // Lock if CC tag 146 | if self.tokens[x].pos == POSTag::CC { 147 | self.is_locked = true; 148 | continue; 149 | } 150 | 151 | // Handle previous comma 152 | if x > 0 && self.tokens[x - 1].word.as_str() == "," { 153 | if ["CS", "CA", "VBG"].contains(&self.tokens[x].pos.to_str().as_str()) { 154 | self.is_locked = true; 155 | continue; 156 | } else { 157 | split_pos = Some(x - 1); 158 | break; 159 | } 160 | } 161 | 162 | // Check for potential phrase splitter 163 | if self.tokens[x].word.as_str() == "," 164 | || !["CC", "DT", "PRP", "RB", "RBS", "RBR"] 165 | .contains(&self.tokens[x].pos.to_str().as_str()) 166 | { 167 | continue; 168 | } 169 | 170 | // If adjective, ensure following token is noun 171 | if self.tokens[x].is_adjective() && (self.check_determiner_offset(x) > 0) { 172 | continue; 173 | } else if x > 0 174 | && (self.tokens[x].is_pronoun() || self.tokens[x].is_determiner()) 175 | && self.tokens[x - 1].is_preposition() 176 | { 177 | self.is_locked = true; 178 | continue; 179 | } else if self.tokens[x].is_adverb() && x > 0 && self.tokens[x - 1].is_verb() { 180 | continue; 181 | } 182 | 183 | split_pos = Some(x); 184 | break; 185 | } 186 | 187 | // Split if needed 188 | if let Some(pos) 
= split_pos { 189 | let phrase = self.do_split(pos); 190 | return Some(phrase); 191 | } 192 | 193 | None 194 | } 195 | 196 | /// Performs the actual split at the specified position, creating a new Phrase and updating the buffer's token list. 197 | pub fn do_split(&mut self, split_pos: usize) -> Phrase { 198 | // Get phrase 199 | let remaining_tokens = self.tokens.split_off(split_pos); 200 | let phrase = Phrase::new(&0, None); 201 | 202 | // Drain buffer after a split 203 | self.tokens = remaining_tokens; 204 | //self.drain(phrase.tokens.len()); 205 | 206 | phrase 207 | } 208 | 209 | /// Drains the buffer after a split, updating verb, noun, and pronoun indices and resetting position and lock state. 210 | pub fn drain(&mut self, length: usize) { 211 | self.verbs = self.verbs.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 212 | self.nouns = self.nouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 213 | self.pronouns = 214 | self.pronouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect(); 215 | self.position = if !self.verbs.is_empty() { 216 | self.verbs[0] 217 | } else { 218 | 0 219 | }; 220 | self.is_locked = false; 221 | } 222 | 223 | /// Checks for a determiner followed by a noun (with optional adjective) and returns the offset (2 or 3) or 0 if not found. 224 | pub fn check_determiner_offset(&self, pos: usize) -> usize { 225 | if self.tokens.len() < (pos + 1) { 226 | return 0; 227 | } 228 | 229 | if self.tokens[pos + 1].is_noun() { 230 | return 2; 231 | } else if self.tokens.len() >= (pos + 2) 232 | && self.tokens[pos + 1].is_adjective() 233 | && self.tokens[pos + 2].is_noun() 234 | { 235 | return 3; 236 | } 237 | 238 | 0 239 | } 240 | } 241 | 242 | impl fmt::Debug for Buffer { 243 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 244 | let words = self.tokens.iter().map(|token| token.word.to_string()).collect::>(); 245 | let verbs = self 246 | .verbs 247 | .iter() 248 | .map(|pos| format!("{} {}", self.tokens[*pos].word, pos)) 249 | .collect::>(); 250 | let nouns = self 251 | .nouns 252 | .iter() 253 | .map(|pos| self.tokens[*pos].word.to_string()) 254 | .collect::>(); 255 | write!( 256 | f, 257 | "[buffer] {} [verbs] {} [nouns] {}", 258 | words.join(" "), 259 | verbs.join(", "), 260 | nouns.join(", ") 261 | ) 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/sophia/src/interpret/buffer.rs: -------------------------------------------------------------------------------- 1 | // Copyright 2025 Aquila Labs of Alberta, Canada 2 | // Licensed under the PolyForm Noncommercial License 1.0.0 3 | // Commercial use requires a separate license: https://cicero.sh/sophia/ 4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/ 5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND. 6 | 7 | use super::{AntecedentBuffer, CoreferenceCategories, Phrase}; 8 | use crate::pos_tagger::POSTag; 9 | use crate::tokenizer::Token; 10 | use std::fmt; 11 | 12 | /// A buffer for processing tokens, tracking verbs, nouns, pronouns, and antecedents, with support for phrase splitting and enclosed character handling. 
13 | #[derive(Default)]
14 | pub struct Buffer {
15 |     pub position: usize,
16 |     pub tokens: Vec<Token>,
17 |     pub is_locked: bool,
18 |     pub enclosed_chars: Vec<char>,
19 |     pub enclosed_chars_num_phrases: usize,
20 |     pub verbs: Vec<usize>,
21 |     pub nouns: Vec<usize>,
22 |     pub pronouns: Vec<usize>,
23 |     pub antecedents: AntecedentBuffer,
24 | }
25 |
26 | static ENCLOSED_START_CHARS: &[char] = &['"', '\'', '(', '[', '<', '|'];
27 |
28 | impl Buffer {
29 |     /// Creates a new Buffer instance with an initialized AntecedentBuffer using the provided coreference categories.
30 |     pub fn new(coref: &CoreferenceCategories) -> Self {
31 |         Self {
32 |             antecedents: AntecedentBuffer::new(coref),
33 |             ..Default::default()
34 |         }
35 |     }
36 |
37 |     /// Adds a token to the buffer, updating verb, noun, pronoun, and antecedent tracking, and returns the token's index.
38 |     pub fn add(&mut self, buf_token: &Token) -> usize {
39 |         let mut token = buf_token.clone();
40 |
41 |         // Process token type
42 |         if token.is_verb() {
43 |             self.verbs.push(self.tokens.len());
44 |             if self.verbs.len() == 1 {
45 |                 self.position = self.tokens.len();
46 |             }
47 |             self.is_locked = false;
48 |
49 |         // Add noun
50 |         } else if token.is_noun() {
51 |             self.nouns.push(self.tokens.len());
52 |             self.is_locked = false;
53 |             self.antecedents.add_noun(&token);
54 |
55 |         // Add pronoun
56 |         } else if token.is_pronoun() {
57 |             self.pronouns.push(self.tokens.len());
58 |             self.antecedents.resolve_pronoun(&mut token);
59 |         }
60 |
61 |         // Add non-pronoun to antecedent buffer
62 |         if !token.is_pronoun() {
63 |             self.antecedents.add_non_noun(&token);
64 |         }
65 |
66 |         // Add token
67 |         self.tokens.push(token);
68 |         self.tokens.len() - 1
69 |     }
70 |
71 |     /// Checks if the buffer can be split at the given position based on token type, enclosed characters, and buffer state.
72 |     pub fn can_split(&self, x: usize) -> bool {
73 |         if self.tokens[x].pos == POSTag::SYM
74 |             && self.enclosed_chars.is_empty()
75 |             && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap())
76 |         {
77 |             return true;
78 |         } else if self.tokens[x].pos == POSTag::SYM
79 |             && !self.enclosed_chars.is_empty()
80 |             && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap()
81 |         {
82 |             return true;
83 |         } else if !self.pronouns.is_empty()
84 |             && !self.verbs.is_empty()
85 |             && self.tokens[x].pos == POSTag::CC
86 |         {
87 |             return true;
88 |         } else if self.verbs.len() < 2 || self.is_locked {
89 |             return false;
90 |         } else if self.nouns.is_empty() {
91 |             return false;
92 |         }
93 |
94 |         true
95 |     }
96 |     /// Attempts to split the buffer into a Phrase if conditions are met, determining the split position and handling enclosed characters.
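    ///
    /// Typical driving loop (sketch; assumes tokens are produced by the tokenizer):
    /// ```ignore
    /// for token in tokens.iter() {
    ///     buffer.add(token);
    ///     if let Some(phrase) = buffer.split() {
    ///         // a completed Phrase has been cut off the front of the buffer
    ///     }
    /// }
    /// ```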
97 | pub fn split(&mut self) -> Option { 98 | // Check minimum requirements 99 | if !self.can_split(self.tokens.len() - 1) { 100 | return None; 101 | } 102 | 103 | // Determine split position 104 | let mut split_pos = None; 105 | for x in self.position..self.tokens.len() { 106 | self.position = x + 1; 107 | 108 | // Check enclosed char 109 | if self.tokens[x].pos == POSTag::SYM 110 | && self.enclosed_chars.is_empty() 111 | && ENCLOSED_START_CHARS.contains(&self.tokens[x].word.chars().next().unwrap()) 112 | { 113 | self.enclosed_chars = vec![self.tokens[x].word.chars().next().unwrap(), ' ']; 114 | self.enclosed_chars_num_phrases = 0; 115 | self.enclosed_chars[1] = match self.enclosed_chars[0] { 116 | '\'' => '\'', 117 | '(' => ')', 118 | '[' => ']', 119 | '{' => '}', 120 | '<' => '>', 121 | '|' => '|', 122 | _ => '"', 123 | }; 124 | split_pos = Some(x); 125 | break; 126 | } else if self.tokens[x].pos == POSTag::SYM 127 | && self.enclosed_chars.len() == 2 128 | && self.enclosed_chars[1] == self.tokens[x].word.chars().next().unwrap() 129 | { 130 | split_pos = Some(x); 131 | break; 132 | } 133 | 134 | // Unlock, if needed 135 | if self.is_locked && (self.tokens[x].is_noun() || self.tokens[x].is_verb()) { 136 | self.is_locked = false; 137 | } 138 | 139 | if (!self.nouns.is_empty() && self.nouns[0] >= x) || self.is_locked { 140 | continue; 141 | } else if self.tokens[x].is_sentence_stopper() { 142 | split_pos = Some(x); 143 | break; 144 | } 145 | 146 | // Lock if CC tag 147 | if self.tokens[x].pos == POSTag::CC { 148 | self.is_locked = true; 149 | continue; 150 | } 151 | 152 | // Handle previous comma 153 | if x > 0 && self.tokens[x - 1].word.as_str() == "," { 154 | if ["CS", "CA", "VBG"].contains(&self.tokens[x].pos.to_str().as_str()) { 155 | self.is_locked = true; 156 | continue; 157 | } else { 158 | split_pos = Some(x - 1); 159 | break; 160 | } 161 | } 162 | 163 | // Check for potential phrase splitter 164 | if self.tokens[x].word.as_str() == "," 165 | || !["CC", "DT", "PRP", "RB", "RBS", "RBR"] 166 | .contains(&self.tokens[x].pos.to_str().as_str()) 167 | { 168 | continue; 169 | } 170 | 171 | // If adjective, ensure following token is noun 172 | if self.tokens[x].is_adjective() && (self.check_determiner_offset(x) > 0) { 173 | continue; 174 | } else if x > 0 175 | && (self.tokens[x].is_pronoun() || self.tokens[x].is_determiner()) 176 | && self.tokens[x - 1].is_preposition() 177 | { 178 | self.is_locked = true; 179 | continue; 180 | } else if self.tokens[x].is_adverb() && x > 0 && self.tokens[x - 1].is_verb() { 181 | continue; 182 | } 183 | 184 | split_pos = Some(x); 185 | break; 186 | } 187 | 188 | // Split if needed 189 | if let Some(pos) = split_pos { 190 | let phrase = self.do_split(pos); 191 | return Some(phrase); 192 | } 193 | 194 | None 195 | } 196 | 197 | /// Performs the actual split at the specified position, creating a new Phrase and updating the buffer's token list. 198 | pub fn do_split(&mut self, split_pos: usize) -> Phrase { 199 | // Get phrase 200 | let remaining_tokens = self.tokens.split_off(split_pos); 201 | let phrase = Phrase::new(&0, None); 202 | 203 | // Drain buffer after a split 204 | self.tokens = remaining_tokens; 205 | //self.drain(phrase.tokens.len()); 206 | 207 | phrase 208 | } 209 | 210 | /// Drains the buffer after a split, updating verb, noun, and pronoun indices and resetting position and lock state. 
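    ///
    /// Worked example: with `verbs == [2, 7]`, calling `drain(5)` drops index 2,
    /// rebases index 7 to 2, and resets `position` to the first remaining verb:
    /// ```ignore
    /// buffer.drain(5); // verbs: [2, 7] -> [2]; position == 2; is_locked == false
    /// ```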
211 |     pub fn drain(&mut self, length: usize) {
212 |         self.verbs = self.verbs.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
213 |         self.nouns = self.nouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
214 |         self.pronouns =
215 |             self.pronouns.iter().filter(|&&v| v > length).map(|&v| v - length).collect();
216 |         self.position = if !self.verbs.is_empty() {
217 |             self.verbs[0]
218 |         } else {
219 |             0
220 |         };
221 |         self.is_locked = false;
222 |     }
223 |
224 |     /// Checks for a determiner followed by a noun (with optional adjective) and returns the offset (2 or 3) or 0 if not found.
225 |     pub fn check_determiner_offset(&self, pos: usize) -> usize {
226 |         if self.tokens.len() < (pos + 1) {
227 |             return 0;
228 |         }
229 |
230 |         if self.tokens[pos + 1].is_noun() {
231 |             return 2;
232 |         } else if self.tokens.len() >= (pos + 2)
233 |             && self.tokens[pos + 1].is_adjective()
234 |             && self.tokens[pos + 2].is_noun()
235 |         {
236 |             return 3;
237 |         }
238 |
239 |         0
240 |     }
241 | }
242 |
243 | impl fmt::Debug for Buffer {
244 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
245 |         let words = self.tokens.iter().map(|token| token.word.to_string()).collect::<Vec<String>>();
246 |         let verbs = self
247 |             .verbs
248 |             .iter()
249 |             .map(|pos| format!("{} {}", self.tokens[*pos].word, pos))
250 |             .collect::<Vec<String>>();
251 |         let nouns = self
252 |             .nouns
253 |             .iter()
254 |             .map(|pos| self.tokens[*pos].word.to_string())
255 |             .collect::<Vec<String>>();
256 |         write!(
257 |             f,
258 |             "[buffer] {} [verbs] {} [nouns] {}",
259 |             words.join(" "),
260 |             verbs.join(", "),
261 |             nouns.join(", ")
262 |         )
263 |     }
264 | }
265 |
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/pos_tag.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use serde::{Deserialize, Serialize};
8 | use std::fmt;
9 |
10 | /// Part-of-speech tags based on the Penn Treebank tagset with custom modifications.
11 | /// For details on added tags (e.g., CA, CS, NZ), modified tags (e.g., EX, CD), and removed punctuation tags (e.g., SS, PUNC, SYM), refer to the crate documentation.
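///
/// # Example
///
/// Round-trips between the string, enum, and `u8` forms (module path follows the
/// crate's own `sophia::` examples):
///
/// ```no_run
/// use sophia::pos_tagger::POSTag;
///
/// let tag = POSTag::from_str("nns");
/// assert_eq!(tag, POSTag::NNS);
/// assert_eq!(tag.to_str(), "NNS");
/// // u8 round-trip, as used internally by the POS tagger:
/// assert_eq!(POSTag::from_u8(tag.to_u8()), POSTag::NNS);
/// ```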
12 | #[derive(Default, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)] 13 | pub enum POSTag { 14 | CC, 15 | CS, 16 | CA, 17 | DT, 18 | EX, 19 | #[default] 20 | FW, 21 | IN, 22 | JJ, 23 | JJR, 24 | JJS, 25 | LS, 26 | MD, 27 | MWE, 28 | NN, 29 | NNS, 30 | NNP, 31 | NNPS, 32 | NM, 33 | NZ, 34 | PDT, 35 | PR, 36 | PRP, 37 | PUNC, 38 | RB, 39 | RBR, 40 | RBS, 41 | SS, 42 | SYM, 43 | SYS, 44 | UH, 45 | VB, 46 | VBD, 47 | VBG, 48 | VBN, 49 | VBP, 50 | VBZ, 51 | VF, 52 | VFG, 53 | VH, 54 | VHF, 55 | VHP, 56 | VHZ, 57 | WDT, 58 | WPR, 59 | WPRP, 60 | WRB, 61 | } 62 | 63 | impl POSTag { 64 | /// Convert a string into an instance of the POSTag enum 65 | pub fn from_str(tag: &str) -> Self { 66 | match tag.to_uppercase().as_str() { 67 | "CC" => Self::CC, 68 | "CS" => Self::CS, 69 | "CA" => Self::CA, 70 | "DT" => Self::DT, 71 | "EX" => Self::EX, 72 | "FW" => Self::FW, 73 | "IN" => Self::IN, 74 | "JJ" => Self::JJ, 75 | "JJR" => Self::JJR, 76 | "JJS" => Self::JJS, 77 | "LS" => Self::LS, 78 | "MD" => Self::MD, 79 | "MWE" => Self::MWE, 80 | "NN" => Self::NN, 81 | "NNS" => Self::NNS, 82 | "NNP" => Self::NNP, 83 | "NNPS" => Self::NNPS, 84 | "NM" => Self::NM, 85 | "NZ" => Self::NZ, 86 | "PDT" => Self::PDT, 87 | "PR" => Self::PR, 88 | "PRP" => Self::PRP, 89 | "PUNC" => Self::PUNC, 90 | "RB" => Self::RB, 91 | "RBR" => Self::RBR, 92 | "RBS" => Self::RBS, 93 | "SS" => Self::SS, 94 | "SYM" => Self::SYM, 95 | "SYS" => Self::SYS, 96 | "UH" => Self::UH, 97 | "VB" => Self::VB, 98 | "VBD" => Self::VBD, 99 | "VBG" => Self::VBG, 100 | "VBN" => Self::VBN, 101 | "VBP" => Self::VBP, 102 | "VBZ" => Self::VBZ, 103 | "VF" => Self::VF, 104 | "VFG" => Self::VFG, 105 | "VH" => Self::VH, 106 | "VHF" => Self::VHF, 107 | "VHP" => Self::VHP, 108 | "VHZ" => Self::VHZ, 109 | "WDT" => Self::WDT, 110 | "WPR" => Self::WPR, 111 | "WPRP" => Self::WPRP, 112 | "WRB" => Self::WRB, 113 | _ => Self::FW, 114 | } 115 | } 116 | 117 | /// Convert an instance of POStag into its string counterpart 118 | pub fn to_str(&self) -> String { 119 | match self { 120 | Self::CC => "CC".to_string(), 121 | Self::CS => "CS".to_string(), 122 | Self::CA => "CA".to_string(), 123 | Self::DT => "DT".to_string(), 124 | Self::EX => "EX".to_string(), 125 | Self::FW => "FW".to_string(), 126 | Self::IN => "IN".to_string(), 127 | Self::JJ => "JJ".to_string(), 128 | Self::JJR => "JJR".to_string(), 129 | Self::JJS => "JJS".to_string(), 130 | Self::LS => "LS".to_string(), 131 | Self::MD => "MD".to_string(), 132 | Self::MWE => "MWE".to_string(), 133 | Self::NN => "NN".to_string(), 134 | Self::NNS => "NNS".to_string(), 135 | Self::NNP => "NNP".to_string(), 136 | Self::NNPS => "NNPS".to_string(), 137 | Self::NM => "NM".to_string(), 138 | Self::NZ => "NZ".to_string(), 139 | Self::PDT => "PDT".to_string(), 140 | Self::PR => "PR".to_string(), 141 | Self::PRP => "PRP".to_string(), 142 | Self::PUNC => "PUNC".to_string(), 143 | Self::RB => "RB".to_string(), 144 | Self::RBR => "RBR".to_string(), 145 | Self::RBS => "RBS".to_string(), 146 | Self::SS => "SS".to_string(), 147 | Self::SYM => "SYM".to_string(), 148 | Self::SYS => "SYS".to_string(), 149 | Self::UH => "UH".to_string(), 150 | Self::VB => "VB".to_string(), 151 | Self::VBD => "VBD".to_string(), 152 | Self::VBG => "VBG".to_string(), 153 | Self::VBN => "VBN".to_string(), 154 | Self::VBP => "VBP".to_string(), 155 | Self::VBZ => "VBZ".to_string(), 156 | Self::VF => "VF".to_string(), 157 | Self::VFG => "VFG".to_string(), 158 | Self::VH => "VH".to_string(), 159 | Self::VHF => "VHF".to_string(), 160 | Self::VHP => 
"VHP".to_string(), 161 | Self::VHZ => "VHZ".to_string(), 162 | Self::WDT => "WDT".to_string(), 163 | Self::WPR => "WPR".to_string(), 164 | Self::WPRP => "WPRP".to_string(), 165 | Self::WRB => "WRB".to_string(), 166 | } 167 | } 168 | 169 | /// Convert a u8 into an instance of POSTag enum, mainly used by the POS tagger 170 | pub fn from_u8(value: u8) -> Self { 171 | match value { 172 | 1 => Self::CC, 173 | 2 => Self::CS, 174 | 3 => Self::CA, 175 | 4 => Self::DT, 176 | 5 => Self::EX, 177 | 6 => Self::FW, 178 | 7 => Self::IN, 179 | 8 => Self::JJ, 180 | 9 => Self::JJR, 181 | 10 => Self::JJS, 182 | 11 => Self::LS, 183 | 12 => Self::MD, 184 | 13 => Self::MWE, 185 | 14 => Self::NN, 186 | 15 => Self::NNS, 187 | 16 => Self::NNP, 188 | 17 => Self::NNPS, 189 | 18 => Self::NM, 190 | 19 => Self::NZ, 191 | 20 => Self::PDT, 192 | 21 => Self::PR, 193 | 22 => Self::PRP, 194 | 23 => Self::PUNC, 195 | 24 => Self::RB, 196 | 25 => Self::RBR, 197 | 26 => Self::RBS, 198 | 27 => Self::SS, 199 | 28 => Self::SYM, 200 | 29 => Self::SYS, 201 | 30 => Self::UH, 202 | 31 => Self::VB, 203 | 32 => Self::VBD, 204 | 33 => Self::VBG, 205 | 34 => Self::VBN, 206 | 35 => Self::VBP, 207 | 36 => Self::VBZ, 208 | 37 => Self::VF, 209 | 38 => Self::VFG, 210 | 39 => Self::VH, 211 | 40 => Self::VHF, 212 | 41 => Self::VHP, 213 | 42 => Self::VHZ, 214 | 43 => Self::WDT, 215 | 44 => Self::WPR, 216 | 45 => Self::WPRP, 217 | 46 => Self::WRB, 218 | _ => Self::FW, 219 | } 220 | } 221 | 222 | /// Converts an instance of the POSTag enum into its u8 counterpart, generally only used by the POS tagger 223 | pub fn to_u8(&self) -> u8 { 224 | match self { 225 | Self::CC => 1, 226 | Self::CS => 2, 227 | Self::CA => 3, 228 | Self::DT => 4, 229 | Self::EX => 5, 230 | Self::FW => 6, 231 | Self::IN => 7, 232 | Self::JJ => 8, 233 | Self::JJR => 9, 234 | Self::JJS => 10, 235 | Self::LS => 11, 236 | Self::MD => 12, 237 | Self::MWE => 13, 238 | Self::NN => 14, 239 | Self::NNS => 15, 240 | Self::NNP => 16, 241 | Self::NNPS => 17, 242 | Self::NM => 18, 243 | Self::NZ => 19, 244 | Self::PDT => 20, 245 | Self::PR => 21, 246 | Self::PRP => 22, 247 | Self::PUNC => 23, 248 | Self::RB => 24, 249 | Self::RBR => 25, 250 | Self::RBS => 26, 251 | Self::SS => 27, 252 | Self::SYM => 28, 253 | Self::SYS => 29, 254 | Self::UH => 30, 255 | Self::VB => 31, 256 | Self::VBD => 32, 257 | Self::VBG => 33, 258 | Self::VBN => 34, 259 | Self::VBP => 35, 260 | Self::VBZ => 36, 261 | Self::VF => 37, 262 | Self::VFG => 38, 263 | Self::VH => 39, 264 | Self::VHF => 40, 265 | Self::VHP => 41, 266 | Self::VHZ => 42, 267 | Self::WDT => 43, 268 | Self::WPR => 44, 269 | Self::WPRP => 45, 270 | Self::WRB => 46, 271 | } 272 | } 273 | 274 | /// Convert tag to a shortened version -- used for training 275 | /// of cohorts based model to assist with automated spelling corrections. 
276 |     pub fn to_short_tag(&self) -> Self {
277 |         match *self {
278 |             Self::CC | Self::CS | Self::CA => Self::CC,
279 |             Self::DT | Self::PDT | Self::WDT => Self::DT,
280 |             Self::IN => Self::IN,
281 |             Self::JJ | Self::JJR | Self::JJS => Self::JJ,
282 |             Self::NN | Self::NNS | Self::NNP | Self::NNPS | Self::NM | Self::NZ => Self::NN,
283 |             Self::PR | Self::PRP | Self::WPR | Self::WPRP => Self::PR,
284 |             Self::PUNC | Self::SS => Self::PUNC,
285 |             Self::RB | Self::RBR | Self::RBS | Self::WRB => Self::RB,
286 |             Self::VB | Self::VBD | Self::VBG | Self::VBN | Self::VBP | Self::VBZ | Self::MD => {
287 |                 Self::VB
288 |             }
289 |             _ => Self::FW,
290 |         }
291 |     }
292 |
293 |     /// Check whether the POS tag belongs to a noun
294 |     pub fn is_noun(&self) -> bool {
295 |         self.to_str().starts_with("N") || *self == Self::SYS
296 |     }
297 |
298 |     /// Check whether the POS tag belongs to a verb
299 |     pub fn is_verb(&self) -> bool {
300 |         self.to_str().starts_with("V")
301 |     }
302 |
303 |     /// Check whether the POS tag belongs to a conjunction
304 |     pub fn is_conjunction(&self) -> bool {
305 |         self.to_str().starts_with("C")
306 |     }
307 |     /// Check whether the POS tag belongs to a base verb
308 |     pub fn is_base_verb(&self) -> bool {
309 |         *self == Self::VB || *self == Self::VBG
310 |     }
311 |
312 |     /// Check whether the POS tag belongs to a punctuation mark
313 |     pub fn is_punctuation(&self) -> bool {
314 |         *self == Self::SS || *self == Self::PUNC
315 |     }
316 |
317 |     /// Check whether or not tag belongs to a pronoun
318 |     pub fn is_pronoun(&self) -> bool {
319 |         *self == Self::PR || *self == Self::PRP
320 |     }
321 |
322 |     /// Check whether the POS tag belongs to an adjective
323 |     pub fn is_adjective(&self) -> bool {
324 |         self.to_str().starts_with("J")
325 |     }
326 |
327 |     /// Check whether the POS tag belongs to an adverb
328 |     pub fn is_adverb(&self) -> bool {
329 |         self.to_str().starts_with("R")
330 |     }
331 |
332 |     /// Check whether the POS tag belongs to a named entity
333 |     pub fn is_named_entity(&self) -> bool {
334 |         self.to_str().starts_with("NNP")
335 |     }
336 | }
337 |
338 | impl fmt::Display for POSTag {
339 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
340 |         write!(f, "{}", self.to_str())
341 |     }
342 | }
343 |
344 | impl fmt::Debug for POSTag {
345 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
346 |         write!(f, "{}", self.to_str())
347 |     }
348 | }
349 |
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/hmm.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use super::{POSPrediction, POSPredictionMethod, POSTag, TokenKey};
8 | use crate::tokenizer::Token;
9 | use crate::vocab::{Capitalization, VocabMWE};
10 | use serde::{Deserialize, Serialize};
11 | use std::collections::HashMap;
12 | use std::hash::Hash;
13 |
14 | pub const TOTAL_TAGS: usize = 47;
15 |
16 | #[derive(Default, Serialize, Deserialize)]
17 | #[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
18 | pub struct HMM<S> {
19 |     pub vocab_size: f32,
20 |     pub initial_probs: Vec<f32>,
21 |     pub transmition_probs: Vec<Vec<f32>>,
22 |     pub emission_probs: Vec<HashMap<S, f32>>,
23 |     pub smoothing: f64,
24 | }
25 |
26 | #[derive(Debug)]
27 | struct Probability {
28 |     pub deterministic_tag_idx: usize,
29 |     pub viterbi: Vec<f32>,
30 |     pub backpointer: Vec<usize>,
31 | }
32 |
33 | impl<S> HMM<S>
34 | where
35 |     S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
36 |     Token: TokenKey<S>,
37 | {
38 |     pub fn new() -> Self {
39 |         Self {
40 |             vocab_size: 0.0,
41 |             initial_probs: vec![0.0; TOTAL_TAGS],
42 |             transmition_probs: vec![vec![0.0; TOTAL_TAGS]; TOTAL_TAGS],
43 |             emission_probs: vec![HashMap::new(); TOTAL_TAGS],
44 |             smoothing: 1.0,
45 |         }
46 |     }
47 |
48 |     /// Apply hmm model to vector of tokens
49 |     pub fn apply(&self, tokens: &mut [Token]) {
50 |         let mut start_pos = 0;
51 |         let mut end_pos: usize;
52 |         loop {
53 |             // Get end position
54 |             end_pos = match tokens[start_pos..].iter().position(|token| token.pos == POSTag::SS) {
55 |                 Some(r) => r + start_pos + 1,
56 |                 None => tokens.len().saturating_sub(1)
57 |             };
58 |             if start_pos >= end_pos {
59 |                 break;
60 |             }
61 |
62 |             // Apply viterbi
63 |             self.viterbi_decode(start_pos, end_pos, tokens);
64 |             start_pos = end_pos;
65 |             if start_pos >= tokens.len() - 1 {
66 |                 break;
67 |             }
68 |         }
69 |     }
70 |
71 |     /// Predict tags for a sentence
72 |     fn viterbi_decode(&self, start_pos: usize, end_pos: usize, tokens: &mut [Token]) {
73 |         // Go through tokens
74 |         let mut results: Vec<Probability> = Vec::new();
75 |         for (offset, token) in tokens[start_pos..end_pos].iter().enumerate() {
76 |             let position = offset + start_pos;
77 |
78 |             // Initial token
79 |             if offset == 0 {
80 |                 let tag_indices = if token.potential_pos.len() > 1 {
81 |                     token
82 |                         .potential_pos
83 |                         .iter()
84 |                         .filter(|&tag| *tag != POSTag::FW)
85 |                         .map(|tag| tag.to_u8() as usize)
86 |                         .collect::<Vec<usize>>()
87 |                 } else if token.pos == POSTag::FW {
88 |                     (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
89 |                 } else {
90 |                     vec![token.pos.to_u8() as usize]
91 |                 };
92 |
93 |                 // Instantiate probability
94 |                 let current_tag_idx = if tag_indices.len() == 1 {
95 |                     tag_indices[0]
96 |                 } else {
97 |                     0
98 |                 };
99 |                 let mut probs = Probability::new(current_tag_idx);
100 |
101 |                 for tag_idx in tag_indices {
102 |                     probs.viterbi[tag_idx] =
103 |                         self.initial_probs[tag_idx] + self.get_emission_prob(tag_idx, token);
104 |                 }
105 |
106 |                 results.push(probs);
107 |                 continue;
108 |             }
109 |
110 |             // Forward pass
111 |             let probs = self.calculate_viterbi(position, &results, tokens);
112 |             results.push(probs);
113 |         }
114 |
115 |         // Find best final state
116 |         let last_idx = results.len() - 1;
117 |         let mut best_final_state = 0;
118 |         let mut best_score = results[last_idx].viterbi[0];
119 |
120 |         for tag_idx in 1..TOTAL_TAGS {
121 |             if tag_idx == 6 {
122 |                 continue;
123 |             }
124 |
125 |             if results[last_idx].viterbi[tag_idx] > best_score {
126 |                 best_score = results[last_idx].viterbi[tag_idx];
127 |                 best_final_state = tag_idx;
128 |             }
129 |         }
130 |
131 |         // Backtrack to find best path
132 |         let mut path = vec![0; results.len()];
133 |         path[last_idx] = best_final_state;
134 |         for idx in (0..results.len() - 1).rev() {
135 |             path[idx] = results[idx + 1].backpointer[path[idx + 1]];
136 |         }
137 |
138 |         // Update tokens with new POS tags
139 |         let (mut is_initial, mut in_nnp) = (true, false);
140 |         for (offset, tag_idx) in path.iter().enumerate() {
141 |             let position = offset + start_pos;
142 |
143 |             if tokens[position].potential_pos.len() < 2 && tokens[position].index > 0 {
144 |                 tokens[position].pos_prediction.confidence = 1.0;
145 |                 tokens[position].pos_prediction.tag = tokens[position].pos;
146 |                 tokens[position].pos_prediction.prev_tag = tokens[position].pos;
147 |             } else {
148 |                 let tag = POSTag::from_u8(*tag_idx as u8);
149 |                 let confidence = self.get_confidence_score(position, offset, &results, tokens);
150 |
151 |                 tokens[position].pos_prediction = POSPrediction::new(
152 |                     POSPredictionMethod::hmm,
153 |                     &tokens[position].word,
154 |                     tokens[position].pos,
155 |                     tag,
156 |                     confidence,
157 |                     &HashMap::new(),
158 |                     &[],
159 |                 );
160 |                 tokens[position].pos = tag;
161 |             }
162 |
163 |             // Check for named entity
164 |             let tag = tokens[position].pos;
165 |             if tag == POSTag::NN
166 |                 && VocabMWE::classify_capitalization(&tokens[position].word)
167 |                     != Capitalization::lower
168 |                 && !is_initial
169 |             {
170 |                 //tokens[position].pos = POSTag::NNP;
171 |                 in_nnp = true;
172 |             } else if tag == POSTag::NN && in_nnp {
173 |                 //tokens[position].pos = POSTag::NNP;
174 |             } else {
175 |                 in_nnp = false;
176 |             }
177 |             is_initial = tokens[position].pos == POSTag::SS;
178 |         }
179 |     }
180 |
181 |     // Calculate the viterbi for a single token.
182 |     fn calculate_viterbi(
183 |         &self,
184 |         position: usize,
185 |         results: &[Probability],
186 |         tokens: &[Token],
187 |     ) -> Probability {
188 |         // Instantiate probability
189 |         let token = &tokens[position];
190 |         let deterministic_tag_idx = if token.potential_pos.len() > 1 || token.pos == POSTag::FW {
191 |             0
192 |         } else {
193 |             token.pos.to_u8() as usize
194 |         };
195 |         let mut probs = Probability::new(deterministic_tag_idx);
196 |
197 |         // Initialize
198 |         let prev_probs = results.last().unwrap();
199 |
200 |         // Get tag indices
201 |         let tag_indices = if token.potential_pos.len() > 1 {
202 |             token
203 |                 .potential_pos
204 |                 .iter()
205 |                 .filter(|&tag| *tag != POSTag::FW)
206 |                 .map(|tag| tag.to_u8() as usize)
207 |                 .collect::<Vec<usize>>()
208 |         } else if token.pos == POSTag::FW {
209 |             (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
210 |         } else {
211 |             vec![token.pos.to_u8() as usize]
212 |         };
213 |
214 |         // Get previous tag indices
215 |         let prev_tag_indices = if tokens[position - 1].potential_pos.len() > 1 {
216 |             tokens[position - 1]
217 |                 .potential_pos
218 |                 .iter()
219 |                 .filter(|&tag| *tag != POSTag::FW)
220 |                 .map(|tag| tag.to_u8() as usize)
221 |                 .collect::<Vec<usize>>()
222 |         } else if tokens[position - 1].pos == POSTag::FW || prev_probs.deterministic_tag_idx == 6 {
223 |             (1..47).filter(|x| *x != 6).collect::<Vec<usize>>()
224 |         } else {
225 |             vec![prev_probs.deterministic_tag_idx]
226 |         };
227 |
228 |         // Calculate scores
229 |         for tag_idx in tag_indices.iter() {
230 |             let emission_prob = self.get_emission_prob(*tag_idx, token);
231 |
232 |             for prev_tag_idx in prev_tag_indices.iter() {
233 |                 let score = prev_probs.viterbi[*prev_tag_idx]
234 |                     + self.transmition_probs[*prev_tag_idx][*tag_idx]
235 |                     + emission_prob;
236 |                 if score > probs.viterbi[*tag_idx] {
237 |                     probs.viterbi[*tag_idx] = score;
238 |                     probs.backpointer[*tag_idx] = *prev_tag_idx;
239 |                 }
240 |             }
241 |         }
242 |
243 |         probs
244 |     }
245 |
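    // Note on the smoothing in `get_emission_prob` below (illustrative numbers):
    // with `smoothing = 1.0`, a tag whose emission table holds 500 entries, and
    // `vocab_size = 50_000`, an unseen word scores
    // ln(1.0 / (500.0 + 50_000.0 * 1.0)) = ln(1.0 / 50_500.0) ~= -10.8 -- a large
    // but finite penalty, so unseen words never zero out a Viterbi path.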
246 |     /// Get emission probabilities
247 |     fn get_emission_prob(&self, tag_idx: usize, token: &Token) -> f32 {
248 |         match self.emission_probs[tag_idx].get(&token.get_key()) {
249 |             Some(&prob) => prob,
250 |             None => {
251 |                 let tag_vocab_size = self.emission_probs[tag_idx].len() as f32;
252 |                 (self.smoothing as f32 / (tag_vocab_size + self.vocab_size * self.smoothing as f32))
253 |                     .ln()
254 |             }
255 |         }
256 |     }
257 |
258 |     /// Returns a value between 0.0 and 1.0, where 1.0 means completely certain
259 |     fn get_confidence_score(
260 |         &self,
261 |         position: usize,
262 |         offset: usize,
263 |         results: &[Probability],
264 |         tokens: &[Token],
265 |     ) -> f32 {
266 |         let token = &tokens[position];
267 |         let prob_result = &results[offset];
268 |
269 |         let tag_indices: Vec<usize> = token
270 |             .potential_pos
271 |             .iter()
272 |             .filter(|&tag| *tag != POSTag::FW)
273 |             .map(|tag| tag.to_u8() as usize)
274 |             .collect();
275 |
276 |         // Get scores for all possible tags at this position
277 |         let mut scores: Vec<f32> = tag_indices
278 |             .iter()
279 |             .map(|&idx| prob_result.viterbi[idx])
280 |             .filter(|&score| score != f32::NEG_INFINITY)
281 |             .collect();
282 |
283 |         if scores.len() <= 1 {
284 |             return 1.0;
285 |         }
286 |
287 |         // Sort scores in descending order
288 |         scores.sort_by(|a, b| b.partial_cmp(a).unwrap());
289 |
290 |         // Convert log probabilities to actual probabilities
291 |         let max_score = scores[0];
292 |         let prob_scores: Vec<f32> = scores.iter().map(|&score| (score - max_score).exp()).collect();
293 |
294 |         let total_prob: f32 = prob_scores.iter().sum();
295 |         let normalized_probs: Vec<f32> =
296 |             prob_scores.iter().map(|&prob| prob / total_prob).collect();
297 |
298 |         // Confidence is the probability of the best choice
299 |         normalized_probs[0]
300 |     }
301 | }
302 |
303 | impl Probability {
304 |     pub fn new(deterministic_tag_idx: usize) -> Self {
305 |         Self {
306 |             deterministic_tag_idx,
307 |             viterbi: vec![f32::NEG_INFINITY; TOTAL_TAGS],
308 |             backpointer: vec![0; TOTAL_TAGS],
309 |         }
310 |     }
311 | }
312 |
--------------------------------------------------------------------------------
/src/sophia/src/tokenizer/token.rs:
--------------------------------------------------------------------------------
1 | // Copyright 2025 Aquila Labs of Alberta, Canada
2 | // Licensed under the PolyForm Noncommercial License 1.0.0
3 | // Commercial use requires a separate license: https://cicero.sh/sophia/
4 | // License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
5 | // Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.
6 |
7 | use crate::pos_tagger::{POSPrediction, POSTag};
8 | use crate::vocab::{
9 |     f8::f8,
10 |     {Pronoun, VocabDatabase},
11 | };
12 | use serde::{Deserialize, Serialize};
13 | use std::collections::HashMap;
14 | use std::fmt;
15 | use std::ops::Range;
16 |
17 | /// Represents a token with linguistic properties, including word, part-of-speech, categories, pronoun details, and scoring information.
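///
/// # Example
///
/// A minimal sketch using the one constructor that needs no `VocabDatabase`:
///
/// ```no_run
/// use sophia::tokenizer::Token;
///
/// // Words missing from the vocabulary become "unknown" tokens with default
/// // properties; the POS tag defaults to FW (unclassified).
/// let token = Token::unknown("zorblax");
/// assert_eq!(token.word, "zorblax");
/// assert!(!token.is_noun() && !token.is_verb());
/// ```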
18 | #[derive(Default, Clone, Debug, Serialize, Deserialize)]
19 | pub struct Token {
20 |     pub word: String,
21 |     #[serde(skip)]
22 |     pub index: i32,
23 |     pub stem: i32,
24 |     pub potential_stem: Vec<i32>,
25 |     pub is_name: bool,
26 |     #[serde(skip)]
27 |     pub token_type: TokenType,
28 |     #[serde(skip)]
29 |     pub is_possessive: bool,
30 |     #[serde(skip)]
31 |     pub is_negative: bool,
32 |     pub pos: POSTag,
33 |     #[serde(skip)]
34 |     pub pos_prediction: POSPrediction,
35 |     #[serde(skip)]
36 |     pub potential_pos: Vec<POSTag>,
37 |     pub categories: Vec<i16>,
38 |     pub ner: Vec<i16>,
39 |     pub synonyms: Vec<i32>,
40 |     pub hypernyms: Vec<i32>,
41 |     pub hyponyms: Vec<i32>,
42 |     pub classification_scores: HashMap<String, f8>,
43 |     pub pronoun: Option<Pronoun>,
44 |     #[serde(skip)]
45 |     pub antecedent: Option<String>,
46 |     #[serde(skip)]
47 |     pub inner_word: String,
48 |     #[serde(skip)]
49 |     pub inner_value: String,
50 |     #[serde(skip)]
51 |     pub inner_unit: String,
52 | }
53 |
54 | /// Defines the type of a token, which can be a word, prefix, or suffix.
55 | #[derive(Default, Serialize, Deserialize, Eq, PartialEq, Clone, Debug)]
56 | pub enum TokenType {
57 |     #[default]
58 |     word,
59 |     prefix,
60 |     suffix,
61 | }
62 |
63 | impl Token {
64 |     /// Creates a new Token from a word using the vocabulary database, initializing its properties.
65 |     pub fn new(query_word: &str, vocab: &VocabDatabase) -> Token {
66 |         if query_word.is_empty() {
67 |             return Self::default();
68 |         }
69 |
70 |         // Get word lookup table
71 |         let (word, lookup) = match vocab.lookup_word(query_word) {
72 |             Some(r) => r,
73 |             None => return Self::unknown(query_word),
74 |         };
75 |         let (_, token_id) = lookup.iter().next().unwrap();
76 |
77 |         // Get token by id
78 |         let mut token = Self::from_id(*token_id, vocab);
79 |         token.word = word;
80 |         token.token_type = TokenType::word;
81 |         token.potential_pos = lookup.keys().copied().collect();
82 |
83 |         token
84 |     }
85 |
86 |     /// Creates a prefix Token from a word using the vocabulary database.
87 |     pub fn prefix(word: &str, vocab: &VocabDatabase) -> Token {
88 |         let mut token = Self::new(word, vocab);
89 |         token.token_type = TokenType::prefix;
90 |         token
91 |     }
92 |
93 |     /// Creates a suffix Token from a word using the vocabulary database.
94 |     pub fn suffix(word: &str, vocab: &VocabDatabase) -> Token {
95 |         let mut token = Self::new(word, vocab);
96 |         token.token_type = TokenType::suffix;
97 |         token
98 |     }
99 |
100 |     /// Creates a numeric Token with the specified word, setting inner word and value.
101 |     pub fn numeric(word: &str, vocab: &VocabDatabase) -> Token {
102 |         let mut token = Self::new("|num|", vocab);
103 |         token.inner_word = word.to_string();
104 |         token.inner_value = word.to_string();
105 |         token
106 |     }
107 |
108 |     /// Creates a special Token for system tags, with specified word, tag, value, and unit.
109 |     pub fn special(word: &str, tag: &str, value: &str, unit: &str, vocab: &VocabDatabase) -> Token {
110 |         let mut token = Self::new(tag, vocab);
111 |         token.inner_word = word.to_string();
112 |         token.inner_value = value.to_string();
113 |         token.inner_unit = unit.to_string();
114 |         token
115 |     }
116 |
117 |     /// Creates an unknown Token with the specified word and default properties.
118 |     pub fn unknown(word: &str) -> Token {
119 |         Self {
120 |             word: word.to_string(),
121 |             ..Default::default()
122 |         }
123 |     }
124 |
125 |     /// Creates a Token from a token ID using the vocabulary database, setting its index.
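    ///
    /// Sketch (assumes a loaded `VocabDatabase`; id 42 is hypothetical):
    /// ```ignore
    /// let token = Token::from_id(42, &vocab);
    /// assert_eq!(token.index, 42); // the id is recorded even if it is unknown
    /// ```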
126 |     pub fn from_id(token_id: i32, vocab: &VocabDatabase) -> Token {
127 |         let mut token = match vocab.words.id2token.get(&token_id) {
128 |             Some(r) => r.clone(),
129 |             None => Self::default(),
130 |         };
131 |         token.index = token_id;
132 |
133 |         token
134 |     }
135 |
136 |     /// Updates the POS tag of the Token, returning a new Token if the tag is valid in the vocabulary.
137 |     pub fn update_pos(&self, pos_code: POSTag, vocab: &VocabDatabase) -> Option<Token> {
138 |         // Get map
139 |         let index_map = vocab.words.wordlist.get(&self.word)?;
140 |
141 |         // Get token id
142 |         let index = index_map.get(&pos_code)?;
143 |
144 |         // Return token
145 |         let token = Self::from_id(*index, vocab);
146 |         Some(token)
147 |     }
148 |
149 |     /// Forces the Token to a verb POS tag if possible, returning a new Token or None if no verb tag is available.
150 |     pub fn force_verb(&self, vocab: &VocabDatabase) -> Option<Token> {
151 |         if self.is_verb() {
152 |             return None;
153 |         }
154 |
155 |         for code in self.potential_pos.iter() {
156 |             if !code.to_str().starts_with("V") {
157 |                 continue;
158 |             }
159 |             return self.update_pos(*code, vocab);
160 |         }
161 |
162 |         None
163 |     }
164 |
165 |     /// Checks if the Token has a category within the specified range.
166 |     pub fn has_category(&self, category_range: &Range<i16>) -> bool {
167 |         self.categories.iter().any(|&x| category_range.contains(&x))
168 |     }
169 |
170 |     /// Checks if the Token has a named entity recognition (NER) category within the specified range.
171 |     pub fn has_ner(&self, category_range: &Range<i16>) -> bool {
172 |         self.ner.iter().any(|&x| category_range.contains(&x))
173 |     }
174 |
175 |     /// Checks if the Token is a noun (starts with 'N' or is SYS).
176 |     pub fn is_noun(&self) -> bool {
177 |         self.pos.to_str().starts_with("N") || self.pos == POSTag::SYS
178 |     }
179 |
180 |     /// Checks if the Token is a verb (starts with 'V').
181 |     pub fn is_verb(&self) -> bool {
182 |         self.pos.to_str().starts_with("V")
183 |     }
184 |
185 |     /// Checks if the Token is a base verb (VB or VBG).
186 |     pub fn is_base_verb(&self) -> bool {
187 |         ["VB", "VBG"].contains(&self.pos.to_str().as_str())
188 |     }
189 |
190 |     /// Checks if the Token is a past verb (VBD, VBN, or VHP).
191 |     pub fn is_past_verb(&self) -> bool {
192 |         ["VBD", "VBN", "VHP"].contains(&self.pos.to_str().as_str())
193 |     }
194 |
195 |     /// Checks if the Token is a present verb (VB, VBG, VBZ, VH, or VHZ).
196 |     pub fn is_present_verb(&self) -> bool {
197 |         ["VB", "VBG", "VBZ", "VH", "VHZ"].contains(&self.pos.to_str().as_str())
198 |     }
199 |
200 |     /// Checks if the Token is a future verb (VF, VFG, or VHF).
201 |     pub fn is_future_verb(&self) -> bool {
202 |         ["VF", "VFG", "VHF"].contains(&self.pos.to_str().as_str())
203 |     }
204 |
205 |     /// Checks if the Token is an adjective (starts with 'JJ').
206 |     pub fn is_adjective(&self) -> bool {
207 |         self.pos.to_str().starts_with("JJ")
208 |     }
209 |
210 |     /// Checks if the Token is an adverb (starts with 'RB').
211 |     pub fn is_adverb(&self) -> bool {
212 |         self.pos.to_str().starts_with("RB")
213 |     }
214 |
215 |     /// Checks if the Token is a named entity (starts with 'NNP').
216 |     pub fn is_named_entity(&self) -> bool {
217 |         self.pos.to_str().starts_with("NNP")
218 |     }
219 |
220 |     /// Checks if the Token is an n-gram (MWE).
221 |     pub fn is_ngram(&self) -> bool {
222 |         self.pos == POSTag::MWE
223 |     }
224 |
225 |     /// Checks if the Token is a conjunction (starts with 'C').
226 |     pub fn is_conjunction(&self) -> bool {
227 |         self.pos.to_str().starts_with("C")
228 |     }
229 |
230 |     /// Checks if the Token is a determiner (DT).
231 |     pub fn is_determiner(&self) -> bool {
232 |         self.pos == POSTag::DT
233 |     }
234 |
235 |     /// Checks if the Token is a pronoun (PR or PRP).
236 |     pub fn is_pronoun(&self) -> bool {
237 |         self.pos == POSTag::PR || self.pos == POSTag::PRP
238 |     }
239 |
240 |     /// Checks if the Token is a modal verb (MD).
241 |     pub fn is_modal_verb(&self) -> bool {
242 |         self.pos == POSTag::MD
243 |     }
244 |
245 |     /// Checks if the Token is a preposition (IN).
246 |     pub fn is_preposition(&self) -> bool {
247 |         self.pos == POSTag::IN
248 |     }
249 |
250 |     /// Checks if the Token is a sentence stopper (SS).
251 |     pub fn is_sentence_stopper(&self) -> bool {
252 |         self.pos == POSTag::SS
253 |     }
254 |
255 |     /// Check if the token is a punctuation mark
256 |     pub fn is_punctuation(&self) -> bool {
257 |         self.pos == POSTag::SS || self.pos == POSTag::PUNC
258 |     }
259 |
260 |     /// Checks if the Token is a potential phrase splitter (PUNC).
261 |     pub fn is_phrase_splitter(&self) -> bool {
262 |         self.pos == POSTag::PUNC
263 |     }
264 |
265 |     /// Retrieves the category vectors for the Token from the vocabulary database.
266 |     pub fn get_category_vec(&self, vocab: &VocabDatabase) -> Vec<Vec<i16>> {
267 |         let mut res: Vec<Vec<i16>> = Vec::new();
268 |         for category_id in self.categories.iter() {
269 |             let cat = match vocab.categories.get(category_id) {
270 |                 Some(r) => r,
271 |                 None => continue,
272 |             };
273 |             res.push(cat.fqn.clone());
274 |         }
275 |
276 |         res
277 |     }
278 |
279 |     /// Calculates the semantic distance between two Tokens based on their category vectors.
280 |     pub fn get_distance(&self, token2: &Token, vocab: &VocabDatabase) -> f32 {
281 |         // Get category vectors
282 |         let token1_categories = self.get_category_vec(vocab);
283 |         let token2_categories = token2.get_category_vec(vocab);
284 |
285 |         // Initialize
286 |         let mut total_score = 0.0;
287 |         let mut comparisons = 0;
288 |
289 |         // Go through categories, calculate distance / score
290 |         for cat1 in token1_categories.iter() {
291 |             for cat2 in token2_categories.iter() {
292 |                 let depth = self.get_common_category_depth(cat1, cat2);
293 |                 //let depth = 1;
294 |                 total_score += depth as f32;
295 |                 comparisons += 1;
296 |             }
297 |         }
298 |
299 |         if comparisons > 0 {
300 |             total_score / comparisons as f32
301 |         } else {
302 |             0.0
303 |         }
304 |     }
305 |
306 |     /// Calculates the common depth between two category paths.
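    ///
    /// For example, paths `[3, 7, 2, 9]` and `[3, 7, 5]` share a depth of 2; the
    /// walk stops at the first mismatch:
    /// ```ignore
    /// assert_eq!(token.get_common_category_depth(&[3, 7, 2, 9], &[3, 7, 5]), 2);
    /// ```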
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some(antecedent) = &self.antecedent {
            write!(
                f,
                "{} ({}), antecedent: {}",
                self.word, self.pos, antecedent
            )
        } else if self.pos == POSTag::SYS && !self.inner_word.is_empty() {
            write!(
                f,
                "{} ({}), inner word: {}, value: {}, unit: {}",
                self.word, self.pos, self.inner_word, self.inner_value, self.inner_unit
            )
        } else {
            write!(f, "{} ({})", self.word, self.pos)
        }
    }
}

impl fmt::Display for TokenType {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenType::word => write!(f, "word"),
            TokenType::prefix => write!(f, "prefix"),
            TokenType::suffix => write!(f, "suffix"),
        }
    }
}
--------------------------------------------------------------------------------
/src/sophia/src/vocab/spell_check.rs:
--------------------------------------------------------------------------------
// Copyright 2025 Aquila Labs of Alberta, Canada
// Licensed under the PolyForm Noncommercial License 1.0.0
// Commercial use requires a separate license: https://cicero.sh/sophia/
// License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
// Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.

use super::VocabDatabase;
use crate::pos_tagger::{POSPrefix, POSSuffix, POSTag};
use crate::tokenizer::Token;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;

const MAX_FREQUENCY: usize = 3;
const FREQUENCY_WEIGHT: f32 = 0.40;
const DISTANCE_WEIGHT: f32 = 0.85;
const TAG_BEFORE_WEIGHT: f32 = 0.55;
const WORD_BEFORE_WEIGHT: f32 = 0.65;
const SUFFIX_BONUS: f32 = 0.75;
const PREFIX_BONUS: f32 = 0.75;
const DOUBLE_LETTER_BONUS: f32 = 0.35;

/// Represents the automated spell checker, namely the various cohorts,
/// keyed by POS and word length, that are used to minimize the search space of possible corrections.
#[derive(Default, Clone, Serialize, Deserialize)]
pub struct SpellChecker {
    pub cohorts: HashMap<SpellCheckerCohort, Vec<SpellCheckerEntry>>,
}
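// Illustrative cohort lookup: a 6-letter word whose POS prediction leans
// noun is searched under the (noun, short_medium) and (noun, medium)
// cohorts only, rather than against the whole vocabulary.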
/// Individual entry for a candidate; stores the
/// preceding tag / word frequencies for weighted scoring.
#[derive(Default, Clone, Serialize, Deserialize)]
pub struct SpellCheckerEntry {
    pub word_index: i32,
    pub tag_before: Vec<POSTag>,
    pub word_before: Vec<i32>,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub struct SpellCheckerCohort {
    pub pos: SpellCheckerCohortPOS,
    pub length: SpellCheckerCohortSize,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum SpellCheckerCohortPOS {
    noun,
    verb,
    adverb,
    adjective,
    entity,
    other,
}

#[derive(Debug, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum SpellCheckerCohortSize {
    short,        // <= 4 chars
    short_medium, // 5 or 6 chars
    medium,       // 7 or 8 chars
    medium_long,  // 9 or 10 chars
    long,         // 11+ chars
}

/// Candidate spelling correction, used to rank and
/// score possible corrections.
#[derive(Default, Clone)]
struct Candidate {
    pub token: Token,
    pub item: SpellCheckerEntry,
    pub score: f32,
    pub frequency: usize,
    pub distance: usize,
    pub tag_before: usize,
    pub word_before: usize,
    pub same_suffix: bool,
    pub same_prefix: bool,
    pub has_double_letter: bool,
}

impl SpellChecker {
    /// Check word for corrected spelling
    pub fn try_correct(
        &self,
        position: usize,
        tokens: &[Token],
        vocab: &VocabDatabase,
    ) -> Option<Token> {
        // Get candidates
        let mut candidates = self.get_candidates(&tokens[position], vocab);

        // Look for spelling correction
        for queue in &mut candidates {
            if queue.is_empty() {
                continue;
            }

            // Score candidates
            self.score_candidates(queue, position, tokens);

            // Sort candidates and return the best match
            queue.sort_unstable_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
            return Some(queue[0].token.clone());
        }

        None
    }

    /// Get cohorts based on POS tag and word length
    fn get_cohorts(&self, token: &Token) -> Vec<SpellCheckerCohort> {
        // Get tags with a minimum probability of 0.2
        let tags = token
            .pos_prediction
            .probabilities
            .iter()
            .filter(|(_, score)| **score >= 0.2)
            .map(|(tag, _)| *tag)
            .collect::<Vec<POSTag>>();

        // Go through tags
        let cohorts: Vec<SpellCheckerCohort> = tags
            .iter()
            .flat_map(|tag| {
                let pos = SpellCheckerCohortPOS::from(*tag);
                let sizes = SpellCheckerCohortSize::get_sizes(token.word.len());

                sizes
                    .iter()
                    .map(|length| SpellCheckerCohort {
                        pos: pos.clone(),
                        length: length.clone(),
                    })
                    .collect::<Vec<SpellCheckerCohort>>()
            })
            .collect();

        cohorts
    }

    /// Get initial candidates, bucketed by Levenshtein distance
    fn get_candidates(&self, token: &Token, vocab: &VocabDatabase) -> Vec<Vec<Candidate>> {
        // Get cohorts
        let cohorts = self.get_cohorts(token);
        let mut candidates: Vec<Vec<Candidate>> = vec![vec![]; 4];
        let word = token.word.to_lowercase();

        // Go through cohorts
        for cohort in cohorts.iter() {
            let search = match self.cohorts.get(cohort) {
                Some(r) => r,
                None => continue,
            };

            // Initialize variables
            let mut frequency = MAX_FREQUENCY;
            let freq_interval = search.len() / 3;

            // Gather candidates
            for (x, item) in search.iter().enumerate() {
                if freq_interval > 0 && x > 0 && x % freq_interval == 0 {
                    frequency -= 1;
                }
                let s_token = vocab.from_int(item.word_index);

                // Get distance
                let lev_distance = self.levenshtein(&word, &s_token.word);
                let distance = candidates.len().saturating_sub(lev_distance);
                if distance > 0 && lev_distance > 0 {
                    candidates[lev_distance - 1]
                        .push(Candidate::new(frequency, distance, &s_token, item));
                }
            }
        }

        candidates
    }
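    // Worked example (hypothetical input): for the misspelling "freind"
    // (6 letters, predicted as a noun), get_cohorts yields the noun
    // cohorts for short_medium and medium lengths. "friend" sits at
    // Levenshtein distance 2, so it lands in bucket candidates[1];
    // try_correct only scores that bucket if the distance-1 bucket is
    // empty.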
    /// Score candidates
    fn score_candidates(&self, candidates: &mut [Candidate], position: usize, tokens: &[Token]) {
        // Iterate through candidates
        for cand in candidates.iter_mut() {
            // Get preceding tag and word score
            if position > 0 {
                if let Some(idx) =
                    cand.item.tag_before.iter().position(|&tag| tag == tokens[position - 1].pos)
                {
                    cand.tag_before = self.get_frequency_idx(idx, cand.item.tag_before.len());
                }

                if let Some(idx) = cand
                    .item
                    .word_before
                    .iter()
                    .position(|&w_idx| w_idx == tokens[position - 1].index)
                {
                    cand.word_before = self.get_frequency_idx(idx, cand.item.word_before.len());
                }
            }

            // Check suffix
            if let Ok(suffix) = POSSuffix::try_from(&tokens[position])
                && let Ok(chk_suffix) = POSSuffix::try_from(&cand.token)
            {
                cand.same_suffix = suffix == chk_suffix;
            }

            // Check prefix
            if let Ok(prefix) = POSPrefix::try_from(&tokens[position])
                && let Ok(chk_prefix) = POSPrefix::try_from(&cand.token)
            {
                cand.same_prefix = prefix == chk_prefix;
            }

            // Check double letter
            cand.has_double_letter =
                self.check_double_letter(&tokens[position].word.to_lowercase(), &cand.token.word);

            // Score candidate
            cand.score = cand.calculate_score();
        }
    }

    /// Check whether the word has a doubled letter that the candidate lacks (a double-letter typo)
    fn check_double_letter(&self, word: &str, candidate_word: &str) -> bool {
        let letters: Vec<char> = word.chars().collect();
        for (x, char) in letters[1..].iter().enumerate() {
            if *char == letters[x] {
                // Check if candidate has the same double letter
                let chk = format!("{}{}", char, char);
                if !candidate_word.contains(&chk) {
                    return true;
                }
            }
        }

        false
    }

    /// Get frequency-based score
    fn get_frequency_idx(&self, idx: usize, total: usize) -> usize {
        let interval = total / MAX_FREQUENCY;
        if idx == 0 || interval == 0 {
            MAX_FREQUENCY
        } else {
            MAX_FREQUENCY.saturating_sub(idx / interval)
        }
    }

    /// Calculate Levenshtein distance (measured in chars, not bytes)
    fn levenshtein(&self, s1: &str, s2: &str) -> usize {
        let len1 = s1.chars().count();
        let len2 = s2.chars().count();

        if len1 == 0 {
            return len2;
        }
        if len2 == 0 {
            return len1;
        }

        let mut prev_row = vec![0; len2 + 1];
        let mut curr_row = vec![0; len2 + 1];

        for j in 0..=len2 {
            prev_row[j] = j;
        }

        for (i, c1) in s1.chars().enumerate() {
            curr_row[0] = i + 1;

            for (j, c2) in s2.chars().enumerate() {
                let cost = if c1 == c2 { 0 } else { 1 };
                curr_row[j + 1] = std::cmp::min(
                    std::cmp::min(prev_row[j + 1] + 1, curr_row[j] + 1),
                    prev_row[j] + cost,
                );
            }

            std::mem::swap(&mut prev_row, &mut curr_row);
        }

        prev_row[len2]
    }
}
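// Worked scoring example (illustrative values): a candidate with
// frequency 3, distance 2, tag_before 1, word_before 0, and a matching
// suffix scores 3 * 0.40 + 2 * 0.85 + 1 * 0.55 + 0.75 = 4.20 under
// calculate_score below.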
impl Candidate {
    pub fn new(frequency: usize, distance: usize, token: &Token, item: &SpellCheckerEntry) -> Self {
        Self {
            token: token.clone(),
            item: item.clone(),
            frequency,
            distance,
            ..Default::default()
        }
    }

    /// Score the candidate
    pub fn calculate_score(&mut self) -> f32 {
        let mut score =
            (self.frequency as f32 * FREQUENCY_WEIGHT) + (self.distance as f32 * DISTANCE_WEIGHT);
        score += (self.tag_before as f32 * TAG_BEFORE_WEIGHT)
            + (self.word_before as f32 * WORD_BEFORE_WEIGHT);

        if self.same_suffix {
            score += SUFFIX_BONUS;
        }
        if self.same_prefix {
            score += PREFIX_BONUS;
        }
        if self.has_double_letter {
            score += DOUBLE_LETTER_BONUS;
        }

        score
    }
}

impl fmt::Debug for Candidate {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "word {} pos {} score {:.2} frequency {} distance {} tag_before {} word_before {} suffix {} prefix {}",
            self.token.word,
            self.token.pos,
            self.score,
            self.frequency,
            self.distance,
            self.tag_before,
            self.word_before,
            self.same_suffix,
            self.same_prefix
        )
    }
}

impl From<POSTag> for SpellCheckerCohortPOS {
    fn from(tag: POSTag) -> Self {
        match tag {
            t if t.is_named_entity() => Self::entity,
            t if t.is_noun() => Self::noun,
            t if t.is_verb() => Self::verb,
            t if t.is_adverb() => Self::adverb,
            t if t.is_adjective() => Self::adjective,
            _ => Self::other,
        }
    }
}

impl From<usize> for SpellCheckerCohortSize {
    fn from(length: usize) -> Self {
        match length {
            len if len <= 4 => Self::short,
            len if len <= 6 => Self::short_medium,
            len if len <= 8 => Self::medium,
            len if len <= 10 => Self::medium_long,
            _ => Self::long,
        }
    }
}

impl SpellCheckerCohortSize {
    pub fn get_sizes(length: usize) -> Vec<Self> {
        match length {
            len if len <= 3 => vec![Self::short],
            len if len <= 5 => vec![Self::short, Self::short_medium],
            len if len <= 7 => vec![Self::short_medium, Self::medium],
            len if len <= 11 => vec![Self::medium, Self::medium_long, Self::long],
            _ => vec![Self::medium_long, Self::long],
        }
    }
}
--------------------------------------------------------------------------------
/src/sophia/src/pos_tagger/model.rs:
--------------------------------------------------------------------------------
// Copyright 2025 Aquila Labs of Alberta, Canada
// Licensed under the PolyForm Noncommercial License 1.0.0
// Commercial use requires a separate license: https://cicero.sh/sophia/
// License text: https://polyformproject.org/licenses/noncommercial/1.0.0/
// Distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND.

use super::{
    POSContext, POSFeature, POSPrediction, POSPredictionMethod, POSPrefix, POSSuffix, POSTag,
    SIBLING_TAGS_AFTER, SIBLING_TAGS_BEFORE, TokenKey,
};
use crate::tokenizer::Token;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::hash::Hash;

pub trait POSModelInterface {
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction>;
}

#[derive(Default, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSModel<S> {
    pub word: String,
    pub target_tags: Vec<POSTag>,
    pub tag_freq: HashMap<POSTag, f32>,
    pub features: HashMap<POSFeature<S>, POSWeight>,
    pub conjunctions: HashMap<POSFeature<S>, Vec<POSConjunction<S>>>,
}

#[derive(Default, Debug, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSConjunction<S> {
    pub weight: POSWeight,
    pub deterministic_tag: Option<POSTag>,
    pub siblings: Vec<POSFeature<S>>,
    pub exceptions: Vec<(POSFeature<S>, Option<POSTag>)>,
}

#[derive(Default, Debug, Clone, Serialize, Deserialize)]
pub struct POSWeight {
    pub tags: HashMap<POSTag, f32>,
    pub weight: f32,
    pub mi_score: f32,
}

struct POSPositionTracker<S> {
    primary: Vec<Option<POSScore<S>>>,
    secondary: Vec<HashMap<POSTag, f32>>,
}

#[derive(Clone)]
struct POSScore<S> {
    feature: POSFeature<S>,
    tags: HashMap<POSTag, f32>,
}

#[derive(Default, Clone, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSTagModel<S> {
    pub target_tags: Vec<POSTag>,
    pub global: POSModel<S>,
    pub words: HashMap<S, POSModel<S>>,
}

#[derive(Default, Serialize, Deserialize)]
#[serde(bound = "S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>")]
pub struct POSTagModelRepo<S> {
    pub tags: HashMap<S, Vec<POSTag>>,
    pub models: HashMap<S, POSTagModel<S>>,
}

impl<S> POSModelInterface for POSModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Resolve an ambiguous word
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Get context
        let context = POSContext::from_tokens(position, tokens);
        let context_vec: HashSet<POSFeature<S>> = context.iter_ft().collect();

        // Check conjunctions
        if let Some(pred) = self.check_conjunctions(position, &context, &context_vec, tokens) {
            return Some(pred);
        }

        // Check confidence score

        // Fallback to individual features
        let mut tracker = POSPositionTracker::new();
        for feature in context.iter_ft() {
            if let Some(weight) = self.features.get(&feature) {
                tracker.add_feature(&feature, weight);
            }
        }

        // Combine scores
        let scores = tracker.combine(&self.tag_freq);
        if scores.is_empty() {
            return None;
        }

        // Get highest score
        let mut scores_vec = scores.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        Some(POSPrediction::new(
            POSPredictionMethod::standard,
            &tokens[position].word,
            tokens[position].pos,
            *scores_vec[0].0,
            *scores_vec[0].1,
            &scores,
            &[],
        ))
    }
}
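// Prediction order, summarizing the flow above: conjunction rules fire
// first and may short-circuit with an exception or deterministic tag;
// otherwise each matching context feature contributes weighted tag scores
// through POSPositionTracker, and the highest combined score wins.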
impl<S> POSModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Check conjunctions
    fn check_conjunctions(
        &self,
        position: usize,
        context: &POSContext<S>,
        context_vec: &HashSet<POSFeature<S>>,
        tokens: &[Token],
    ) -> Option<POSPrediction> {
        let mut scores: HashMap<POSTag, f32> = HashMap::new();

        // Go through context
        for feature in context_vec.iter() {
            let conjunction_set = match self.conjunctions.get(feature) {
                Some(r) => r,
                None => continue,
            };

            // Find strongest matching conjunction, if any
            for conjunction in conjunction_set.iter() {
                // Check exceptions
                if let Some(tag) = self.check_exceptions(conjunction, context_vec) {
                    if tag.is_none() {
                        continue;
                    }
                    return Some(POSPrediction::new(
                        POSPredictionMethod::exception,
                        &tokens[position].word,
                        tokens[position].pos,
                        tag.unwrap(),
                        1.0,
                        &HashMap::new(),
                        &[],
                    ));
                }

                // Check siblings
                if !conjunction.siblings.iter().all(|sib| {
                    let offset =
                        ((feature.offset + sib.offset) + (SIBLING_TAGS_BEFORE as i8)) as usize;
                    context.0[offset].contains(&sib.feature_token)
                }) {
                    continue;
                }

                // Check for deterministic tag
                if let Some(tag) = conjunction.deterministic_tag {
                    return Some(POSPrediction::new(
                        POSPredictionMethod::deterministic_rule,
                        &tokens[position].word,
                        tokens[position].pos,
                        tag,
                        1.0,
                        &HashMap::new(),
                        &[],
                    ));
                }

                // Add to results
                for (tag, score) in conjunction.weight.tags.iter() {
                    *scores.entry(*tag).or_insert(0.0) += *score * conjunction.weight.weight;
                }
                break;
            }
        }

        // Check for none
        if scores.is_empty() {
            return None;
        }

        // Get highest score
        let mut scores_vec = scores.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        Some(POSPrediction::new(
            POSPredictionMethod::conjunction,
            &tokens[position].word,
            tokens[position].pos,
            *scores_vec[0].0,
            *scores_vec[0].1,
            &scores,
            &[],
        ))
    }

    /// Check exceptions
    fn check_exceptions(
        &self,
        conjunction: &POSConjunction<S>,
        context_vec: &HashSet<POSFeature<S>>,
    ) -> Option<Option<POSTag>> {
        for (exception, opt_tag) in conjunction.exceptions.iter() {
            if !context_vec.contains(exception) {
                continue;
            }

            if let Some(_tag) = opt_tag {
                //return Some(Some(*tag));
                return Some(None);
            } else {
                return Some(None);
            }
        }

        None
    }
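    // Worked scaling example for predict_cohort below (illustrative
    // numbers): a raw NN score of 0.5 with an overall tag frequency of
    // 0.40 is normalized and then multiplied by sqrt(0.10 / 0.40) = 0.5,
    // damping tags that are already common overall.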
    /// Predict cohort, used for automated spelling corrections to narrow the search space of candidates
    pub fn predict_cohort(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Get initial prediction
        let mut pred = self.predict(position, tokens)?;
        let max_value = pred.probabilities.values().sum::<f32>();

        // Scale and normalize probabilities
        for (tag, score) in pred.probabilities.iter_mut() {
            let overall_score = self.tag_freq.get(tag).unwrap_or(&0.0);
            *score /= max_value;
            *score *= (0.10 / overall_score.max(1e-6)).sqrt();
        }

        // Add suffix bonus
        if let Ok(suffix) = POSSuffix::try_from(&tokens[position]) {
            let (suffix_tag, suffix_bonus) = match suffix {
                POSSuffix::ed | POSSuffix::ing => (POSTag::VB, 0.10),
                POSSuffix::day
                | POSSuffix::ion
                | POSSuffix::tion
                | POSSuffix::ness
                | POSSuffix::ment
                | POSSuffix::ity
                | POSSuffix::ty
                | POSSuffix::ance
                | POSSuffix::ence
                | POSSuffix::age
                | POSSuffix::ship
                | POSSuffix::hood => (POSTag::NN, 0.15),
                POSSuffix::wise => (POSTag::RB, 0.15),
                POSSuffix::ly | POSSuffix::ward => (POSTag::RB, 0.15),
                POSSuffix::er
                | POSSuffix::est
                | POSSuffix::ous
                | POSSuffix::less
                | POSSuffix::ful
                | POSSuffix::able
                | POSSuffix::ible => (POSTag::JJ, 0.15),
                POSSuffix::al | POSSuffix::ive => (POSTag::JJ, 0.10),
                _ => (POSTag::FW, 0.0),
            };

            if pred.probabilities.contains_key(&suffix_tag) {
                *pred.probabilities.get_mut(&suffix_tag).unwrap() += suffix_bonus;
            }
        }

        // Add prefix bonus
        if let Ok(prefix) = POSPrefix::try_from(&tokens[position]) {
            let (prefix_tag, prefix_bonus) = match prefix {
                POSPrefix::non
                | POSPrefix::anti
                | POSPrefix::semi
                | POSPrefix::uni
                | POSPrefix::bi
                | POSPrefix::tri
                | POSPrefix::quad
                | POSPrefix::mono
                | POSPrefix::pseudo
                | POSPrefix::quasi => (POSTag::JJ, 0.075),
                POSPrefix::en | POSPrefix::em | POSPrefix::mis => (POSTag::VB, 0.075),
                POSPrefix::sub | POSPrefix::inter | POSPrefix::intra | POSPrefix::trans => {
                    (POSTag::NN, 0.075)
                }
                POSPrefix::un
                | POSPrefix::pre
                | POSPrefix::over
                | POSPrefix::micro
                | POSPrefix::mega
                | POSPrefix::extra
                | POSPrefix::poly => (POSTag::JJ, 0.05),
                POSPrefix::re | POSPrefix::dis | POSPrefix::de => (POSTag::VB, 0.05),
                POSPrefix::co | POSPrefix::com | POSPrefix::post | POSPrefix::fore => {
                    (POSTag::NN, 0.05)
                }
                _ => (POSTag::FW, 0.0),
            };

            if pred.probabilities.contains_key(&prefix_tag) {
                *pred.probabilities.get_mut(&prefix_tag).unwrap() += prefix_bonus;
            }
        }

        // Normalize
        let total = pred.probabilities.values().sum::<f32>();
        for (_, score) in pred.probabilities.iter_mut() {
            *score /= total;
        }

        // Get highest ranking probability
        let mut scores_vec = pred.probabilities.iter().collect::<Vec<_>>();
        scores_vec.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap());

        // Set new tag
        pred.tag = *scores_vec[0].0;
        pred.confidence = *scores_vec[0].1;

        Some(pred)
    }
}

impl<S> POSModelInterface for POSTagModel<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    /// Predict tag for an ambiguous word
    fn predict(&self, position: usize, tokens: &[Token]) -> Option<POSPrediction> {
        // Check per-word models
        if let Some(model) = self.words.get(&tokens[position].get_key())
            && let Some(pred) = model.predict(position, tokens)
            && pred.confidence >= 0.85
        {
            return Some(pred);
        }

        self.global.predict(position, tokens)
    }
}

impl<S> POSPositionTracker<S>
where
    S: Default + Clone + Eq + PartialEq + Hash + Serialize + for<'a> Deserialize<'a>,
    Token: TokenKey<S>,
{
    pub fn new() -> Self {
        let length = SIBLING_TAGS_BEFORE + SIBLING_TAGS_AFTER + 1;

        Self {
            primary: vec![None; length],
            secondary: vec![HashMap::new(); length],
        }
    }
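    // Primary vs. secondary features (see add_feature below): each context
    // position keeps only its single strongest primary feature, while
    // secondary features accumulate additively; combine() then merges both
    // per position and blends in prior tag frequencies at an 80/20 ratio.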
    /// Update the tracker with a new feature
    pub fn add_feature(&mut self, feature: &POSFeature<S>, weight: &POSWeight) {
        let index = (SIBLING_TAGS_BEFORE as i8 + feature.offset) as usize;

        // Secondary feature
        if !feature.feature_token.is_primary() {
            for (tag, score) in weight.tags.iter() {
                *self.secondary[index].entry(*tag).or_insert(0.0) += weight.weight * *score;
            }

        // Primary feature; keep only the strongest feature per position
        } else if self.primary[index].is_none()
            || feature.get_score() > self.primary[index].as_ref().unwrap().feature.get_score()
        {
            let tags: HashMap<POSTag, f32> =
                weight.tags.iter().map(|(tag, score)| (*tag, (weight.weight * *score))).collect();

            self.primary[index] = Some(POSScore {
                feature: feature.clone(),
                tags,
            });
        }
    }

    /// Combine scores
    pub fn combine(&self, tag_freq: &HashMap<POSTag, f32>) -> HashMap<POSTag, f32> {
        let mut scores: HashMap<POSTag, f32> = HashMap::new();
        for (x, score_opt) in self.primary.iter().enumerate() {
            let score = match score_opt {
                Some(r) => r,
                None => continue,
            };

            for (tag, tag_score) in score.tags.iter() {
                *scores.entry(*tag).or_insert(0.0) += *tag_score;
            }

            // Add secondary
            for (tag, tag_score) in self.secondary[x].iter() {
                *scores.entry(*tag).or_insert(0.0) += *tag_score;
            }
        }

        // Blend in overall tag frequency (80% feature score, 20% prior)
        for (tag, score) in scores.iter_mut() {
            if let Some(freq_score) = tag_freq.get(tag) {
                *score = (*score * 0.8) + (freq_score * 0.2);
            }
        }

        scores
    }
}
--------------------------------------------------------------------------------