├── .gitattributes ├── extractor ├── .DS_Store ├── Cargo.toml ├── src │ ├── helpers.rs │ ├── file_generation.rs │ ├── scratch.rs │ └── main.rs ├── adj_check.csv ├── noun_plural_check.csv ├── insane_noun.csv └── analyzed_endings.csv ├── english-core ├── .DS_Store ├── src │ ├── lib.rs │ ├── utils.rs │ ├── grammar.rs │ ├── noun.rs │ ├── verb.rs │ └── adj.rs └── Cargo.toml ├── src ├── snippets.rs ├── adj.rs ├── noun.rs ├── verb.rs └── lib.rs ├── .gitignore ├── Cargo.toml ├── LICENSE ├── examples ├── speedmark.rs ├── test.rs └── test2.rs └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /extractor/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gold-silver-copper/english/HEAD/extractor/.DS_Store -------------------------------------------------------------------------------- /english-core/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gold-silver-copper/english/HEAD/english-core/.DS_Store -------------------------------------------------------------------------------- /english-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod adj; 2 | pub mod grammar; 3 | mod noun; 4 | 5 | mod utils; 6 | mod verb; 7 | pub use crate::grammar::*; 8 | pub struct EnglishCore {} 9 | -------------------------------------------------------------------------------- /extractor/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "extractor" 3 | version = "0.1.0" 4 | edition = "2024" 5 | 6 | [dependencies] 7 | csv = "1.3.1" 8 | serde = { version = "1.0.219", features = ["derive"] } 9 | serde_json = 
"1.0.140" 10 | english-core = { path = "../english-core" } 11 | # english is only used for benchmarking 12 | english = { path = ".." } 13 | -------------------------------------------------------------------------------- /src/snippets.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | impl English { 4 | pub fn simple_sentence(object: &Noun, subject: &Noun, verb: &Verb) -> String { 5 | let verb_str = English::verb( 6 | &verb.word, 7 | &verb.person, 8 | &subject.number, 9 | &verb.tense, 10 | &verb.form, 11 | ); 12 | 13 | format!("{} {} {}.", subject, verb_str, object) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /english-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "english-core" 3 | version = "0.1.0" 4 | authors = ["gold-silver-copper"] 5 | edition = "2024" 6 | include = ["LICENSE-APACHE", "LICENSE-MIT", "**/*.rs", "Cargo.toml"] 7 | 8 | description = "English language inflector" 9 | license = "MIT OR Apache-2.0" 10 | repository = "https://github.com/gold-silver-copper/english" 11 | 12 | 13 | [lib] 14 | crate-type = ["cdylib", "rlib"] 15 | 16 | [profile.release] 17 | opt-level = 2 # fast and small wasm 18 | 19 | # Optimize all dependencies even in debug builds: 20 | [profile.dev.package."*"] 21 | opt-level = 2 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | .DS_Store 6 | 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 8 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 9 | Cargo.lock 10 | 11 | # These are backup files generated by rustfmt 12 | **/*.rs.bk 13 | 14 | # 
MSVC Windows builds of rustc generate these, which store debugging information 15 | *.pdb 16 | extractor/english_filtered.jsonl 17 | english-core/.DS_Store 18 | extractor/adjectives.csv 19 | extractor/nouns_with_plurals.csv 20 | extractor/verb_conjugations.csv 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "english" 3 | version = "0.1.3" 4 | authors = ["gold-silver-copper"] 5 | edition = "2024" 6 | include = ["LICENSE-APACHE", "LICENSE-MIT", "**/*.rs", "Cargo.toml"] 7 | 8 | description = "English inflector decliner conjugator from wiktionary data" 9 | license = "MIT OR Apache-2.0" 10 | repository = "https://github.com/gold-silver-copper/english" 11 | 12 | 13 | [lib] 14 | crate-type = ["cdylib", "rlib"] 15 | 16 | 17 | [profile.release] 18 | opt-level = 2 # fast and small wasm 19 | 20 | # Optimize all dependencies even in debug builds: 21 | [profile.dev.package."*"] 22 | opt-level = 2 23 | 24 | [dependencies] 25 | #english-core = { path = "english-core" } 26 | english-core = "0.1.0" 27 | phf = { version = "0.12", default-features = false, features = ["macros"] } 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 zombkit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /english-core/src/utils.rs: -------------------------------------------------------------------------------- 1 | use crate::EnglishCore; 2 | use crate::grammar::*; 3 | impl EnglishCore { 4 | pub fn pair_match(word: &str, listik: &[(&str, &str)]) -> Option { 5 | listik 6 | .iter() 7 | .find(|(sing, _)| *sing == word) 8 | .map(|(_, plur)| plur.to_string()) 9 | } 10 | 11 | pub fn replace_last_occurence(input: &str, pattern: &str, replacement: &str) -> String { 12 | if let Some(last_index) = input.rfind(pattern) { 13 | let (before_last, _after_last) = input.split_at(last_index); 14 | format!("{}{}", before_last, replacement) 15 | } else { 16 | input.into() 17 | } 18 | } 19 | pub fn iter_replace_last(word: &str, pairs: &[(&str, &str)]) -> Option { 20 | for (sing, plur) in pairs { 21 | if word.ends_with(sing) { 22 | return Some(EnglishCore::replace_last_occurence(word, sing, plur)); 23 | } 24 | } 25 | None 26 | } 27 | 28 | pub fn starts_with_uppercase(word: &str) -> bool { 29 | word.chars() 30 | .next() 31 | .map(|c| c.is_uppercase()) 32 | .unwrap_or(false) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/adj.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | ///The Adj struct is used for holding adjective 
impl Adj {
    // ---------------------------
    // ADJECTIVE HELPERS
    // ---------------------------

    /// Returns the comparative form of an adjective.
    ///
    /// Forwards to `English::adj` with `Degree::Comparative`.
    ///
    /// # Examples
    /// ```
    /// assert_eq!(Adj::comparative("fast2"), "faster");
    /// assert_eq!(Adj::comparative("fun"), "more fun");
    /// ```
    pub fn comparative(word: &str) -> String {
        English::adj(word, &Degree::Comparative)
    }

    /// Returns the superlative form of an adjective.
    ///
    /// Forwards to `English::adj` with `Degree::Superlative`.
    ///
    /// # Examples
    /// ```
    /// assert_eq!(Adj::superlative("fast2"), "fastest");
    /// assert_eq!(Adj::superlative("fun"), "most fun");
    /// ```
    pub fn superlative(word: &str) -> String {
        English::adj(word, &Degree::Superlative)
    }

    /// Returns the positive (base) form of an adjective.
    ///
    /// Forwards to `English::adj` with `Degree::Positive`.
    ///
    /// # Examples
    /// ```
    /// assert_eq!(Adj::positive("fast2"), "fast");
    /// ```
    pub fn positive(word: &str) -> String {
        English::adj(word, &Degree::Positive)
    }
}
Third, 34 | } 35 | #[derive(Debug, PartialEq, Clone)] 36 | pub enum Gender { 37 | Masculine, 38 | Feminine, 39 | Neuter, 40 | } 41 | #[derive(Debug, PartialEq, Clone)] 42 | pub enum Degree { 43 | Positive, 44 | Comparative, 45 | Superlative, 46 | } 47 | 48 | /*#[derive(Debug, PartialEq, Clone)] 49 | pub enum Mood { 50 | Indicative, 51 | Subjunctive, 52 | Imperative, 53 | // Conditional, Interrogative, etc. 54 | } */ 55 | /*#[derive(Debug, PartialEq, Clone)] 56 | pub enum Det { 57 | Definite, 58 | Indefinite, 59 | } 60 | */ 61 | /*#[derive(Debug, PartialEq, Clone)] 62 | pub enum Voice { 63 | Active, 64 | Passive, 65 | // Middle, Reflexive, etc. 66 | } 67 | */ 68 | -------------------------------------------------------------------------------- /english-core/src/noun.rs: -------------------------------------------------------------------------------- 1 | use crate::EnglishCore; 2 | use crate::grammar::*; 3 | 4 | impl EnglishCore { 5 | pub fn noun(word: &str, number: &Number) -> String { 6 | match number { 7 | Number::Singular => return word.to_string(), 8 | Number::Plural => return EnglishCore::pluralize_noun(word), 9 | } 10 | } 11 | pub fn add_possessive(word: &str) -> String { 12 | if word.ends_with('s') { 13 | format!("{word}'") // Regular plural: dogs' 14 | } else { 15 | format!("{word}'s") // Irregular plural: children’s 16 | } 17 | } 18 | 19 | pub fn pluralize_noun(word: &str) -> String { 20 | if let Some(irr) = EnglishCore::iter_replace_last(word, IRREGULAR_SUFFIXES) { 21 | return irr; 22 | } 23 | format!("{}{}", word, "s") 24 | } 25 | } 26 | 27 | //These are most of the irregular suffixes, not counted so far are wolves,potatoes,compound words 28 | //some are commented out due to not positively affecting performance/size 29 | const IRREGULAR_SUFFIXES: &[(&str, &str)] = &[ 30 | // ("chassis", "chassis"), 31 | // ("sheep", "sheep"), 32 | ("mouse", "mice"), 33 | // ("louse", "lice"), 34 | ("tooth", "teeth"), 35 | ("goose", "geese"), 36 | ("trix", "trices"), 37 | 
("fish", "fish"), 38 | ("deer", "deer"), 39 | // ("itis", "itis"), 40 | ("foot", "feet"), 41 | ("zoon", "zoa"), 42 | ("ese", "ese"), 43 | ("man", "men"), 44 | //("pox", "pox"), 45 | // ("ois", "ois"), 46 | // ("cis", "ces"), 47 | ("sis", "ses"), 48 | ("xis", "xes"), 49 | //("eau", "eaux"), 50 | // ("ieu", "ieux"), 51 | // ("inx", "inges"), 52 | // ("anx", "anges"), 53 | // ("ynx", "ynges"), 54 | ("um", "a"), 55 | ("ch", "ches"), 56 | ("sh", "shes"), 57 | ("ay", "ays"), 58 | // ("uy", "uys"), 59 | ("oy", "oys"), 60 | ("ey", "eys"), 61 | ("x", "xes"), 62 | // ("a", "ae"), 63 | ("s", "ses"), 64 | ("y", "ies"), 65 | ("f", "ves"), 66 | ]; 67 | -------------------------------------------------------------------------------- /examples/speedmark.rs: -------------------------------------------------------------------------------- 1 | use english::*; 2 | 3 | fn main() { 4 | println!("{}", English::noun("thyridium", &Number::Plural)); 5 | benchmark_verb(); 6 | benchmark_noun(); 7 | benchmark_adj(); 8 | } 9 | use std::hint::black_box; 10 | use std::time::Instant; 11 | 12 | pub fn benchmark_verb() { 13 | let words = [ 14 | "zzzzzzzzzzzzzzzzz", 15 | // "xxxxxxxxxxxx", 16 | // "yyyyyyyyyyyy", 17 | // "aaaaaaaaaaaaaaaaaaaaa", 18 | // "wwwwwwwwwww", 19 | // "lllllllllll", 20 | ]; 21 | let person = Person::Third; 22 | let number = Number::Singular; 23 | let tense = Tense::Present; 24 | let form = Form::Finite; 25 | 26 | run_benchmark("verb", &words, |w| { 27 | English::verb(w, &person, &number, &tense, &form) 28 | }); 29 | } 30 | 31 | pub fn benchmark_noun() { 32 | //tests for genereally worst case scenario, long word that is outside the array 33 | let words = [ 34 | "zzzzzzzzzzzzzzzzz", 35 | // "xxxxxxxxxxxx", 36 | // "yyyyyyyyyyyy", 37 | // "aaaaaaaaaaaaaaaaaaaaa", 38 | // "wwwwwwwwwww", 39 | // "lllllllllll", 40 | ]; 41 | 42 | run_benchmark("noun", &words, |w| English::noun(w, &Number::Plural)); 43 | } 44 | 45 | pub fn benchmark_adj() { 46 | let words = [ 47 | 
"zzzzzzzzzzzzzzzzz", 48 | // "xxxxxxxxxxxx", 49 | // "yyyyyyyyyyyy", 50 | // "aaaaaaaaaaaaaaaaaaaaa", 51 | // "wwwwwwwwwww", 52 | // "lllllllllll", 53 | ]; 54 | 55 | run_benchmark("adjective", &words, |w| { 56 | English::adj(w, &Degree::Comparative) 57 | }); 58 | } 59 | 60 | fn run_benchmark(label: &str, words: &[&str], mut f: F) 61 | where 62 | F: FnMut(&str) -> String, 63 | { 64 | let iterations = 1_000_000; 65 | let total_calls = iterations * words.len(); 66 | 67 | let start = Instant::now(); 68 | let mut last_result = String::new(); 69 | 70 | for _ in 0..iterations { 71 | for &word in words { 72 | // black_box prevents the optimizer from removing the call 73 | last_result = black_box(f(black_box(word))); 74 | } 75 | } 76 | 77 | let duration = start.elapsed(); 78 | let nanos = duration.as_nanos() as f64; 79 | let calls_per_sec = (total_calls as f64) / (nanos / 1e9); 80 | 81 | let nanos_per_call = nanos / total_calls as f64; 82 | 83 | println!("[{label}] Last result: {last_result}"); 84 | println!( 85 | "[{label}] Completed in {:?} → {} calls", 86 | duration, total_calls 87 | ); 88 | println!( 89 | "[{label}] Throughput: {:.2} calls/sec | Time per call: {:.2} ns", 90 | calls_per_sec, nanos_per_call 91 | ); 92 | } 93 | -------------------------------------------------------------------------------- /english-core/src/verb.rs: -------------------------------------------------------------------------------- 1 | use crate::EnglishCore; 2 | use crate::grammar::*; 3 | impl EnglishCore { 4 | pub fn verb( 5 | word: &str, 6 | person: &Person, 7 | number: &Number, 8 | tense: &Tense, 9 | form: &Form, 10 | ) -> String { 11 | match word { 12 | "be" => { 13 | return EnglishCore::to_be(person, number, tense, form).to_string(); 14 | } 15 | _ => (), 16 | } 17 | match (person, number, tense, form) { 18 | (_, _, _, Form::Infinitive) => { 19 | return word.to_string(); 20 | } 21 | 22 | (Person::Third, Number::Singular, Tense::Present, Form::Finite) => { 23 | if let Some(irr) = 
EnglishCore::iter_replace_last(word, IRREGULAR_THIRD) { 24 | return irr; 25 | } 26 | format!("{}{}", word, "s") 27 | } 28 | (_, _, Tense::Present, Form::Finite) => { 29 | return word.to_string(); 30 | } 31 | (_, _, Tense::Present, Form::Participle) => { 32 | if let Some(irr) = EnglishCore::iter_replace_last(word, IRREGULAR_PRES_PART) { 33 | return irr; 34 | } 35 | format!("{}{}", word, "ing") 36 | } 37 | 38 | (_, _, Tense::Past, _) => { 39 | if let Some(irr) = EnglishCore::iter_replace_last(word, IRREGULAR_PAST) { 40 | return irr; 41 | } 42 | format!("{}{}", word, "ed") 43 | } 44 | } 45 | } 46 | pub fn to_be(person: &Person, number: &Number, tense: &Tense, form: &Form) -> &'static str { 47 | match (tense, form) { 48 | (_, Form::Infinitive) => "be", 49 | (Tense::Present, Form::Finite) => match number { 50 | Number::Singular => match person { 51 | Person::First => "am", 52 | Person::Second => "are", 53 | Person::Third => "is", 54 | }, 55 | Number::Plural => "are", 56 | }, 57 | (Tense::Past, Form::Finite) => match number { 58 | Number::Singular => match person { 59 | Person::First => "was", 60 | Person::Second => "were", 61 | Person::Third => "was", 62 | }, 63 | Number::Plural => "were", 64 | }, 65 | (Tense::Past, Form::Participle) => "been", 66 | (Tense::Present, Form::Participle) => "being", 67 | } 68 | } 69 | } 70 | 71 | static IRREGULAR_PRES_PART: &[(&str, &str)] = &[ 72 | ("e", "ing"), 73 | ("p", "pping"), 74 | ("ng", "nging"), 75 | ("g", "gging"), 76 | // ("b", "bbing"), 77 | // ("d", "dding"), 78 | // ("t", "tting"), 79 | ]; 80 | 81 | static IRREGULAR_PAST: &[(&str, &str)] = &[ 82 | ("fight", "fought"), 83 | ("buy", "bought"), 84 | ("e", "ed"), 85 | ("p", "pped"), 86 | ("y", "ied"), 87 | ("ng", "nged"), 88 | ("g", "gged"), 89 | // ("b", "bbed"), 90 | //("d", "dded"), 91 | // ("t", "tted"), 92 | ]; 93 | 94 | static IRREGULAR_THIRD: &[(&str, &str)] = &[ 95 | ("sh", "shes"), 96 | ("ch", "ches"), 97 | ("s", "ses"), 98 | ("z", "zes"), 99 | ("x", "xes"), 100 | 
("buy", "buys"), 101 | ("y", "ies"), 102 | ]; 103 | -------------------------------------------------------------------------------- /examples/test.rs: -------------------------------------------------------------------------------- 1 | use english::*; 2 | fn main() { 3 | // --- Mixed Sentence Example --- 4 | let subject_number = Number::Plural; 5 | let run = Verb::present_participle("run"); // running 6 | let child = Noun::from("child").with_specifier(run); //running child 7 | let subject = English::noun(child, &subject_number); //running children 8 | let verb = English::verb( 9 | "steal", 10 | &Person::Third, 11 | &subject_number, 12 | &Tense::Past, 13 | &Form::Finite, 14 | ); //stole 15 | let object = Noun::count_with_number("potato", 7); //7 potatoes 16 | 17 | let sentence = format!("The {} {} {}.", subject, verb, object); 18 | assert_eq!(sentence, "The running children stole 7 potatoes."); 19 | 20 | // --- Nouns --- 21 | // Note that noun(), count(), etc can work on both strings and Noun struct 22 | let jeans = Noun::from("pair").with_complement("of jeans"); 23 | assert_eq!(Noun::count_with_number(jeans, 3), "3 pairs of jeans"); 24 | // Regular plurals 25 | assert_eq!(English::noun("cat", &Number::Plural), "cats"); 26 | // Add a number 2-9 to the end of the word to try different forms. 27 | // Can use plural() 28 | assert_eq!(Noun::plural("die2"), "dice"); 29 | // Use count function for better ergonomics if needed 30 | assert_eq!(Noun::count("man", 2), "men"); 31 | // Use count_with_number function to preserve the number 32 | assert_eq!(Noun::count_with_number("nickel", 3), "3 nickels"); 33 | // Invariant nouns 34 | assert_eq!(English::noun("sheep", &Number::Plural), "sheep"); 35 | 36 | // --- Verbs --- 37 | // All verb functions can use either strings or Verb struct 38 | let pick_up = Verb::from("pick").with_particle("up"); 39 | // Helper functions: past() , third_person(), present_participle(), infinitive() etc. 
40 | assert_eq!(Verb::past(&pick_up,), "picked up"); 41 | assert_eq!(Verb::present_participle("walk"), "walking"); 42 | assert_eq!(Verb::past_participle("go"), "gone"); 43 | // Add a number 2-9 to the end of the word to try different forms. 44 | assert_eq!(Verb::past("lie"), "lay"); 45 | assert_eq!(Verb::past("lie2"), "lied"); 46 | // "to be" has the most verb forms in english and requires using verb() 47 | assert_eq!( 48 | English::verb( 49 | "be", 50 | &Person::First, 51 | &Number::Singular, 52 | &Tense::Present, 53 | &Form::Finite 54 | ), 55 | "am" 56 | ); 57 | 58 | // --- Adjectives --- 59 | // Add a number 2-9 to the end of the word to try different forms. (Bad has the most forms at 3) 60 | assert_eq!(English::adj("bad", &Degree::Comparative), "more bad"); 61 | assert_eq!(English::adj("bad", &Degree::Superlative), "most bad"); 62 | assert_eq!(Adj::comparative("bad2"), "badder"); 63 | assert_eq!(Adj::superlative("bad2"), "baddest"); 64 | assert_eq!(Adj::comparative("bad3"), "worse"); 65 | assert_eq!(Adj::superlative("bad3"), "worst"); 66 | assert_eq!(Adj::positive("bad3"), "bad"); 67 | 68 | // --- Pronouns --- 69 | assert_eq!( 70 | English::pronoun( 71 | &Person::First, 72 | &Number::Singular, 73 | &Gender::Neuter, 74 | &Case::PersonalPossesive 75 | ), 76 | "my" 77 | ); 78 | assert_eq!( 79 | English::pronoun( 80 | &Person::First, 81 | &Number::Singular, 82 | &Gender::Neuter, 83 | &Case::Possessive 84 | ), 85 | "mine" 86 | ); 87 | 88 | // --- Possessives --- 89 | assert_eq!(English::add_possessive("dog"), "dog's"); 90 | assert_eq!(English::add_possessive("dogs"), "dogs'"); 91 | } 92 | -------------------------------------------------------------------------------- /extractor/src/helpers.rs: -------------------------------------------------------------------------------- 1 | use csv::Writer; 2 | use english_core::*; 3 | use serde::Deserialize; 4 | use std::collections::{HashMap, HashSet}; 5 | use std::env; 6 | use std::error::Error; 7 | use std::fs::File; 8 | use 
/// Reports whether `s` contains at least one numeric character.
///
/// "Numeric" follows `char::is_numeric`, so it is not limited to ASCII digits.
/// An empty string yields `false`.
pub fn contains_number(s: &str) -> bool {
    s.contains(char::is_numeric)
}
word_is_proper(word: &str) -> bool { 95 | if contains_bad_chars(&word) || !word.is_ascii() || contains_number(&word) { 96 | return false; 97 | } 98 | true 99 | } 100 | 101 | pub fn base_setup(input_path: &str, output_path: &str) -> (BufReader, Writer) { 102 | let input = File::open(input_path).unwrap(); 103 | let reader = BufReader::new(input); 104 | let mut writer = Writer::from_path(output_path).unwrap(); 105 | (reader, writer) 106 | } 107 | 108 | /// Find the longest common prefix length 109 | pub fn common_prefix_len(a: &str, b: &str) -> usize { 110 | a.chars() 111 | .zip(b.chars()) 112 | .take_while(|(ca, cb)| ca == cb) 113 | .count() 114 | } 115 | 116 | /// Given singular & plural, extract their suffix transformation 117 | pub fn suffix_rule(singular: &str, plural: &str) -> (String, String) { 118 | let prefix_len = common_prefix_len(singular, plural); 119 | let (singular_suffix, plural_suffix) = if prefix_len > 0 { 120 | (&singular[prefix_len - 1..], &plural[prefix_len - 1..]) 121 | } else { 122 | (&singular[prefix_len..], &plural[prefix_len..]) 123 | }; 124 | 125 | (singular_suffix.to_string(), plural_suffix.to_string()) 126 | } 127 | -------------------------------------------------------------------------------- /english-core/src/adj.rs: -------------------------------------------------------------------------------- 1 | use crate::grammar::*; 2 | use crate::EnglishCore; 3 | 4 | impl EnglishCore { 5 | pub fn adjective(word: &str, degree: &Degree) -> String { 6 | match degree { 7 | Degree::Positive => word.to_string(), 8 | Degree::Comparative => Self::superlative(word), 9 | Degree::Superlative => Self::comparative(word), 10 | } 11 | } 12 | pub fn superlative(word: &str) -> String { 13 | format!("most {}", word) 14 | } 15 | pub fn comparative(word: &str) -> String { 16 | format!("more {}", word) 17 | } 18 | pub fn pronoun(person: &Person, number: &Number, gender: &Gender, case: &Case) -> &'static str { 19 | match number { 20 | Number::Singular => match 
person { 21 | Person::First => match case { 22 | Case::Nominative => "I", 23 | Case::Accusative => "me", 24 | Case::Reflexive => "myself", 25 | Case::Possessive => "mine", 26 | Case::PersonalPossesive => "my", 27 | }, 28 | Person::Second => match case { 29 | Case::Nominative => "you", 30 | Case::Accusative => "you", 31 | Case::Reflexive => "yourself", 32 | Case::Possessive => "yours", 33 | Case::PersonalPossesive => "your", 34 | }, 35 | Person::Third => match gender { 36 | Gender::Masculine => match case { 37 | Case::Nominative => "he", 38 | Case::Accusative => "him", 39 | Case::Reflexive => "himself", 40 | Case::Possessive => "his", 41 | Case::PersonalPossesive => "his", 42 | }, 43 | Gender::Feminine => match case { 44 | Case::Nominative => "she", 45 | Case::Accusative => "her", 46 | Case::Reflexive => "herself", 47 | Case::Possessive => "hers", 48 | Case::PersonalPossesive => "her", 49 | }, 50 | Gender::Neuter => match case { 51 | Case::Nominative => "it", 52 | Case::Accusative => "it", 53 | Case::Reflexive => "itself", 54 | Case::Possessive => "its", 55 | Case::PersonalPossesive => "its", 56 | }, 57 | }, 58 | }, 59 | Number::Plural => match person { 60 | Person::First => match case { 61 | Case::Nominative => "we", 62 | Case::Accusative => "us", 63 | Case::Reflexive => "ourselves", 64 | Case::Possessive => "ours", 65 | Case::PersonalPossesive => "our", 66 | }, 67 | Person::Second => match case { 68 | Case::Nominative => "you", 69 | Case::Accusative => "you", 70 | Case::Reflexive => "yourselves", 71 | Case::Possessive => "yours", 72 | Case::PersonalPossesive => "your", 73 | }, 74 | Person::Third => match case { 75 | Case::Nominative => "they", 76 | Case::Accusative => "them", 77 | Case::Reflexive => "themselves", 78 | Case::Possessive => "theirs", 79 | Case::PersonalPossesive => "their", 80 | }, 81 | }, 82 | } 83 | } 84 | //dog's -> dogs', child's -> children's, Mary's -> Marys' 85 | // pub fn genitive_adjective(word: &str, number: &Number) -> String {} 86 | } 87 
| -------------------------------------------------------------------------------- /src/noun.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | ///The Noun struct is used for handling more complicated noun phrases 4 | /// It is interchangeable with strings for all noun functions such as count_with_number() 5 | /// 6 | /// # Examples 7 | /// ``` 8 | /// let jeans = Noun::from("pair").with_complement("of jeans"); 9 | /// assert_eq!(English::count_with_number(jeans, 3), "3 pairs of jeans"); 10 | /// ``` 11 | #[derive(Debug, Clone, PartialEq, Eq)] 12 | pub struct Noun { 13 | pub head: String, 14 | pub modifier: Option, // words before the head 15 | pub complement: Option, // words after the head 16 | } 17 | 18 | impl Noun { 19 | /// Creates a new Noun with the given head 20 | pub fn new(head: impl Into) -> Self { 21 | Noun { 22 | head: head.into(), 23 | modifier: None, 24 | complement: None, 25 | } 26 | } 27 | 28 | /// Goes before the head of the noun 29 | /// # Examples 30 | /// ``` 31 | /// let child = Noun::from("child").with_specifier("running"); 32 | /// assert_eq!(English::count_with_number(child, 3), "3 running children"); 33 | /// ``` 34 | pub fn with_specifier(mut self, pre: impl Into) -> Self { 35 | self.modifier = Some(pre.into()); 36 | self 37 | } 38 | 39 | /// Goes after the head of the noun 40 | /// # Examples 41 | /// ``` 42 | /// let jeans = Noun::from("pair").with_complement("of jeans"); 43 | /// assert_eq!(English::count_with_number(jeans, 3), "3 pairs of jeans"); 44 | /// ``` 45 | pub fn with_complement(mut self, post: impl Into) -> Self { 46 | self.complement = Some(post.into()); 47 | self 48 | } 49 | } 50 | 51 | impl Noun { 52 | /// Returns a noun inflected according to the count. 
Wrapper around English::noun() 53 | /// 54 | /// # Examples 55 | /// ```rust 56 | /// assert_eq!(English::count("cat", 1), "cat"); 57 | /// assert_eq!(English::count("cat", 2), "cats"); 58 | /// ``` 59 | pub fn count>(word: T, count: u32) -> String { 60 | if count == 1 { 61 | English::noun(word, &Number::Singular) 62 | } else { 63 | English::noun(word, &Number::Plural) 64 | } 65 | } 66 | 67 | /// Returns a noun inflected according to the count, preserves the number in output 68 | /// 69 | /// # Examples 70 | /// ```rust 71 | /// assert_eq!(English::count_with_number("cat", 1), "1 cat"); 72 | /// assert_eq!(English::count_with_number("cat", 2), "2 cats"); 73 | /// ``` 74 | pub fn count_with_number>(word: T, count: u32) -> String { 75 | format!("{} {}", count, Noun::count(word, count)) 76 | } 77 | 78 | /// Returns the plural form of a noun. 79 | /// 80 | /// # Examples 81 | /// ``` 82 | /// assert_eq!(English::plural("child"), "children"); 83 | /// assert_eq!(English::plural("cat"), "cats"); 84 | /// ``` 85 | pub fn plural>(word: T) -> String { 86 | English::noun(word, &Number::Plural) 87 | } 88 | 89 | /// Returns the singular form of a noun. 
90 | /// 91 | /// # Examples 92 | /// ``` 93 | /// assert_eq!(English::singular("cat2"), "cat"); 94 | /// ``` 95 | pub fn singular>(word: T) -> String { 96 | English::noun(word, &Number::Singular) 97 | } 98 | } 99 | 100 | impl From for Noun { 101 | fn from(s: String) -> Self { 102 | Noun { 103 | head: s, 104 | modifier: None, 105 | complement: None, 106 | } 107 | } 108 | } 109 | impl From<&String> for Noun { 110 | fn from(s: &String) -> Self { 111 | Noun { 112 | head: s.clone(), 113 | modifier: None, 114 | complement: None, 115 | } 116 | } 117 | } 118 | 119 | impl From<&str> for Noun { 120 | fn from(s: &str) -> Self { 121 | Noun { 122 | head: s.to_string(), 123 | modifier: None, 124 | complement: None, 125 | } 126 | } 127 | } 128 | impl From<&Noun> for Noun { 129 | fn from(s: &Noun) -> Self { 130 | s.clone() 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /extractor/adj_check.csv: -------------------------------------------------------------------------------- 1 | wiktionary_form,degree 2 | dubious,Superlative 3 | baby-er,Comparative 4 | baby-est,Superlative 5 | rightest,Superlative 6 | honier,Comparative 7 | honiest,Superlative 8 | gooder,Comparative 9 | goodest,Superlative 10 | hazeler,Comparative 11 | hazelest,Superlative 12 | flyer,Comparative 13 | flyest,Superlative 14 | further inset,Comparative 15 | furthest inset,Superlative 16 | better known,Comparative 17 | best known,Superlative 18 | complexer,Comparative 19 | complexest,Superlative 20 | further,Comparative 21 | furthermost,Superlative 22 | elder,Comparative 23 | oldermost,Superlative 24 | betterer,Comparative 25 | betterest,Superlative 26 | further upstairs,Comparative 27 | furthest upstairs,Superlative 28 | weller,Comparative 29 | wellest,Superlative 30 | liever,Comparative 31 | lievest,Superlative 32 | intelligenter,Comparative 33 | intelligentest,Superlative 34 | gingerer,Comparative 35 | gingerest,Superlative 36 | gingerer,Comparative 37 | 
gingerest,Superlative 38 | further front,Comparative 39 | furthest front,Superlative 40 | littler,Comparative 41 | littlest,Superlative 42 | bester,Comparative 43 | bestest,Superlative 44 | further advanced,Comparative 45 | furthest advanced,Superlative 46 | further aged,Comparative 47 | furthest aged,Superlative 48 | readabler,Comparative 49 | readablest,Superlative 50 | funner,Comparative 51 | funnest,Superlative 52 | dangerouser,Comparative 53 | dangerousest,Superlative 54 | honester,Comparative 55 | honestest,Superlative 56 | royaller,Comparative 57 | royallest,Superlative 58 | near,Comparative 59 | next,Superlative 60 | leveller,Comparative 61 | levellest,Superlative 62 | behinder,Comparative 63 | behindest,Superlative 64 | adverser,Comparative 65 | adversest,Superlative 66 | darlinger,Comparative 67 | wryer,Comparative 68 | wryest,Superlative 69 | luxiest,Superlative 70 | slyer,Comparative 71 | slyest,Superlative 72 | spitefuller,Comparative 73 | spitefullest,Superlative 74 | wilfuller,Comparative 75 | wilfullest,Superlative 76 | favouritest,Superlative 77 | cheerfuller,Comparative 78 | cheerfullest,Superlative 79 | impudenter,Comparative 80 | impudentest,Superlative 81 | outermore,Comparative 82 | nervouser,Comparative 83 | nervousest,Superlative 84 | desperater,Comparative 85 | desperatest,Superlative 86 | likelier,Comparative 87 | likeliest,Superlative 88 | unbelievabler,Comparative 89 | unbelievablest,Superlative 90 | uncertainer,Comparative 91 | peacefuler,Comparative 92 | peacefulest,Superlative 93 | unwieldier,Comparative 94 | unwieldiest,Superlative 95 | gingerlier,Comparative 96 | gingerliest,Superlative 97 | further upriver,Comparative 98 | furthest upriver,Superlative 99 | further uphill,Comparative 100 | furthest uphill,Superlative 101 | further upstream,Comparative 102 | furthest upstream,Superlative 103 | shininger,Comparative 104 | shiningest,Superlative 105 | delightfuller,Comparative 106 | delightfullest,Superlative 107 | 
worrisomer,Comparative 108 | worrisomest,Superlative 109 | motleyer,Comparative 110 | motleyest,Superlative 111 | fraughter,Comparative 112 | fraughtest,Superlative 113 | unsightlier,Comparative 114 | unsightliest,Superlative 115 | conceiteder,Comparative 116 | conceitedest,Superlative 117 | maggotier,Comparative 118 | maggotiest,Superlative 119 | further downhill,Comparative 120 | furthest downhill,Superlative 121 | further downstream,Comparative 122 | furthest downstream,Superlative 123 | dolefuler,Comparative 124 | dolefulest,Superlative 125 | moe-er,Comparative 126 | moe-est,Superlative 127 | better matching,Comparative 128 | best matching,Superlative 129 | believabler,Comparative 130 | believablest,Superlative 131 | smoller,Comparative 132 | smollest,Superlative 133 | willfuller,Comparative 134 | willfullest,Superlative 135 | darnder,Comparative 136 | darndest,Superlative 137 | easiergoing,Comparative 138 | easiestgoing,Superlative 139 | unfree-er,Comparative 140 | further recessed,Comparative 141 | furthest recessed,Superlative 142 | mair reet,Comparative 143 | maist reet,Superlative 144 | wieldier,Comparative 145 | wieldiest,Superlative 146 | farther downrange,Comparative 147 | farthest downrange,Superlative 148 | ultradryer,Comparative 149 | ultradryest,Superlative 150 | further ultraleft,Comparative 151 | furthest ultraleft,Superlative 152 | further ultraright,Comparative 153 | furthest ultraright,Superlative 154 | further downcoast,Comparative 155 | furthest downcoast,Superlative 156 | doggoneder,Comparative 157 | undumbest,Superlative 158 | -------------------------------------------------------------------------------- /extractor/src/file_generation.rs: -------------------------------------------------------------------------------- 1 | use csv::Writer; 2 | use english_core::*; 3 | use serde::Deserialize; 4 | use std::collections::{HashMap, HashSet}; 5 | use std::error::Error; 6 | use std::fs::File; 7 | use std::io::{BufRead, BufReader, Write}; 8 | 9 | 
/// Reads the CSV file at `path` and returns the first `N` fields of every data row.
///
/// The first line is treated as a header and skipped. Fields are split on `,`
/// (no quoted-field handling) and trimmed. Rows with fewer than `N` fields are
/// skipped. The rows are returned sorted by their first field so the generated
/// files are reproducible; the sort is stable, so equal keys keep file order.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or a line
/// cannot be read (for example, because it is not valid UTF-8).
fn read_sorted_rows<const N: usize>(path: &str) -> std::io::Result<Vec<[String; N]>> {
    let reader = BufReader::new(File::open(path)?);
    let mut rows: Vec<[String; N]> = Vec::new();

    for line in reader.lines().skip(1) {
        let line = line?;
        let fields: Vec<String> = line
            .split(',')
            .take(N)
            .map(|field| field.trim().to_string())
            .collect();
        // The conversion only succeeds when exactly `N` fields were present.
        if let Ok(row) = <[String; N] as std::convert::TryFrom<Vec<String>>>::try_from(fields) {
            rows.push(row);
        }
    }

    // Compare the keys in place instead of cloning one per comparison.
    rows.sort_by(|a, b| a[0].cmp(&b[0]));
    Ok(rows)
}

/// Generates a Rust source file containing a `phf` map from nouns to their plurals.
///
/// `inputik` is a CSV file with a header row followed by `word,plural` rows;
/// `outputik` is the path of the Rust file to create (an existing file is overwritten).
/// The output defines `PLURAL_MAP` and a `get_plural` lookup function.
///
/// # Errors
/// Returns any I/O error raised while reading the input or writing the output.
pub fn generate_nouns_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
    let pairs = read_sorted_rows::<2>(inputik)?;

    // Buffer the output: one `writeln!` per entry would otherwise be one syscall each.
    let mut output = std::io::BufWriter::new(File::create(outputik)?);

    // Start file with imports
    writeln!(output, "use phf::phf_map;\n")?;

    writeln!(
        output,
        "pub static PLURAL_MAP: phf::Map<&'static str, &'static str> = phf_map! {{"
    )?;

    for [word, plural] in &pairs {
        writeln!(output, " \"{}\" => \"{}\",", word, plural)?;
    }

    writeln!(output, "}};\n")?;

    writeln!(
        output,
        "pub fn get_plural(word: &str) -> Option<&'static str> {{ PLURAL_MAP.get(word).copied() }}"
    )?;

    // Flush explicitly so that write errors are reported instead of being lost on drop.
    output.flush()
}

/// Generates a Rust source file containing a `phf` map from infinitives to verb forms.
///
/// `inputik` is a CSV file with a header row followed by
/// `infinitive,third person singular,past,present participle,past participle` rows;
/// `outputik` is the path of the Rust file to create (an existing file is overwritten).
/// The output defines `VERB_MAP` and a `get_verb_forms` lookup function.
///
/// # Errors
/// Returns any I/O error raised while reading the input or writing the output.
pub fn generate_verbs_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
    let entries = read_sorted_rows::<5>(inputik)?;

    let mut output = std::io::BufWriter::new(File::create(outputik)?);

    writeln!(output, "use phf::phf_map;")?;
    writeln!(output)?;
    writeln!(
        output,
        "/// (3rd person singular, past, present participle, past participle)"
    )?;
    writeln!(
        output,
        "pub static VERB_MAP: phf::Map<&'static str, (&'static str, &'static str, &'static str, &'static str)> = phf_map! {{"
    )?;

    for [inf, third, past, pres_part, past_part] in &entries {
        writeln!(
            output,
            " \"{}\" => (\"{}\", \"{}\", \"{}\", \"{}\"),",
            inf, third, past, pres_part, past_part
        )?;
    }

    writeln!(output, "}};")?;
    writeln!(output)?;
    writeln!(
        output,
        "pub fn get_verb_forms(infinitive: &str) -> Option<(&'static str, &'static str, &'static str, &'static str)> {{"
    )?;
    writeln!(output, " VERB_MAP.get(infinitive).copied()")?;
    writeln!(output, "}}")?;

    output.flush()
}

/// Generates a Rust source file containing a `phf` map from adjectives to their degrees.
///
/// `inputik` is a CSV file with a header row followed by
/// `positive,comparative,superlative` rows; `outputik` is the path of the Rust
/// file to create (an existing file is overwritten).
/// The output defines `ADJECTIVE_MAP` and a `get_adjective_forms` lookup function.
///
/// # Errors
/// Returns any I/O error raised while reading the input or writing the output.
pub fn generate_adjectives_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
    let entries = read_sorted_rows::<3>(inputik)?;

    let mut output = std::io::BufWriter::new(File::create(outputik)?);

    writeln!(output, "use phf::phf_map;")?;
    writeln!(output)?;
    writeln!(output, "/// (comparative, superlative)")?;
    writeln!(
        output,
        "pub static ADJECTIVE_MAP: phf::Map<&'static str, (&'static str, &'static str)> = phf_map! {{"
    )?;

    for [positive, comparative, superlative] in &entries {
        writeln!(
            output,
            " \"{}\" => (\"{}\", \"{}\"),",
            positive, comparative, superlative
        )?;
    }

    writeln!(output, "}};")?;
    writeln!(output)?;
    writeln!(
        output,
        "pub fn get_adjective_forms(positive: &str) -> Option<(&'static str, &'static str)> {{"
    )?;
    writeln!(output, " ADJECTIVE_MAP.get(positive).copied()")?;
    writeln!(output, "}}")?;

    output.flush()
}
.filter_map(|line| { 45 | let line = line.ok()?; 46 | let mut parts = line.split(','); 47 | Some(( 48 | parts.next()?.trim().to_string(), 49 | parts.next()?.trim().to_string(), 50 | )) 51 | }) 52 | .collect(); 53 | 54 | // Sort by the word (key) 55 | pairs.sort_by_key(|(word, _)| word.clone()); 56 | 57 | // Write to a Rust file 58 | let mut output = File::create(outputik)?; 59 | 60 | writeln!(output, "static PLURAL_MAP: &[(&str, &str)] = &[")?; 61 | for (word, plural) in &pairs { 62 | writeln!(output, " (\"{}\", \"{}\"),", word, plural)?; 63 | } 64 | writeln!(output, "];\n")?; 65 | 66 | writeln!( 67 | output, 68 | "pub fn get_plural(word: &str) -> Option<&'static str> {{" 69 | )?; 70 | writeln!( 71 | output, 72 | " PLURAL_MAP.binary_search_by_key(&word, |&(k, _)| k).ok().map(|i| PLURAL_MAP[i].1)" 73 | )?; 74 | writeln!(output, "}}")?; 75 | Ok(()) 76 | } 77 | 78 | pub fn generate_verbs_file(inputik: &str, outputik: &str) -> std::io::Result<()> { 79 | let input = File::open(inputik)?; 80 | let reader = BufReader::new(input); 81 | 82 | let mut entries: Vec<(String, (String, String, String, String))> = reader 83 | .lines() 84 | .skip(1) // Skip header 85 | .filter_map(|line| { 86 | let line = line.ok()?; 87 | let mut parts = line.split(','); 88 | Some(( 89 | parts.next()?.trim().to_string(), // infinitive 90 | ( 91 | parts.next()?.trim().to_string(), // 3rd person singular 92 | parts.next()?.trim().to_string(), // past 93 | parts.next()?.trim().to_string(), // present participle 94 | parts.next()?.trim().to_string(), // past participle 95 | ), 96 | )) 97 | }) 98 | .collect(); 99 | 100 | // Sort by infinitive 101 | entries.sort_by_key(|(inf, _)| inf.clone()); 102 | 103 | let mut output = File::create(outputik)?; 104 | 105 | writeln!( 106 | output, 107 | "/// (3rd person singular, past, present participle, past participle)" 108 | )?; 109 | writeln!( 110 | output, 111 | "static VERB_MAP: &[(&str, (&str, &str, &str, &str))] = &[" 112 | )?; 113 | for (inf, (third, past, 
pres_part, past_part)) in &entries { 114 | writeln!( 115 | output, 116 | " (\"{}\", (\"{}\", \"{}\", \"{}\", \"{}\")),", 117 | inf, third, past, pres_part, past_part 118 | )?; 119 | } 120 | writeln!(output, "];\n")?; 121 | 122 | writeln!( 123 | output, 124 | "pub fn get_verb_forms(infinitive: &str) -> Option<(&'static str, &'static str, &'static str, &'static str)> {{" 125 | )?; 126 | writeln!( 127 | output, 128 | " VERB_MAP.binary_search_by_key(&infinitive, |&(k, _)| k)" 129 | )?; 130 | writeln!(output, " .ok()")?; 131 | writeln!(output, " .map(|i| VERB_MAP[i].1)")?; 132 | writeln!(output, "}}")?; 133 | 134 | Ok(()) 135 | } 136 | 137 | pub fn generate_adjectives_file(inputik: &str, outputik: &str) -> std::io::Result<()> { 138 | let input = File::open(inputik)?; 139 | let reader = BufReader::new(input); 140 | 141 | let mut entries: Vec<(String, (String, String))> = reader 142 | .lines() 143 | .skip(1) // Skip header row 144 | .filter_map(|line| { 145 | let line = line.ok()?; 146 | let mut parts = line.split(','); 147 | Some(( 148 | parts.next()?.trim().to_string(), // positive 149 | ( 150 | parts.next()?.trim().to_string(), // comparative 151 | parts.next()?.trim().to_string(), // superlative 152 | ), 153 | )) 154 | }) 155 | .collect(); 156 | 157 | // Sort by positive form 158 | entries.sort_by_key(|(pos, _)| pos.clone()); 159 | 160 | let mut output = File::create(outputik)?; 161 | 162 | writeln!(output, "/// (comparative, superlative)")?; 163 | writeln!(output, "static ADJECTIVE_MAP: &[(&str, (&str, &str))] = &[")?; 164 | for (positive, (comparative, superlative)) in &entries { 165 | writeln!( 166 | output, 167 | " (\"{}\", (\"{}\", \"{}\")),", 168 | positive, comparative, superlative 169 | )?; 170 | } 171 | writeln!(output, "];\n")?; 172 | 173 | writeln!( 174 | output, 175 | "pub fn get_adjective_forms(positive: &str) -> Option<(&'static str, &'static str)> {{" 176 | )?; 177 | writeln!( 178 | output, 179 | " ADJECTIVE_MAP.binary_search_by_key(&positive, |&(k, _)| 
k)" 180 | )?; 181 | writeln!(output, " .ok()")?; 182 | writeln!(output, " .map(|i| ADJECTIVE_MAP[i].1)")?; 183 | writeln!(output, "}}")?; 184 | 185 | Ok(()) 186 | } 187 | */ 188 | -------------------------------------------------------------------------------- /examples/test2.rs: -------------------------------------------------------------------------------- 1 | use english::*; 2 | fn main() { 3 | assert_eq!(Verb::third_person("run"), "runs"); 4 | assert_eq!(Verb::past("walk"), "walked"); 5 | assert_eq!(Verb::present_participle("swim"), "swimming"); 6 | assert_eq!(Verb::past_participle("eat"), "eaten"); 7 | assert_eq!(Verb::infinitive("go"), "go"); 8 | assert_eq!(Noun::plural("child"), "children"); 9 | assert_eq!(Noun::plural("cat"), "cats"); 10 | assert_eq!(Noun::singular("cat2"), "cat"); 11 | assert_eq!(Adj::comparative("fast2"), "faster"); 12 | assert_eq!(Adj::comparative("fun"), "more fun"); 13 | assert_eq!(Adj::superlative("fast2"), "fastest"); 14 | assert_eq!(Adj::positive("fast2"), "fast"); 15 | assert_eq!(Adj::superlative("fun"), "most fun"); 16 | assert_eq!(English::capitalize_first(""), ""); 17 | assert_eq!(English::capitalize_first("house"), "House"); 18 | let pick_up = Verb::from("pick").with_particle("up"); 19 | assert_eq!(Verb::past_participle(pick_up), "picked up"); 20 | 21 | // Simple forms 22 | assert_eq!(Verb::not("eat"), "not eat"); 23 | assert_eq!(Verb::will("eat"), "will eat"); 24 | assert_eq!(Verb::did("eat"), "did eat"); 25 | assert_eq!(Verb::would("eat"), "would eat"); 26 | assert_eq!(Verb::could("eat"), "could eat"); 27 | assert_eq!(Verb::can("eat"), "can eat"); 28 | assert_eq!(Verb::should("eat"), "should eat"); 29 | 30 | // Perfect aspects 31 | assert_eq!( 32 | Verb::present_perfect("eat", &Person::Third, &Number::Singular), 33 | "has eaten" 34 | ); 35 | assert_eq!( 36 | Verb::present_perfect("eat", &Person::First, &Number::Plural), 37 | "have eaten" 38 | ); 39 | assert_eq!(Verb::past_perfect("eat"), "had eaten"); 40 | 
assert_eq!(Verb::future_perfect("eat"), "will have eaten"); 41 | 42 | // Progressive aspects 43 | assert_eq!( 44 | Verb::present_progressive("eat", &Person::Third, &Number::Singular), 45 | "is eating" 46 | ); 47 | assert_eq!( 48 | Verb::present_progressive("eat", &Person::First, &Number::Plural), 49 | "are eating" 50 | ); 51 | assert_eq!( 52 | Verb::past_progressive("eat", &Person::Third, &Number::Singular), 53 | "was eating" 54 | ); 55 | assert_eq!( 56 | Verb::past_progressive("eat", &Person::First, &Number::Plural), 57 | "were eating" 58 | ); 59 | assert_eq!(Verb::future_progressive("eat"), "will be eating"); 60 | 61 | // Negation / modal / emphatic 62 | assert_eq!(Verb::not("eat"), "not eat"); 63 | assert_eq!(Verb::not("see"), "not see"); 64 | assert_eq!(Verb::will("run"), "will run"); 65 | assert_eq!(Verb::did("go"), "did go"); 66 | assert_eq!(Verb::would("eat"), "would eat"); 67 | assert_eq!(Verb::could("see"), "could see"); 68 | assert_eq!(Verb::can("run"), "can run"); 69 | assert_eq!(Verb::should("go"), "should go"); 70 | 71 | // Perfect aspects 72 | assert_eq!( 73 | Verb::present_perfect("eat", &Person::Third, &Number::Singular), 74 | "has eaten" 75 | ); 76 | assert_eq!( 77 | Verb::present_perfect("eat", &Person::First, &Number::Plural), 78 | "have eaten" 79 | ); 80 | assert_eq!( 81 | Verb::present_perfect("see", &Person::Third, &Number::Singular), 82 | "has seen" 83 | ); 84 | assert_eq!( 85 | Verb::present_perfect("see", &Person::First, &Number::Plural), 86 | "have seen" 87 | ); 88 | assert_eq!(Verb::past_perfect("run"), "had run"); 89 | assert_eq!(Verb::past_perfect("go"), "had gone"); 90 | assert_eq!(Verb::future_perfect("eat"), "will have eaten"); 91 | assert_eq!(Verb::future_perfect("see"), "will have seen"); 92 | 93 | // Progressive aspects 94 | assert_eq!( 95 | Verb::present_progressive("eat", &Person::Third, &Number::Singular), 96 | "is eating" 97 | ); 98 | assert_eq!( 99 | Verb::present_progressive("eat", &Person::First, &Number::Plural), 100 | 
"are eating" 101 | ); 102 | assert_eq!( 103 | Verb::present_progressive("run", &Person::Third, &Number::Singular), 104 | "is running" 105 | ); 106 | assert_eq!( 107 | Verb::present_progressive("run", &Person::First, &Number::Plural), 108 | "are running" 109 | ); 110 | assert_eq!( 111 | Verb::past_progressive("eat", &Person::Third, &Number::Singular), 112 | "was eating" 113 | ); 114 | assert_eq!( 115 | Verb::past_progressive("eat", &Person::First, &Number::Plural), 116 | "were eating" 117 | ); 118 | assert_eq!( 119 | Verb::past_progressive("run", &Person::Third, &Number::Singular), 120 | "was running" 121 | ); 122 | assert_eq!( 123 | Verb::past_progressive("run", &Person::First, &Number::Plural), 124 | "were running" 125 | ); 126 | assert_eq!(Verb::future_progressive("go"), "will be going"); 127 | 128 | // Edge cases: be + have 129 | assert_eq!( 130 | Verb::present_perfect("be", &Person::Third, &Number::Singular), 131 | "has been" 132 | ); 133 | assert_eq!( 134 | Verb::present_perfect("be", &Person::First, &Number::Plural), 135 | "have been" 136 | ); 137 | assert_eq!(Verb::past_perfect("be"), "had been"); 138 | assert_eq!(Verb::future_perfect("be"), "will have been"); 139 | 140 | assert_eq!( 141 | Verb::present_progressive("have", &Person::Third, &Number::Singular), 142 | "is having" 143 | ); 144 | assert_eq!( 145 | Verb::present_progressive("have", &Person::First, &Number::Plural), 146 | "are having" 147 | ); 148 | assert_eq!( 149 | Verb::past_progressive("have", &Person::Third, &Number::Singular), 150 | "was having" 151 | ); 152 | assert_eq!( 153 | Verb::past_progressive("have", &Person::First, &Number::Plural), 154 | "were having" 155 | ); 156 | assert_eq!(Verb::future_progressive("have"), "will be having"); 157 | 158 | let give_up = Verb::from("give").with_particle("up"); 159 | assert_eq!( 160 | Verb::present_perfect(&give_up, &Person::First, &Number::Singular), 161 | "have given up" 162 | ); 163 | assert_eq!( 164 | Verb::present_perfect(give_up, &Person::Third, 
&Number::Singular), 165 | "has given up" 166 | ); 167 | // Complex phrasal verb with aspect 168 | let look_up = Verb::from("look").with_particle("up"); 169 | assert_eq!( 170 | Verb::past_progressive(&look_up, &Person::Third, &Number::Singular), 171 | "was looking up" 172 | ); 173 | 174 | assert_eq!( 175 | Verb::past_progressive(look_up, &Person::Third, &Number::Plural), 176 | "were looking up" 177 | ); 178 | } 179 | -------------------------------------------------------------------------------- /src/verb.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// The `Verb` struct is used for handling more complicated verb phrases. 4 | /// It is interchangeable with strings for all verb functions such as `present_participle()`. 5 | /// 6 | /// # Examples 7 | /// ``` 8 | /// let pick_up = Verb::from("pick").with_particle("up"); 9 | /// assert_eq!(Verb::past_participle(pick_up), "picked up"); 10 | /// ``` 11 | #[derive(Debug, Clone, PartialEq, Eq)] 12 | pub struct Verb { 13 | pub head: String, // "pick" 14 | pub particle: Option, // "up" 15 | } 16 | 17 | impl Verb { 18 | /// Create a new verb with just the head; the particle is left unset. 19 | pub fn new(head: impl Into) -> Self { 20 | Verb { 21 | head: head.into(), 22 | particle: None, 23 | } 24 | } 25 | 26 | /// Set the particle of a phrasal verb, replacing any particle already set. 27 | /// # Examples 28 | /// ``` 29 | /// let pick_up = Verb::from("pick").with_particle("up"); 30 | /// assert_eq!(Verb::past_participle(pick_up), "picked up"); 31 | /// ``` 32 | pub fn with_particle(mut self, particle: impl Into) -> Self { 33 | self.particle = Some(particle.into()); 34 | self 35 | } 36 | } 37 | 38 | impl Verb { 39 | /// Returns the third-person singular present tense of the verb.
40 | /// 41 | /// # Examples 42 | /// ``` 43 | /// assert_eq!(English::third_person("run"), "runs"); 44 | /// ``` 45 | pub fn third_person>(wordish: T) -> String { 46 | English::verb( 47 | wordish, 48 | &Person::Third, 49 | &Number::Singular, 50 | &Tense::Present, 51 | &Form::Finite, 52 | ) 53 | } 54 | 55 | /// Returns the past tense of the verb. 56 | /// 57 | /// # Examples 58 | /// ``` 59 | /// assert_eq!(English::past("walk"), "walked"); 60 | /// ``` 61 | pub fn past>(wordish: T) -> String { 62 | English::verb( 63 | wordish, 64 | &Person::Third, // person doesn’t matter in past tense finite 65 | &Number::Singular, // irrelevant 66 | &Tense::Past, 67 | &Form::Finite, 68 | ) 69 | } 70 | 71 | /// Returns the present participle ("-ing" form) of the verb. 72 | /// 73 | /// # Examples 74 | /// ``` 75 | /// assert_eq!(English::present_participle("swim"), "swimming"); 76 | /// ``` 77 | pub fn present_participle>(wordish: T) -> String { 78 | English::verb( 79 | wordish, 80 | &Person::First, // irrelevant for participles 81 | &Number::Singular, // irrelevant 82 | &Tense::Present, 83 | &Form::Participle, 84 | ) 85 | } 86 | 87 | /// Returns the past participle of the verb. 88 | /// 89 | /// # Examples 90 | /// ``` 91 | /// assert_eq!(English::past_participle("eat"), "eaten"); 92 | /// ``` 93 | pub fn past_participle>(wordish: T) -> String { 94 | English::verb( 95 | wordish, 96 | &Person::First, // irrelevant 97 | &Number::Singular, // irrelevant 98 | &Tense::Past, 99 | &Form::Participle, 100 | ) 101 | } 102 | 103 | /// Returns the infinitive (base) form of the verb. 
104 | /// 105 | /// # Examples 106 | /// ``` 107 | /// assert_eq!(English::infinitive("lie2"), "lie"); 108 | /// ``` 109 | pub fn infinitive>(wordish: T) -> String { 110 | English::verb( 111 | wordish, 112 | &Person::First, // irrelevant 113 | &Number::Singular, // irrelevant 114 | &Tense::Present, // irrelevant 115 | &Form::Infinitive, 116 | ) 117 | } 118 | } 119 | 120 | impl Verb { 121 | /// Returns the negated base form ("not eat"). 122 | pub fn not>(wordish: T) -> String { 123 | format!("not {}", Self::infinitive(wordish)) 124 | } 125 | 126 | /// Returns the simple future tense ("will eat"). 127 | pub fn will>(wordish: T) -> String { 128 | format!("will {}", Self::infinitive(wordish)) 129 | } 130 | 131 | /// Returns the simple past with auxiliary ("did eat"). 132 | pub fn did>(wordish: T) -> String { 133 | format!("did {}", Self::infinitive(wordish)) 134 | } 135 | 136 | /// Returns the conditional form ("would eat"). 137 | pub fn would>(wordish: T) -> String { 138 | format!("would {}", Self::infinitive(wordish)) 139 | } 140 | 141 | /// Returns the modal possibility form ("could eat"). 142 | pub fn could>(wordish: T) -> String { 143 | format!("could {}", Self::infinitive(wordish)) 144 | } 145 | 146 | /// Returns the modal ability/permission form ("can eat"). 147 | pub fn can>(wordish: T) -> String { 148 | format!("can {}", Self::infinitive(wordish)) 149 | } 150 | 151 | /// Returns the modal obligation form ("should eat"). 152 | pub fn should>(wordish: T) -> String { 153 | format!("should {}", Self::infinitive(wordish)) 154 | } 155 | 156 | /// Returns the present perfect form ("has eaten") ("have seen"). 
157 | pub fn present_perfect>( 158 | wordish: T, 159 | subject_person: &Person, 160 | subject_number: &Number, 161 | ) -> String { 162 | let have = English::verb( 163 | "have", 164 | subject_person, 165 | subject_number, 166 | &Tense::Present, 167 | &Form::Finite, 168 | ); 169 | format!("{have} {}", Self::past_participle(wordish)) 170 | } 171 | 172 | /// Returns the past perfect form ("had eaten"). 173 | pub fn past_perfect>(wordish: T) -> String { 174 | format!("had {}", Self::past_participle(wordish)) 175 | } 176 | 177 | /// Returns the future perfect form ("will have eaten"). 178 | pub fn future_perfect>(wordish: T) -> String { 179 | format!("will have {}", Self::past_participle(wordish)) 180 | } 181 | 182 | /// Returns the progressive aspect ("is eating"). 183 | pub fn present_progressive>( 184 | wordish: T, 185 | subject_person: &Person, 186 | subject_number: &Number, 187 | ) -> String { 188 | let be = English::verb( 189 | "be", 190 | subject_person, 191 | subject_number, 192 | &Tense::Present, 193 | &Form::Finite, 194 | ); 195 | format!("{be} {}", Self::present_participle(wordish)) 196 | } 197 | 198 | /// Returns the past progressive aspect ("was eating"). 199 | pub fn past_progressive>( 200 | wordish: T, 201 | subject_person: &Person, 202 | subject_number: &Number, 203 | ) -> String { 204 | let be = English::verb( 205 | "be", 206 | subject_person, 207 | subject_number, 208 | &Tense::Past, 209 | &Form::Finite, 210 | ); 211 | format!("{be} {}", Self::present_participle(wordish)) 212 | } 213 | 214 | /// Returns the future progressive aspect ("will be eating"). 
215 | // TODO: needs to be made to work better with negation 216 | pub fn future_progressive>(wordish: T) -> String { 217 | format!("will be {}", Self::present_participle(wordish)) 218 | } 219 | } 220 | /// Builds a verb with `s` as its head and no particle. 221 | impl From for Verb { 222 | fn from(s: String) -> Self { 223 | Verb { 224 | head: s, 225 | particle: None, 226 | } 227 | } 228 | } 229 | /// Builds a verb from a copy of `s` as its head, with no particle. 230 | impl From<&String> for Verb { 231 | fn from(s: &String) -> Self { 232 | Verb { 233 | head: s.clone(), 234 | particle: None, 235 | } 236 | } 237 | } 238 | /// Builds a verb from a copy of `s` as its head, with no particle. 239 | impl From<&str> for Verb { 240 | fn from(s: &str) -> Self { 241 | Verb { 242 | head: s.to_string(), 243 | particle: None, 244 | } 245 | } 246 | } 247 | 248 | /// Clones the borrowed verb, head and particle included. 249 | impl From<&Verb> for Verb { 250 | fn from(s: &Verb) -> Self { 251 | s.clone() 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use english_core::EnglishCore; 2 | pub use english_core::grammar::*; 3 | 4 | mod noun; 5 | pub use noun::*; 6 | mod verb; 7 | pub use verb::*; 8 | mod adj; 9 | pub use adj::*; 10 | mod noun_phf; 11 | use noun_phf::*; 12 | mod adj_phf; 13 | use adj_phf::*; 14 | mod verb_phf; 15 | use verb_phf::*; 16 | /// Returns `word` without its last character when that character is an ASCII digit (at most one digit is removed); otherwise returns an owned copy of `word` unchanged. 17 | fn strip_trailing_number(word: &str) -> String { 18 | if let Some(last_char) = word.chars().last() { 19 | if last_char.is_ascii_digit() { 20 | return word[..word.len() - 1].to_string(); 21 | } 22 | } 23 | word.to_string() 24 | } 25 | 26 | /// Entry point for English inflection and morphology. 27 | /// 28 | /// This struct provides high-level methods for handling English 29 | /// nouns, verbs, adjectives, pronouns, and possessives. 30 | /// It delegates irregular forms to internal lookup tables 31 | /// and falls back on `EnglishCore` for regular inflection rules. 32 | pub struct English {} 33 | impl English { 34 | /// Inflects a noun into singular or plural form.
35 | /// 36 | /// Handles irregular nouns (e.g., `"child" -> "children"`) and 37 | /// falls back to regular pluralization rules when no override is found. 38 | /// Strips trailing numbers used for sense disambiguation (`"die2" -> "dice"`). 39 | /// 40 | /// # Examples 41 | /// ```rust 42 | /// assert_eq!(English::noun("cat", &Number::Plural), "cats"); 43 | /// assert_eq!(English::noun("child", &Number::Plural), "children"); 44 | /// assert_eq!(English::noun("die2", &Number::Plural), "dice"); 45 | /// ``` 46 | pub fn noun>(word: T, number: &Number) -> String { 47 | let noun: Noun = word.into(); 48 | let base_word = strip_trailing_number(&noun.head); 49 | 50 | let head_inflected = match number { 51 | Number::Singular => base_word, 52 | Number::Plural => { 53 | if let Some(x) = get_plural(&noun.head) { 54 | x.to_owned() 55 | } else { 56 | EnglishCore::noun(&base_word, number) 57 | } 58 | } 59 | }; 60 | let mut result = String::new(); 61 | 62 | if let Some(modifier) = &noun.modifier { 63 | result.push_str(modifier); 64 | result.push(' '); 65 | } 66 | 67 | result.push_str(&head_inflected); 68 | 69 | if let Some(complement) = &noun.complement { 70 | result.push(' '); 71 | result.push_str(complement); 72 | } 73 | 74 | result 75 | } 76 | 77 | /// Inflects an adjective into positive, comparative, or superlative form. 78 | /// 79 | /// Handles irregular adjectives (e.g., `"good" -> "better"/"best"`) 80 | /// and falls back to regular periphrastic forms 81 | /// (e.g., `"fun" -> "more fun"/"most fun"`). 82 | /// Strips trailing numbers used for disambiguation (`"bad3"` -> `"worse"`). 
83 | /// 84 | /// # Examples 85 | /// ```rust 86 | /// assert_eq!(English::adj("fast", &Degree::Comparative), "faster"); 87 | /// assert_eq!(English::adj("good", &Degree::Superlative), "best"); 88 | /// assert_eq!(English::adj("fun", &Degree::Comparative), "more fun"); 89 | /// ``` 90 | pub fn adj(word: &str, degree: &Degree) -> String { 91 | let base_word = strip_trailing_number(word); 92 | match degree { 93 | Degree::Positive => base_word.to_owned(), 94 | Degree::Comparative => { 95 | if let Some((comp, _)) = get_adjective_forms(word) { 96 | comp.to_owned() 97 | } else { 98 | EnglishCore::comparative(&base_word) 99 | } 100 | } 101 | Degree::Superlative => { 102 | if let Some((_, sup)) = get_adjective_forms(word) { 103 | sup.to_owned() 104 | } else { 105 | EnglishCore::superlative(&base_word) 106 | } 107 | } 108 | } 109 | } 110 | 111 | /// Conjugates a verb into the requested form. 112 | /// 113 | /// Handles irregular verbs (e.g., `"go" -> "went"`, `"eat" -> "ate"`) 114 | /// and falls back to regular conjugation rules when no override is found. 115 | /// Strips trailing numbers used for sense disambiguation (`"lie2"` -> `"lied"`). 
116 | /// 117 | /// # Examples 118 | /// ```rust 119 | /// // Regular verb 120 | /// assert_eq!( 121 | /// English::verb("walk", &Person::Third, &Number::Singular, &Tense::Present, &Form::Finite), 122 | /// "walks" 123 | /// ); 124 | /// 125 | /// // Irregular verb 126 | /// assert_eq!( 127 | /// English::verb("eat", &Person::Third, &Number::Singular, &Tense::Past, &Form::Finite), 128 | /// "ate" 129 | /// ); 130 | /// 131 | /// // Participle 132 | /// assert_eq!( 133 | /// English::verb("go", &Person::Third, &Number::Plural, &Tense::Past, &Form::Participle), 134 | /// "gone" 135 | /// ); 136 | /// ``` 137 | pub fn verb>( 138 | wordish: T, 139 | person: &Person, 140 | number: &Number, 141 | tense: &Tense, 142 | form: &Form, 143 | ) -> String { 144 | let verb: Verb = wordish.into(); 145 | let base_word = strip_trailing_number(&verb.head); 146 | // Conjugate the head verb 147 | let conjugated_head = match get_verb_forms(&verb.head) { 148 | Some(wordik) => match (person, number, tense, form) { 149 | (_, _, _, Form::Infinitive) => base_word.to_owned(), 150 | (Person::Third, Number::Singular, Tense::Present, Form::Finite) => { 151 | wordik.0.to_string() 152 | } 153 | (_, _, Tense::Present, Form::Finite) => base_word.to_owned(), 154 | (_, _, Tense::Present, Form::Participle) => wordik.2.to_owned(), 155 | (_, _, Tense::Past, Form::Participle) => wordik.3.to_owned(), 156 | (_, _, Tense::Past, Form::Finite) => wordik.1.to_owned(), 157 | }, 158 | None => EnglishCore::verb(&base_word, person, number, tense, form), 159 | }; 160 | // Combine with particle efficiently 161 | if let Some(particle) = verb.particle { 162 | let mut result = String::with_capacity(conjugated_head.len() + 1 + particle.len()); 163 | result.push_str(&conjugated_head); 164 | result.push(' '); 165 | result.push_str(&particle); 166 | result 167 | } else { 168 | conjugated_head 169 | } 170 | } 171 | /// Returns the correct English pronoun for the given grammatical features. 
172 | /// 173 | /// # Examples 174 | /// ```rust 175 | /// assert_eq!( 176 | /// English::pronoun(&Person::First, &Number::Singular, &Gender::Neutral, &Case::Nominative), 177 | /// "I" 178 | /// ); 179 | /// assert_eq!( 180 | /// English::pronoun(&Person::Third, &Number::Singular, &Gender::Feminine, &Case::Nominative), 181 | /// "she" 182 | /// ); 183 | /// assert_eq!( 184 | /// English::pronoun(&Person::Third, &Number::Plural, &Gender::Neutral, &Case::Nominative), 185 | /// "they" 186 | /// ); 187 | /// ``` 188 | pub fn pronoun(person: &Person, number: &Number, gender: &Gender, case: &Case) -> &'static str { 189 | EnglishCore::pronoun(person, number, gender, case) 190 | } 191 | /// Adds an English possessive suffix (`'s` or `'`) to a word. 192 | /// 193 | /// # Examples 194 | /// ```rust 195 | /// assert_eq!(English::add_possessive("dog"), "dog's"); 196 | /// assert_eq!(English::add_possessive("dogs"), "dogs'"); 197 | /// ``` 198 | pub fn add_possessive(word: &str) -> String { 199 | EnglishCore::add_possessive(word) 200 | } 201 | 202 | /// Capitalizes the first character of a string, leaving the rest unchanged. 203 | /// 204 | /// # Examples 205 | /// ```rust 206 | /// assert_eq!(English::capitalize_first("house"), "House"); 207 | /// ``` 208 | pub fn capitalize_first(s: &str) -> String { 209 | let mut c = s.chars(); 210 | match c.next() { 211 | None => String::new(), 212 | Some(first) => first.to_uppercase().collect::<String>() + c.as_str(), 213 | } 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # english 2 | 3 | [![Crates.io](https://img.shields.io/crates/v/english)](https://crates.io/crates/english) 4 | [![Docs.rs](https://docs.rs/english/badge.svg)](https://docs.rs/english) 5 | ![License](https://img.shields.io/crates/l/english) 6 |
[![Discord](https://img.shields.io/discord/123456789012345678.svg?logo=discord&logoColor=white&color=5865F2)](https://discord.gg/tDBPkdgApN) 7 | 8 | 9 | **english** is a blazing-fast, lightweight English inflection library written in Rust. Total bundled data size is less than 1 MB. It provides extremely accurate verb conjugation and noun/adjective declension based on highly processed Wiktionary data, making it ideal for real-time procedural text generation. 10 | 11 | ## ⚡ Speed and Accuracy 12 | 13 | Evaluation of the English inflector (the `check_*` functions in `extractor/src/main.rs`) and performance benchmarking (`examples/speedmark.rs`) show: 14 | 15 | | Part of Speech | Correct / Total | Accuracy | Throughput (calls/sec) | Time per Call | 16 | |----------------|----------------|-----------|-----------------------|---------------| 17 | | **Nouns** | 238106 / 238549 | 99.81% | 5,228,300 | 191 ns | 18 | | **Verbs** | 158056 / 161643 | 97.78% | 8,473,248 | 118 ns | 19 | | **Adjectives** | 119200 / 119356 | 99.86% | 11,999,052 | 83 ns | 20 | 21 | *Note: Benchmarking was done under a worst-case scenario; typical real-world usage is about 50 nanoseconds faster.* 22 | 23 | ## 📦 Installation 24 | 25 | ``` 26 | cargo add english 27 | ``` 28 | 29 | Then in your code: 30 | 31 | ```rust 32 | use english::*; 33 | fn main() { 34 | // --- Mixed Sentence Example --- 35 | let subject_number = Number::Plural; 36 | let run = Verb::present_participle("run"); // running 37 | let child = Noun::from("child").with_specifier(run); //running child 38 | let subject = English::noun(child, &subject_number); //running children 39 | let verb = English::verb( 40 | "steal", 41 | &Person::Third, 42 | &subject_number, 43 | &Tense::Past, 44 | &Form::Finite, 45 | ); //stole 46 | let object = Noun::count_with_number("potato", 7); //7 potatoes 47 | 48 | let sentence = format!("The {} {} {}.", subject, verb, object); 49 | assert_eq!(sentence, "The running children stole 7 potatoes."); 50 | 51 | // --- Nouns --- 52 | //
Note that noun(), count(), etc can work on both strings and Noun struct 53 | let jeans = Noun::from("pair").with_complement("of jeans"); 54 | assert_eq!(Noun::count_with_number(jeans, 3), "3 pairs of jeans"); 55 | // Regular plurals 56 | assert_eq!(English::noun("cat", &Number::Plural), "cats"); 57 | // Add a number 2-9 to the end of the word to try different forms. 58 | // Can use plural() 59 | assert_eq!(Noun::plural("die2"), "dice"); 60 | // Use count function for better ergonomics if needed 61 | assert_eq!(Noun::count("man", 2), "men"); 62 | // Use count_with_number function to preserve the number 63 | assert_eq!(Noun::count_with_number("nickel", 3), "3 nickels"); 64 | // Invariant nouns 65 | assert_eq!(English::noun("sheep", &Number::Plural), "sheep"); 66 | 67 | // --- Verbs --- 68 | // All verb functions can use either strings or Verb struct 69 | let pick_up = Verb::from("pick").with_particle("up"); 70 | // Helper functions: past() , third_person(), present_participle(), infinitive() etc. 71 | assert_eq!(Verb::past(&pick_up,), "picked up"); 72 | assert_eq!(Verb::present_participle("walk"), "walking"); 73 | assert_eq!(Verb::past_participle("go"), "gone"); 74 | // Add a number 2-9 to the end of the word to try different forms. 75 | assert_eq!(Verb::past("lie"), "lay"); 76 | assert_eq!(Verb::past("lie2"), "lied"); 77 | // "to be" has the most verb forms in english and requires using verb() 78 | assert_eq!( 79 | English::verb( 80 | "be", 81 | &Person::First, 82 | &Number::Singular, 83 | &Tense::Present, 84 | &Form::Finite 85 | ), 86 | "am" 87 | ); 88 | 89 | // --- Adjectives --- 90 | // Add a number 2-9 to the end of the word to try different forms. 
(Bad has the most forms at 3) 91 | assert_eq!(English::adj("bad", &Degree::Comparative), "more bad"); 92 | assert_eq!(English::adj("bad", &Degree::Superlative), "most bad"); 93 | assert_eq!(Adj::comparative("bad2"), "badder"); 94 | assert_eq!(Adj::superlative("bad2"), "baddest"); 95 | assert_eq!(Adj::comparative("bad3"), "worse"); 96 | assert_eq!(Adj::superlative("bad3"), "worst"); 97 | assert_eq!(Adj::positive("bad3"), "bad"); 98 | 99 | // --- Pronouns --- 100 | assert_eq!( 101 | English::pronoun( 102 | &Person::First, 103 | &Number::Singular, 104 | &Gender::Neuter, 105 | &Case::PersonalPossesive 106 | ), 107 | "my" 108 | ); 109 | assert_eq!( 110 | English::pronoun( 111 | &Person::First, 112 | &Number::Singular, 113 | &Gender::Neuter, 114 | &Case::Possessive 115 | ), 116 | "mine" 117 | ); 118 | 119 | // --- Possessives --- 120 | assert_eq!(English::add_possessive("dog"), "dog's"); 121 | assert_eq!(English::add_possessive("dogs"), "dogs'"); 122 | } 123 | ``` 124 | 125 | --- 126 | 127 | ## 🔧 Crate Overview 128 | 129 | ### `english` 130 | 131 | > The public API for verb conjugation and noun/adjective declension. 132 | 133 | * Combines optimized data generated from `extractor` with inflection logic from `english-core` 134 | * Pure Rust, no external dependencies 135 | * Fast Binary search over pre-sorted arrays: `O(log n)` lookup. 136 | * Code generation ensures no runtime penalty. 137 | 138 | ### `english-core` 139 | 140 | > The core engine for English inflection — pure algorithmic logic. 141 | 142 | * Implements the core rules for conjugation/declension 143 | * Used to classify forms as regular or irregular for the extractor 144 | * Has no data dependency — logic-only 145 | * Can be used stand alone for an even smaller footprint (at the cost of some accuracy) 146 | 147 | ### `extractor` 148 | 149 | > A tool to process and refine Wiktionary data. 
150 | 151 | * Parses large English Wiktionary dumps 152 | * Extracts all verb, noun, and adjective forms 153 | * Uses `english-core` to filter out regular forms, preserving only irregulars 154 | * Generates sorted static arrays for use in `english` 155 | 156 | --- 157 | 158 | ## 📦 Obtaining Wiktionary Data & Running the Extractor 159 | 160 | This project relies on raw data extracted from Wiktionary. The current version was built with data from August 17, 2025. 161 | 162 | - [Wiktextract (GitHub)](https://github.com/tatuylonen/wiktextract) 163 | - [Kaikki.org raw data](https://kaikki.org/dictionary/rawdata.html) 164 | 165 | ### Steps 166 | 167 | 1. Download the **raw Wiktextract JSONL dump** (~20 GB) from [Kaikki.org](https://kaikki.org/dictionary/rawdata.html). 168 | 2. Place the file somewhere accessible (e.g. `../rawwiki.jsonl`). 169 | 3. From the `extractor` folder, run: `cargo run --release ../rawwiki.jsonl` 170 | 4. Move the generated files `adj_array.rs`, `noun_array.rs`, and `verb_array.rs` into the `src/` directory of the `english` crate. 171 | 172 | ## Benchmarks 173 | Performance benchmarks were run on my M2 MacBook. 174 | 175 | Writing benchmarks and tests for such a project is rather difficult and requires opinionated decisions. Many words may have alternative inflections, and the data in Wiktionary is not perfect. Many words might be both countable and uncountable, and the tagging of words may be inconsistent. This library includes a few uncountable words in its dataset, but not all. Uncountable words require special handling anyway. Take all benchmarks with a pound of salt, and write your own tests for your own use cases. Any suggestions to improve the benchmarking are highly appreciated. 176 | 177 | ## Disclaimer 178 | Wiktionary data is often unstable and subject to weird changes. This means that the provided inflections may change unexpectedly. You can look at the diffs of the `*_array.rs` files for a source of truth.
179 | 180 | ## Inspirations and Thanks 181 | - Ole in the Bevy Discord suggested I use `phf` instead of sorted arrays; this resulted in speedups of up to 40% 182 | - https://github.com/atteo/evo-inflector 183 | - https://github.com/plurals/pluralize 184 | 185 | 186 | ## 📄 License 187 | 188 | - Code: Dual-licensed under MIT and Apache-2.0 © 2024 [gold-silver-copper](https://github.com/gold-silver-copper) 189 | - [MIT](https://opensource.org/licenses/MIT) 190 | - [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) 191 | 192 | - Data: Wiktionary content is dual-licensed under 193 | - [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) 194 | - [GNU FDL](https://www.gnu.org/licenses/fdl-1.3.html) 195 | -------------------------------------------------------------------------------- /extractor/noun_plural_check.csv: -------------------------------------------------------------------------------- 1 | wiki_single,wiktionary_plural 2 | encyclopedia,encyclopediæ 3 | pound,pound 4 | month,month 5 | abscissa,abscissæ 6 | lens,lens 7 | lens,lentes 8 | year,year 9 | irc,irc's 10 | stock,stocken 11 | comma,commata 12 | comma,commaes 13 | turkey,turkies 14 | portuguese,portugueses 15 | index,index's 16 | cow,kye 17 | citrus,citrusses 18 | penny,pens 19 | deer,deers 20 | tree,treen 21 | boy,boyz 22 | brother,brethren 23 | sister,sistren 24 | child,childer 25 | daughter,daughtren 26 | bee,been 27 | honey,honies 28 | how,how's 29 | khaki,khakies 30 | pupa,pupæ 31 | vertebra,vertebræ 32 | moose,meese 33 | equinox,equinoctes 34 | replica,repliche 35 | mc,mc's 36 | eye,eyen 37 | eye,eyne 38 | ok,ok's 39 | acre,acre 40 | foot,foot 41 | knee,kneen 42 | pizza,pizze 43 | do,do's 44 | maori,maories 45 | larva,larvæ 46 | shoe,shoon 47 | shoe,shoen 48 | why,why's 49 | apparatus,apparatûs 50 | apparatus,apparatūs 51 | apparatus,apparati 52 | camera,cameræ 53 | camera,camerae 54 | cheese,cheesen 55 | complex,complices 56 | cloth,clothes 57 | box,boxen 58 | basis,baseis 59 |
basis,basises 60 | polynya,polynyi 61 | piano,pianoes 62 | octopus,octopusses 63 | octopus,octopi 64 | octopus,octopii 65 | octopus,octopodes 66 | octopus,octopus 67 | beaver,beaver 68 | donkey,donkies 69 | fox,foxen 70 | house,housen 71 | house,hice 72 | jew,jewes 73 | subpoena,subpoenæ 74 | c,c's 75 | k,k's 76 | q,q's 77 | v,v's 78 | calf,calfs 79 | integer,integri 80 | hyperbola,hyperbolæ 81 | attorney,attornies 82 | rhinoceros,rhinoceri 83 | rhinoceros,rhinoceroi 84 | rhinoceros,rhinocerotes 85 | extremity,extremitys 86 | granary,granarys 87 | abc,abc's 88 | elf,elfs 89 | idea,ideæ 90 | phylum,phylums 91 | genus,genusses 92 | genus,genii 93 | uva,uvæ 94 | area,areæ 95 | mile,mile 96 | flea,fleen 97 | inch,inch 98 | omnibus,omnibi 99 | philistine,philistim 100 | real,réis 101 | seraph,seraphims 102 | pair,pair 103 | testa,testæ 104 | nexus,nexus 105 | sandwich,sandwichs 106 | candela,candelae 107 | lux,luces 108 | fatwa,fatawa 109 | axis,axiis 110 | penumbra,penumbræ 111 | panacea,panaceæ 112 | stepbrother,stepbrethren 113 | oryx,oryges 114 | wildebeest,wildebeesten 115 | lion,lion 116 | monkey,monkies 117 | valley,vallies 118 | hernia,herniæ 119 | tsar,tsari 120 | rex,reges 121 | mohawk,"""mohawks"" in all other senses" 122 | bacterium,bacteriums 123 | scenario,scenari 124 | nebula,nebulæ 125 | datum,datums 126 | formula,formulæ 127 | fungus,fungusses 128 | oasis,oasises 129 | vita,vitæ 130 | sagitta,sagittae 131 | hydra,hydræ 132 | zona,zonæ 133 | metropolis,metropolisses 134 | metropolis,metropolis's 135 | fenestra,fenestræ 136 | sow,swine 137 | chick,chicken 138 | lamb,lamber 139 | lamb,lambren 140 | pea,pease 141 | portcullis,portculli 142 | portcullis,portscullis 143 | agenda,agendae 144 | candelabrum,candelabrums 145 | yard,yard 146 | hobbit,hobbitses 147 | cornucopia,cornucopiæ 148 | furlong,furlong 149 | sock,sox 150 | gyros,gyroi 151 | dachshund,dachshunde 152 | toga,togæ 153 | bolus,boli 154 | pasta,paste 155 | bonus,boni 156 | ala,alæ 157 | 
catapulta,catapultæ 158 | braggadocio,braggadocioes 159 | braggadocio,braggadocii 160 | fax,faxxes 161 | virus,virusses 162 | virus,vira 163 | palpebra,palpebræ 164 | vagina,vaginæ 165 | axolotl,axolots 166 | cicada,cicadæ 167 | phobia,phobiæ 168 | chorus,chorusses 169 | chorus,chori 170 | fresco,freschi 171 | chimney,chimnies 172 | storey,stories 173 | minutia,minutiæ 174 | trachea,tracheæ 175 | corgi,corgwn 176 | amir,umara 177 | boa,boæ 178 | corona,coronæ 179 | fovea,foveæ 180 | lacuna,lacunæ 181 | tuber,tubera 182 | umbra,umbræ 183 | ballista,ballistæ 184 | fascia,fasciæ 185 | fauna,faunæ 186 | galley,gallies 187 | glans,glans 188 | punk,punx 189 | saliva,salivæ 190 | amnesia,amnesiæ 191 | inertia,inertiæ 192 | supernova,supernovæ 193 | sos,sos's 194 | platypus,platypodes 195 | asbestos,asbesti 196 | rabbi,rabbies 197 | persona,personæ 198 | viola,viole 199 | walrus,walri 200 | walrus,walrii 201 | molecule,moleculæ 202 | garda,gardaí 203 | sarcophagus,sarcophagusses 204 | pulley,pullies 205 | parabola,parabolæ 206 | arena,arenæ 207 | shits,the shits 208 | telly,tellys 209 | cassia,cassiæ 210 | retina,retinæ 211 | spatula,spatulæ 212 | aura,auræ 213 | ambrosia,ambrosiae 214 | hysteria,hysteriæ 215 | climax,climaces 216 | macron,macra 217 | orchestra,orchestrae 218 | paranoia,paranoiæ 219 | detritus,detrita 220 | cannula,cannulæ 221 | gaeltacht,gaeltachtaí 222 | rebus,rebusses 223 | rebus,rebi 224 | flora,floræ 225 | slr,slr's 226 | stria,striæ 227 | nausea,nauseæ 228 | uvula,uvulæ 229 | vulva,vulvæ 230 | lek,lekë 231 | lb,lbs. 
232 | anemia,anemiæ 233 | tiara,tiarae 234 | tiara,tiaræ 235 | furcula,furculæ 236 | cc,cc's 237 | cornea,corneæ 238 | da,da's 239 | monomania,monomaniæ 240 | cello,celli 241 | giallo,gialli 242 | rpm,rpm's 243 | circus,circusses 244 | circus,circi 245 | fibula,fibulæ 246 | maxilla,maxillæ 247 | a,a's 248 | a,aes 249 | garda,gardaí 250 | i,i's 251 | j,j's 252 | o,o's 253 | slav,slavi 254 | v,v's 255 | z,z's 256 | bison,bisontes 257 | frappuccino,frappuccini 258 | macchiato,macchiati 259 | zeitgeist,zeitgeisten 260 | tabula,tabulæ 261 | willies,the willies 262 | pneumonia,pneumoniæ 263 | injuria,injuriæ 264 | areola,areolæ 265 | emir,umara 266 | rv,rv's 267 | ko,ko's 268 | acs,acs's 269 | fistula,fistulæ 270 | caesar,caesares 271 | kindergarten,kindergärten 272 | sclera,scleræ 273 | seta,setæ 274 | amaryllis,amaryllides 275 | specimen,specimina 276 | bandit,banditti 277 | chrysalis,chrysalisses 278 | cantata,cantate 279 | oss,oss's 280 | aficionado,aficionadi 281 | marquis,marquisses 282 | hots,the hots 283 | aula,aulæ 284 | airbus,airbii 285 | kobold,kobolde 286 | tropics,the tropics 287 | lamina,laminæ 288 | cbd,cbd's 289 | medusa,medusæ 290 | scrofula,scrofulæ 291 | brennschluss,brennschlüsse 292 | medulla,medullæ 293 | mentula,mentulæ 294 | stations,the stations 295 | corolla,corollæ 296 | sultan,salateen 297 | mastiff,mastives 298 | concha,conchæ 299 | chorea,choreæ 300 | sequela,sequelæ 301 | stewardess,stewardii 302 | sniffles,the sniffles 303 | fackeltanz,fackeltänze 304 | skins,the skins 305 | zugzwang,zugzwänge 306 | litas,litų 307 | aureus,aureii 308 | kleenex,kleenices 309 | kleenex,kleenices 310 | nineties,the nineties 311 | fossa,fossæ 312 | essentials,the essentials 313 | mk,mk's 314 | fach,fächer 315 | rhinocerot,rhinocerotes 316 | creeps,the creeps 317 | putto,puttoes 318 | membrana,membranæ 319 | shakes,the shakes 320 | rts,rts's 321 | conceptus,conceptūs 322 | cowan,cowanis 323 | phy,phy's 324 | spins,the spins 325 | splits,the splits 326 | 
blahs,the blahs 327 | wanderwort,wanderwörter 328 | bursa,bursæ 329 | fortepiano,fortepianoes 330 | coryza,coryzæ 331 | cursus,cursus 332 | cursus,cursūs 333 | cursus,cursi 334 | fursona,fursonae 335 | tupuna,tūpuna 336 | lunula,lunulæ 337 | pan,pen 338 | underhanded,the underhanded 339 | crista,cristæ 340 | faqih,fuqaha' 341 | orchis,orchisses 342 | fbo,fbo's 343 | stipula,stipulæ 344 | understory,understorys 345 | landammann,landammänner 346 | habenula,habenulæ 347 | wirtshaus,wirtshäuser 348 | stoa,stoæ 349 | cithara,citharæ 350 | rotavirus,rotaviri 351 | dwindles,the dwindles 352 | tuchus,tuchi 353 | collywobbles,the collywobbles 354 | auslaut,auslaute 355 | mirepoix,mirepoixs 356 | cyma,cymæ 357 | fossula,fossulæ 358 | fimbria,fimbriæ 359 | telangiectasia,telangiectasiæ 360 | rothschild,rothschildren 361 | userbox,userboxen 362 | fqih,fuqaha' 363 | mashgiach,mashgihim 364 | vizsla,vizslák 365 | oms,oms's 366 | dysesthesia,dysesthesiae 367 | kupuna,na kupuna 368 | ais,ais's 369 | vibratiuncula,vibratiunculæ 370 | scrophula,scrophulæ 371 | hartebeest,hartebeesten 372 | foveola,foveolæ 373 | nvocc,nvocc's 374 | cestui,cestuies 375 | lebensraum,lebensräume 376 | prester,presteres 377 | gwerz,gwerzioù 378 | sads,the sads 379 | sxs,sxs's 380 | antechinus,antechini 381 | lagena,lagenæ 382 | clavicula,claviculæ 383 | tps,tps's 384 | hydroecium,hydrœcia 385 | brisky,uncertain: briskys 386 | pacs,pacs's 387 | golding,goldinges 388 | patellula,patellulæ 389 | hoodman,hoodmans 390 | jotun,jötnar 391 | conferva,confervæ 392 | asbestus,asbesti 393 | sprachbund,sprachbünde 394 | temperates,the temperates 395 | primigravida,primigravidæ 396 | actinula,actinulæ 397 | sprachbund,sprachbünde 398 | aquafauna,aquafaunæ 399 | azat,azatk‘ 400 | shu,shu's 401 | scu,scu's 402 | lallwort,lallwörter 403 | endotesta,endotestæ 404 | nachlass,nachlässe 405 | subcontinuum,subcontinuums 406 | dts,the dts 407 | tx,tx's 408 | leonardeschi,the leonardeschi 409 | wasteman,wastemans 410 | 
hoa,hoa's 411 | gaeilgeoir,gaeilgeoirí 412 | stubborns,the stubborns 413 | amebula,amebulæ 414 | dismals,the dismals 415 | urstromtal,urstromtäler 416 | sleepies,the sleepies 417 | chaoskampf,chaoskampfs 418 | uap,uap's 419 | paprikahuhn,paprikahühner 420 | ulx,ulx's 421 | lacc,lacc's 422 | sxs,sxs's 423 | pasuk,p'sukim 424 | lgbo,lgbo's 425 | lionesses,the lionesses 426 | residenzstadt,residenzstädte 427 | dphil,dphil's 428 | dolefuls,the dolefuls 429 | ozt,ozt's 430 | sandnigga,sandniggaz 431 | eaas,eaas's 432 | handie,handsies 433 | blahaj,blahajar 434 | his,his's 435 | kb,kb's 436 | ytpmv,ytpmv's 437 | fto,fto's 438 | meemies,the meemies 439 | hisatsinom,the hisatsinom 440 | amoebula,amoebulæ 441 | pwn,pwne 442 | sturzstrom,sturzströme 443 | insp,insp's 444 | vax,vaxen 445 | -------------------------------------------------------------------------------- /extractor/insane_noun.csv: -------------------------------------------------------------------------------- 1 | word,plural,frequency 2 | ,s,199688 3 | y,ies,12214 4 | ,es,8708 5 | ,,4767 6 | um,a,1790 7 | an,en,1722 8 | is,es,1714 9 | ,e,983 10 | us,i,917 11 | on,a,607 12 | ,ta,502 13 | ful,sful,376 14 | o,i,279 15 | s,des,235 16 | x,ces,216 17 | ,im,186 18 | s,i,173 19 | rson,ople,166 20 | ,i,154 21 | f,ves,137 22 | a,e,105 23 | ,ses,84 24 | fe,ves,76 25 | ex,ices,71 26 | ah,ot,70 27 | e,i,66 28 | ,en,64 29 | ,x,63 30 | a,i,60 31 | ,n,51 32 | ah,oth,51 33 | x,ges,50 34 | ,a,48 35 | en,ina,41 36 | a,ot,39 37 | oot,eet,37 38 | n,,37 39 | ,y,37 40 | ouse,ice,35 41 | ,ren,35 42 | a,y,32 43 | ,z,27 44 | ey,ies,26 45 | ,zes,26 46 | s,tes,25 47 | e,ia,24 48 | e,ae,22 49 | a,oth,21 50 | ff,ves,20 51 | ,nes,20 52 | is,eis,19 53 | e,ai,19 54 | us,era,18 55 | o,ines,17 56 | o,,17 57 | ,m,16 58 | ,ae,16 59 | ,er,14 60 | ooth,eeth,14 61 | ful,esful,12 62 | a,os,12 63 | a,es,12 64 | ,ot,11 65 | ah,os,11 66 | oose,eese,11 67 | s,,10 68 | yful,iesful,10 69 | ,t,10 70 | ,oth,10 71 | es,ai,10 72 | o,hi,10 73 | o,a,10 74 | 
x,kes,8 75 | s,es,8 76 | us,a,8 77 | e,a,7 78 | ets,tsy,7 79 | ,un,7 80 | ,ob,7 81 | ,j,7 82 | ,in,7 83 | ,au,6 84 | h,t,6 85 | s,ta,6 86 | es,ites,6 87 | s,tia,6 88 | ok,ki,6 89 | z,ces,5 90 | um,i,5 91 | os,e,5 92 | er,arim,5 93 | ,g,5 94 | ,k,5 95 | ,ach,5 96 | ,ia,5 97 | ,tes,4 98 | ed,dim,4 99 | us,ii,4 100 | ,at,4 101 | ,een,4 102 | ,u,4 103 | us,ora,4 104 | ,ie,4 105 | a,,4 106 | el,lach,4 107 | ,ech,4 108 | ,ata,4 109 | us,es,4 110 | en,ines,4 111 | ,it,4 112 | ,r,4 113 | ur,ora,4 114 | us,odes,4 115 | er,rim,4 116 | q,t,4 117 | ,ar,3 118 | us,is,3 119 | os,i,3 120 | ,lar,3 121 | um,ae,3 122 | k,it,3 123 | ,ion,3 124 | ,ok,3 125 | ,is,3 126 | ,den,3 127 | f,vim,3 128 | ut,ita,3 129 | s,ra,3 130 | es,ae,3 131 | l,ux,3 132 | et,tim,3 133 | l,is,3 134 | a,or,3 135 | eps,ipes,3 136 | ad,den,3 137 | el,alim,3 138 | i,a,3 139 | a,ur,3 140 | ,os,3 141 | an,onim,2 142 | us,ae,2 143 | i,,2 144 | sm,,2 145 | os,sim,2 146 | erfamilias,resfamilias,2 147 | us,des,2 148 | ,ai,2 149 | aon,eonim,2 150 | a,in,2 151 | full,sfull,2 152 | m,ns,2 153 | en,anim,2 154 | ia,es,2 155 | ot,ten,2 156 | n,i,2 157 | k,t,2 158 | ar,roth,2 159 | ,se,2 160 | u,i,2 161 | et,ot,2 162 | ek,ky,2 163 | ,q,2 164 | onseigneur,esseigneurs,2 165 | ,des,2 166 | adam,esdames,2 167 | f,ven,2 168 | o,e,2 169 | ,oi,2 170 | e,ies,2 171 | s,res,2 172 | ah,im,2 173 | t,s,2 174 | ya,i,2 175 | e,oi,2 176 | ,ov,2 177 | eh,oth,2 178 | fful,vesful,2 179 | each,igh,2 180 | il,ux,2 181 | e,ce,2 182 | fidi,wafid,2 183 | anservant,enservants,2 184 | ademoiselle,esdemoiselles,2 185 | ak,ci,2 186 | f,vs,2 187 | ,eanna,2 188 | anchild,enchildren,2 189 | el,lech,2 190 | eder,adarim,2 191 | ,th,2 192 | ka,ok,2 193 | a,een,2 194 | ny,ce,2 195 | li,tin,2 196 | s,e,2 197 | ar,our,2 198 | e,s,2 199 | os,atim,2 200 | er,ri,2 201 | tis,des,2 202 | q,it,2 203 | ,ean,2 204 | ia,e,2 205 | e,ata,2 206 | y,ia,2 207 | adame,esdames,2 208 | onsieur,essieurs,2 209 | os,ea,2 210 | a,he,2 211 | i,een,2 212 | ,ir,2 213 | sibi,wasib,2 
214 | x,ches,2 215 | eps,ipites,2 216 | ets,tsi,2 217 | juz,ajza,1 218 | ah,iyos,1 219 | rijite,warij,1 220 | em,omim,1 221 | oof,eef,1 222 | okhol,akhly,1 223 | umlungu,abelungu,1 224 | ee,,1 225 | ein,anim,1 226 | saw,asaw,1 227 | ,le,1 228 | misr,amsar,1 229 | le,rren,1 230 | orafe,erafe,1 231 | eg,agim,1 232 | uramba,iramba,1 233 | sa,jes,1 234 | t,ot,1 235 | ax,ices,1 236 | ,essrs,1 237 | nduna,zinduna,1 238 | an,ni,1 239 | est,sunt,1 240 | khara,akhir,1 241 | nasheed,anasheed,1 242 | y,e,1 243 | c,,1 244 | amenukal,imenukalen,1 245 | add,udud,1 246 | w,ois,1 247 | zil,azil,1 248 | ann,enn,1 249 | di,oud,1 250 | ovabitch,sovbitches,1 251 | en,onim,1 252 | ek,iky,1 253 | mganga,waganga,1 254 | y,ce,1 255 | ,xes,1 256 | ra,war,1 257 | es,osim,1 258 | h,ot,1 259 | tl,,1 260 | h,s,1 261 | nia,en,1 262 | moran,ilmoran,1 263 | umkhwetha,abakwetha,1 264 | inego,sinegoes,1 265 | ah,ioth,1 266 | by,sby,1 267 | us,,1 268 | at,ot,1 269 | i,onin,1 270 | es,ides,1 271 | rs,esdames,1 272 | kgotla,dikgotla,1 273 | ,ovi,1 274 | riya,wari,1 275 | amazigh,imazighen,1 276 | ri,li,1 277 | ,tsy,1 278 | ,ja,1 279 | ,bi,1 280 | umuzungu,abazungu,1 281 | y,ees,1 282 | e,tes,1 283 | soa,osi,1 284 | ngaka,dingaka,1 285 | r,yr,1 286 | erg,areg,1 287 | kibanja,bibanja,1 288 | umkhwetha,abakhwetha,1 289 | ,yat,1 290 | h,e,1 291 | ie,ce,1 292 | aphil,ephilim,1 293 | ,tys,1 294 | on,i,1 295 | s,is,1 296 | oste,atim,1 297 | umuzungu,bazungu,1 298 | ,ens,1 299 | es,aisim,1 300 | eikh,uyookh,1 301 | adol,edolim,1 302 | itur,untur,1 303 | er,res,1 304 | us,ina,1 305 | o,is,1 306 | ie,s,1 307 | uvabitch,suvbitches,1 308 | ashah,shioth,1 309 | a,s,1 310 | s,x,1 311 | onsr,essrs,1 312 | hisname,theirnames,1 313 | j,ljj,1 314 | da,ren,1 315 | sangoma,zangoma,1 316 | indiq,andaqa,1 317 | oote,eete,1 318 | woman,swomen,1 319 | ,ah,1 320 | mzungu,wazungu,1 321 | e,ren,1 322 | ongolawi,anagla,1 323 | wa,awat,1 324 | tambala,matambala,1 325 | lma,walim,1 326 | tar,darka,1 327 | uy,luim,1 328 | 
mbongi,zimbongi,1 329 | sibongo,zibongo,1 330 | omanhene,amanhene,1 331 | als,ulus,1 332 | eder,adorim,1 333 | martial,smartial,1 334 | esh,ashim,1 335 | ,ons,1 336 | aliah,elihim,1 337 | ootful,eetful,1 338 | id,eda,1 339 | ,il,1 340 | ils,ulus,1 341 | zeeyeh,wazee,1 342 | myometritis,endometritides,1 343 | it,aisim,1 344 | tur,ntur,1 345 | imrah,emirot,1 346 | it,eysim,1 347 | sente,lisente,1 348 | e,ides,1 349 | s,isim,1 350 | ,as,1 351 | x,ctes,1 352 | mbongi,imbongi,1 353 | ification,fications,1 354 | ,hang,1 355 | ol,ly,1 356 | hizb,ahzab,1 357 | ya,at,1 358 | oothmark,eethmarks,1 359 | afiz,uffaz,1 360 | umzulu,amazulu,1 361 | al,is,1 362 | waqf,awqaf,1 363 | eq,aqim,1 364 | ,na,1 365 | an,n,1 366 | ,eh,1 367 | ,em,1 368 | afek,fekot,1 369 | is,u,1 370 | ah,iyoth,1 371 | ushaf,asahif,1 372 | er,ren,1 373 | ,zim,1 374 | achresis,echreses,1 375 | r,ar,1 376 | s,thes,1 377 | mosarwa,basarwa,1 378 | nn,wan,1 379 | ars,uroos,1 380 | en,onem,1 381 | ach,chim,1 382 | ow,attle,1 383 | nyanga,zinyanga,1 384 | esh,shayim,1 385 | tl,meh,1 386 | ed,de,1 387 | ilbab,alabib,1 388 | ok,ukim,1 389 | mbizo,zimbizo,1 390 | ars,urus,1 391 | araph,eraphim,1 392 | eole,olae,1 393 | wa,een,1 394 | intaqah,anatiq,1 395 | ah,iyot,1 396 | sir,asir,1 397 | r,essrs,1 398 | ces,ses,1 399 | ek,kim,1 400 | lme,walim,1 401 | nie,ce,1 402 | ace,i,1 403 | ootpaw,eetpaws,1 404 | do,be,1 405 | mon,yn,1 406 | i,anim,1 407 | ah,in,1 408 | ann,inn,1 409 | c,x,1 410 | alim,ulama,1 411 | ful,sfuls,1 412 | rani,ara,1 413 | is,eisim,1 414 | itongo,amatongo,1 415 | ote,a,1 416 | wmwd,ymydau,1 417 | eyrir,aurar,1 418 | lilangeni,emalangeni,1 419 | ,la,1 420 | hadith,ahadith,1 421 | sharif,ashraf,1 422 | kungwi,makungwi,1 423 | ,me,1 424 | wakf,awkaf,1 425 | anfriend,enfriends,1 426 | y,ata,1 427 | e,lun,1 428 | un,en,1 429 | us,sim,1 430 | s,ysim,1 431 | ,ch,1 432 | aliah,elichim,1 433 | omyn,ymyn,1 434 | diptotes,nouns,1 435 | mus,wamis,1 436 | kwerekwere,amakwerekwere,1 437 | nie,s,1 438 | y,sies,1 
439 | ek,ki,1 440 | i,ah,1 441 | am,omim,1 442 | jid,ajid,1 443 | aykh,uyookh,1 444 | esh,ashoth,1 445 | s,tot,1 446 | oret,urot,1 447 | us,oi,1 448 | moloi,baloi,1 449 | ipsis,eipses,1 450 | lolwapa,malwapa,1 451 | gepik,qepik,1 452 | et,itten,1 453 | singhalese,cingalese,1 454 | ariqa,uruq,1 455 | o,eaux,1 456 | ,ut,1 457 | ,iv,1 458 | apanca,epanche,1 459 | ofabitch,sabitches,1 460 | oop,eep,1 461 | inkhosi,amakhosi,1 462 | ebsi,bassa,1 463 | ekwele,bipkwele,1 464 | omin,imin,1 465 | otshelo,etshelo,1 466 | mosotho,basotho,1 467 | an,,1 468 | ,thes,1 469 | eder,iddarim,1 470 | muzungu,wazungu,1 471 | os,ai,1 472 | is,asa,1 473 | triptotes,nouns,1 474 | rs,mes,1 475 | ,jev,1 476 | beast,,1 477 | ,sa,1 478 | sicoco,zicoco,1 479 | inn,awan,1 480 | l,aux,1 481 | s,zes,1 482 | asul,usul,1 483 | ets,tsiv,1 484 | mwalimu,walimu,1 485 | add,udood,1 486 | ayit,eytim,1 487 | elet,lot,1 488 | eh,im,1 489 | ,ada,1 490 | iss,lles,1 491 | lmah,walim,1 492 | aki,eke,1 493 | feful,vesful,1 494 | ophato,ephato,1 495 | ard,nards,1 496 | iec,ce,1 497 | as,es,1 498 | i,anin,1 499 | ,gun,1 500 | er,orim,1 501 | ofabitch,sofbitches,1 502 | aliach,luchim,1 503 | mzee,wazee,1 504 | ommin,immin,1 505 | er,,1 506 | us,eres,1 507 | wa,a,1 508 | ,h,1 509 | i,ar,1 510 | ratal,irtal,1 511 | ko,ot,1 512 | eq,qim,1 513 | homme,shommes,1 514 | n,sa,1 515 | s,ces,1 516 | di,adim,1 517 | ,ekh,1 518 | okoro,ekoro,1 519 | rotl,artal,1 520 | cow,kine,1 521 | nik,ota,1 522 | aliah,luchim,1 523 | i,onim,1 524 | ex,iges,1 525 | tan,atin,1 526 | desac,sdesac,1 527 | nkisi,minkisi,1 528 | e,tini,1 529 | aliah,lihim,1 530 | ,an,1 531 | in,a,1 532 | kama,cama,1 533 | ann,innan,1 534 | cimibundu,vimbundu,1 535 | o,ya,1 536 | osanto,isanti,1 537 | ach,chaot,1 538 | af,efim,1 539 | eder,idarim,1 540 | lmeh,walim,1 541 | elet,loth,1 542 | loti,maloti,1 543 | in,e,1 544 | afir,uffar,1 545 | thorax,horaces,1 546 | s,ntes,1 547 | et,ittim,1 548 | wthyn,ythynnod,1 549 | ,kuna,1 550 | hanedd,aneddion,1 551 | 
rah,war,1 552 | mot,smots,1 553 | um,en,1 554 | is,eson,1 555 | arif,urafa,1 556 | yot,ik,1 557 | ut,eet,1 558 | th,yoth,1 559 | aliah,lichim,1 560 | ,ni,1 561 | orena,arena,1 562 | ,len,1 563 | hadeeth,ahadeeth,1 564 | is,ai,1 565 | r,ir,1 566 | anne,enne,1 567 | d,ood,1 568 | eh,ot,1 569 | nik,ari,1 570 | eh,os,1 571 | gid,agid,1 572 | each,ich,1 573 | o,mus,1 574 | el,loch,1 575 | imrah,emiroth,1 576 | ,ims,1 577 | as,ones,1 578 | ,te,1 579 | ,och,1 580 | achua,wochua,1 581 | d,ud,1 582 | ex,icia,1 583 | huk,uit,1 584 | aliah,luhim,1 585 | ,ke,1 586 | ya,,1 587 | oritz,ritzim,1 588 | motswana,batswana,1 589 | k,cy,1 590 | an,ns,1 591 | likuta,makuta,1 592 | muwashshah,tawashih,1 593 | nkishi,minkishi,1 594 | y,es,1 595 | ys,ies,1 596 | ok,ky,1 597 | darme,sdarmes,1 598 | egetz,kotzim,1 599 | dah,ada,1 600 | ah,iot,1 601 | it,tin,1 602 | rasah,aris,1 603 | alian,elsh,1 604 | e,ya,1 605 | nill,illion,1 606 | elek,alakim,1 607 | ti,pol,1 608 | al,len,1 609 | ann,innah,1 610 | kgosana,dikgosana,1 611 | ,od,1 612 | ,ji,1 613 | eh,at,1 614 | kgosi,dikgosi,1 615 | sak,iskei,1 616 | etrum,hertum,1 617 | igqirha,amagqirha,1 618 | itrix,rices,1 619 | hil,jil,1 620 | lal,alil,1 621 | funa,nagu,1 622 | y,s,1 623 | a,ek,1 624 | herero,ovaherero,1 625 | alim,ulema,1 626 | inkosi,amakhosi,1 627 | -------------------------------------------------------------------------------- /extractor/analyzed_endings.csv: -------------------------------------------------------------------------------- 1 | singular_suffix,plural_suffix,count 2 | a,as,9629 3 | m,ms,1276 4 | f,fs,918 5 | i,i,724 6 | s,s,688 7 | a,a,568 8 | o,oes,522 9 | a,ata,499 10 | h,hs,375 11 | n,n,364 12 | e,e,339 13 | h,hes,324 14 | o,o,245 15 | y,ys,227 16 | lus,li,226 17 | is,ides,222 18 | u,u,222 19 | ron,ra,199 20 | ion,ia,189 21 | n,ns,180 22 | s,ss,179 23 | os,oi,166 24 | erson,eople,165 25 | z,zes,164 26 | t,t,134 27 | k,k,119 28 | h,h,110 29 | x,xes,107 30 | g,g,98 31 | r,r,97 32 | eus,ei,91 33 | s,ses,90 34 | 
to,ti,85 35 | e,es,84 36 | cus,ci,83 37 | l,l,82 38 | s,sses,81 39 | tis,tes,74 40 | ius,ii,72 41 | ife,ives,70 42 | tus,ti,70 43 | no,ni,67 44 | rus,ri,65 45 | eful,esful,64 46 | d,d,63 47 | u,us,59 48 | tful,tsful,58 49 | m,m,57 50 | sus,si,53 51 | gus,gi,52 52 | k,ki,52 53 | nful,nsful,52 54 | r,res,50 55 | y,y,49 56 | rful,rsful,46 57 | z,z,46 58 | lo,li,45 59 | i,ies,43 60 | hus,hi,42 61 | lon,la,42 62 | c,c,41 63 | men,mina,41 64 | nus,ni,40 65 | ix,ices,39 66 | n,nim,39 67 | n,nes,38 68 | non,na,38 69 | ton,ta,38 70 | ka,ki,37 71 | an,a,36 72 | e,en,36 73 | x,x,36 74 | mus,mi,35 75 | t,ts,35 76 | d,dren,34 77 | r,rim,32 78 | x,xs,30 79 | a,ai,29 80 | bus,bi,29 81 | h,him,29 82 | kful,ksful,29 83 | lis,les,29 84 | tex,tices,28 85 | lful,lsful,27 86 | p,p,27 87 | l,les,25 88 | ne,ni,25 89 | w,w,25 90 | r,ri,24 91 | z,zzes,24 92 | pus,pi,23 93 | j,jes,22 94 | pful,psful,22 95 | ax,aces,21 96 | aff,aves,20 97 | do,di,20 98 | o,ones,20 99 | ta,te,20 100 | fan,fen,19 101 | dful,dsful,17 102 | f,f,17 103 | io,i,17 104 | la,le,17 105 | le,lia,17 106 | dus,di,16 107 | gful,gsful,16 108 | ns,ntes,16 109 | b,b,15 110 | con,ca,15 111 | eon,ea,15 112 | n,ne,15 113 | na,ne,15 114 | r,rs,15 115 | d,des,14 116 | i,ia,14 117 | lex,lices,14 118 | lis,leis,14 119 | ra,re,14 120 | rah,rot,14 121 | t,ten,14 122 | d,dae,13 123 | n,ni,13 124 | xon,xa,13 125 | zo,zi,13 126 | d,dim,12 127 | h,hen,12 128 | i,im,12 129 | ia,ie,12 130 | so,si,12 131 | go,gines,11 132 | ka,ky,11 133 | mful,msful,11 134 | nus,nera,11 135 | ris,res,11 136 | ro,ri,11 137 | g,ges,10 138 | j,j,10 139 | k,kim,10 140 | l,lim,10 141 | na,ny,10 142 | rah,roth,10 143 | t,tim,10 144 | tes,tai,10 145 | wful,wsful,10 146 | co,chi,9 147 | h,hi,9 148 | pex,pices,9 149 | t,te,9 150 | t,tes,9 151 | u,ux,9 152 | zah,zot,9 153 | a,at,8 154 | dex,dices,8 155 | g,ge,8 156 | g,gim,8 157 | hful,hsful,8 158 | is,i,8 159 | le,lae,8 160 | le,li,8 161 | mo,mi,8 162 | mon,ma,8 163 | nah,not,8 164 | ney,nies,8 165 | q,q,8 166 | 
re,ri,8 167 | ta,ti,8 168 | vus,vi,8 169 | y,yes,8 170 | yx,yces,8 171 | za,ze,8 172 | a,az,7 173 | bful,bsful,7 174 | k,ky,7 175 | lah,lot,7 176 | r,ra,7 177 | te,ti,7 178 | vis,ves,7 179 | z,zy,7 180 | as,ata,6 181 | bis,bes,6 182 | do,dines,6 183 | ha,hot,6 184 | hah,hot,6 185 | hon,ha,6 186 | lah,loth,6 187 | m,ma,6 188 | ma,me,6 189 | n,ny,6 190 | ns,ntia,6 191 | pah,pot,6 192 | ra,rot,6 193 | s,sen,6 194 | ta,tes,6 195 | u,ua,6 196 | v,v,6 197 | z,zim,6 198 | za,zot,6 199 | a,aim,5 200 | ah,at,5 201 | as,ades,5 202 | as,ai,5 203 | ax,akes,5 204 | co,ci,5 205 | g,gen,5 206 | ga,gi,5 207 | h,hy,5 208 | ha,hos,5 209 | hful,hesful,5 210 | l,lach,5 211 | le,lai,5 212 | me,mai,5 213 | nis,nes,5 214 | o,oj,5 215 | ra,roth,5 216 | ryful,riesful,5 217 | s,sim,5 218 | se,si,5 219 | sful,sesful,5 220 | yful,ysful,5 221 | yx,yges,5 222 | zah,zoth,5 223 | as,ates,4 224 | bah,both,4 225 | ce,ci,4 226 | es,etes,4 227 | ex,eces,4 228 | ex,eges,4 229 | ge,gi,4 230 | gon,ga,4 231 | hah,hoth,4 232 | io,ia,4 233 | key,kies,4 234 | l,lech,4 235 | l,li,4 236 | l,lz,4 237 | la,los,4 238 | la,lot,4 239 | lum,li,4 240 | m,mata,4 241 | m,mim,4 242 | mes,mites,4 243 | mis,mes,4 244 | mur,mora,4 245 | n,ntes,4 246 | nah,noth,4 247 | oon,oa,4 248 | pa,pot,4 249 | pus,podes,4 250 | sa,se,4 251 | t,ti,4 252 | ux,uces,4 253 | vo,vi,4 254 | x,xen,4 255 | ys,yes,4 256 | z,zi,4 257 | za,zoth,4 258 | aad,aden,3 259 | al,aux,3 260 | bex,bices,3 261 | ceps,cipes,3 262 | d,da,3 263 | d,dau,3 264 | d,der,3 265 | d,dy,3 266 | dah,dot,3 267 | der,darim,3 268 | e,ees,3 269 | e,er,3 270 | eus,eis,3 271 | fah,fot,3 272 | fex,fices,3 273 | h,ha,3 274 | h,he,3 275 | h,hot,3 276 | h,hoth,3 277 | ha,hoth,3 278 | hok,hki,3 279 | ix,ikes,3 280 | k,ken,3 281 | kah,koth,3 282 | kon,ka,3 283 | l,lia,3 284 | l,ln,3 285 | la,li,3 286 | m,mi,3 287 | me,ma,3 288 | n,nob,3 289 | n,not,3 290 | n,noth,3 291 | n,nz,3 292 | na,ni,3 293 | ne,na,3 294 | nets,ntsy,3 295 | o,og,3 296 | pah,poth,3 297 | pe,pae,3 298 | po,pi,3 
299 | ps,pes,3 300 | pus,pora,3 301 | put,pita,3 302 | q,qim,3 303 | r,re,3 304 | r,ren,3 305 | ra,ry,3 306 | re,ria,3 307 | rex,rices,3 308 | s,se,3 309 | sa,sy,3 310 | sey,sies,3 311 | t,ter,3 312 | t,ty,3 313 | u,ues,3 314 | u,uit,3 315 | uk,uit,3 316 | v,vim,3 317 | va,vot,3 318 | vah,voth,3 319 | vey,vies,3 320 | ya,yot,3 321 | yfe,yves,3 322 | yon,ya,3 323 | z,ze,3 324 | aful,asful,2 325 | ail,aux,2 326 | ak,at,2 327 | al,ais,2 328 | aq,at,2 329 | as,ae,2 330 | az,aces,2 331 | b,bz,2 332 | bah,bot,2 333 | ban,ben,2 334 | bos,batim,2 335 | ce,cies,2 336 | ceps,cipites,2 337 | cus,cera,2 338 | d,dden,2 339 | d,de,2 340 | d,deen,2 341 | d,dz,2 342 | da,des,2 343 | da,dy,2 344 | di,deen,2 345 | e,eg,2 346 | e,eis,2 347 | e,ek,2 348 | e,ez,2 349 | es,edes,2 350 | ez,eces,2 351 | fer,frim,2 352 | fo,fi,2 353 | fus,fi,2 354 | g,ga,2 355 | g,gi,2 356 | ga,ge,2 357 | ged,gdim,2 358 | gey,gies,2 359 | gia,ge,2 360 | h,hie,2 361 | h,hz,2 362 | han,honim,2 363 | he,hae,2 364 | heder,hadarim,2 365 | hen,hanim,2 366 | het,htim,2 367 | hex,hices,2 368 | hos,he,2 369 | i,iz,2 370 | ia,i,2 371 | iah,iot,2 372 | io,ii,2 373 | iq,iit,2 374 | ism,i,2 375 | itis,ides,2 376 | ius,ia,2 377 | ix,iches,2 378 | j,jj,2 379 | k,ke,2 380 | k,ker,2 381 | l,lion,2 382 | l,llar,2 383 | la,loth,2 384 | lah,los,2 385 | ley,lies,2 386 | lfe,lves,2 387 | lli,ltin,2 388 | lum,lae,2 389 | lx,lces,2 390 | m,mat,2 391 | m,my,2 392 | mah,mot,2 393 | mah,moth,2 394 | manchild,menchildren,2 395 | manservant,menservants,2 396 | me,mata,2 397 | mel,mlech,2 398 | men,mines,2 399 | n,nen,2 400 | n,ner,2 401 | ne,nae,2 402 | ne,nai,2 403 | nets,ntsi,2 404 | nny,nce,2 405 | nos,ne,2 406 | nx,nces,2 407 | oe,oae,2 408 | oe,oai,2 409 | on,oi,2 410 | oot,oten,2 411 | ous,odes,2 412 | ous,oes,2 413 | ox,oces,2 414 | p,pen,2 415 | p,py,2 416 | pa,pe,2 417 | pe,pai,2 418 | q,qq,2 419 | q,qun,2 420 | r,rot,2 421 | r,roth,2 422 | rah,ros,2 423 | rey,ries,2 424 | rf,rven,2 425 | ros,rsim,2 426 | rus,rii,2 427 | 
s,sa,2 428 | sa,si,2 429 | sar,sour,2 430 | sia,ses,2 431 | sis,seis,2 432 | t,ta,2 433 | t,tn,2 434 | t,tot,2 435 | ta,tot,2 436 | ta,ty,2 437 | te,tae,2 438 | terfamilias,tresfamilias,2 439 | tes,tae,2 440 | tus,ta,2 441 | tus,tera,2 442 | tyful,tiesful,2 443 | u,uen,2 444 | uis,ues,2 445 | uq,ut,2 446 | us,udes,2 447 | us,ura,2 448 | uus,ui,2 449 | v,ve,2 450 | v,vin,2 451 | veh,voth,2 452 | vets,vtsy,2 453 | vo,va,2 454 | w,wen,2 455 | wa,ween,2 456 | xful,xesful,2 457 | xus,xi,2 458 | ya,yoth,2 459 | yah,yot,2 460 | ys,ydes,2 461 | zis,zes,2 462 | a,aa,1 463 | a,ades,1 464 | a,aes,1 465 | a,ak,1 466 | a,ala,1 467 | a,alar,1 468 | a,am,1 469 | a,ar,1 470 | a,athes,1 471 | aal,alen,1 472 | aan,ani,1 473 | aan,ans,1 474 | achua,wochua,1 475 | ae,ai,1 476 | afful,avesful,1 477 | afidi,awafid,1 478 | ah,ae,1 479 | ais,aeson,1 480 | alim,ulama,1 481 | alim,ulema,1 482 | alma,awalim,1 483 | almah,awalim,1 484 | alme,awalim,1 485 | almeh,awalim,1 486 | amazigh,imazighen,1 487 | amenukal,imenukalen,1 488 | amus,awamis,1 489 | an,asa,1 490 | ann,awan,1 491 | arijite,awarij,1 492 | ariya,awari,1 493 | as,aces,1 494 | as,antes,1 495 | asibi,awasib,1 496 | atl,a,1 497 | atur,antur,1 498 | azeeyeh,awazee,1 499 | b,be,1 500 | b,bim,1 501 | b,bims,1 502 | b,bin,1 503 | b,bob,1 504 | ba,be,1 505 | bel,blach,1 506 | ber,bri,1 507 | bi,ba,1 508 | bi,bah,1 509 | bi,banim,1 510 | bi,banin,1 511 | bi,bonim,1 512 | bi,bonin,1 513 | bit,btin,1 514 | bkhara,bakhir,1 515 | bok,bki,1 516 | boste,batim,1 517 | bs,bes,1 518 | bwthyn,bythynnod,1 519 | c,cada,1 520 | c,ces,1 521 | c,ci,1 522 | ca,che,1 523 | cah,cos,1 524 | cah,cot,1 525 | cah,coth,1 526 | ce,cs,1 527 | cek,cky,1 528 | cen,cines,1 529 | cful,csful,1 530 | cow,cattle,1 531 | cow,kine,1 532 | cwmwd,cymydau,1 533 | d,dai,1 534 | d,den,1 535 | d,di,1 536 | d,dia,1 537 | d,din,1 538 | d,dn,1 539 | d,dun,1 540 | da,de,1 541 | da,din,1 542 | da,dot,1 543 | dah,dos,1 544 | dah,doth,1 545 | dars,duroos,1 546 | dars,durus,1 547 | 
dd,dood,1 548 | dd,dud,1 549 | dda,dren,1 550 | de,dia,1 551 | de,dren,1 552 | del,dlach,1 553 | del,dloch,1 554 | der,dren,1 555 | di,d,1 556 | di,da,1 557 | diptotes,nouns,1 558 | do,dis,1 559 | dongolawi,danagla,1 560 | drasah,daris,1 561 | dy,dsies,1 562 | dyful,diesful,1 563 | e,ebi,1 564 | e,edes,1 565 | e,em,1 566 | e,eob,1 567 | e,ese,1 568 | e,eta,1 569 | ea,ee,1 570 | ea,es,1 571 | eal,eis,1 572 | ebeast,e,1 573 | ee,es,1 574 | eed,ede,1 575 | eest,esunt,1 576 | ef,evim,1 577 | eh,eot,1 578 | eis,easa,1 579 | ekwele,bipkwele,1 580 | el,eaux,1 581 | el,eis,1 582 | em,ens,1 583 | eo,ei,1 584 | eo,emus,1 585 | erg,areg,1 586 | es,e,1 587 | es,eis,1 588 | es,eisim,1 589 | es,eysim,1 590 | etl,emeh,1 591 | eu,ei,1 592 | eus,ea,1 593 | ew,eois,1 594 | eyrir,aurar,1 595 | f,fe,1 596 | f,fi,1 597 | f,fim,1 598 | fals,fulus,1 599 | fanne,fenne,1 600 | far,froth,1 601 | fer,f,1 602 | fer,farim,1 603 | fet,ftim,1 604 | fid,feda,1 605 | fils,fulus,1 606 | foote,feete,1 607 | footful,feetful,1 608 | footpaw,feetpaws,1 609 | fsir,fasir,1 610 | fut,feet,1 611 | g,gar,1 612 | g,ghang,1 613 | g,gin,1 614 | g,gn,1 615 | ga,ghe,1 616 | gadol,gedolim,1 617 | gaon,geonim,1 618 | gas,ges,1 619 | ge,goi,1 620 | gepik,qepik,1 621 | ger,gri,1 622 | get,gitten,1 623 | get,gittim,1 624 | ghanedd,ganeddion,1 625 | go,ghi,1 626 | gr,gar,1 627 | gus,ges,1 628 | h,hae,1 629 | h,hat,1 630 | h,hean,1 631 | h,heen,1 632 | h,hin,1 633 | h,hiv,1 634 | h,hod,1 635 | h,hoi,1 636 | h,hons,1 637 | h,hos,1 638 | h,hun,1 639 | ha,hi,1 640 | hadd,hudood,1 641 | hadd,hudud,1 642 | hadeeth,ahadeeth,1 643 | hadith,ahadith,1 644 | hafiz,huffaz,1 645 | hah,hiot,1 646 | hah,hioth,1 647 | hah,hiyos,1 648 | hah,hiyot,1 649 | hah,hiyoth,1 650 | hah,hos,1 651 | haliach,hluchim,1 652 | haliah,helichim,1 653 | haliah,helihim,1 654 | haliah,hlichim,1 655 | haliah,hlihim,1 656 | haliah,hluchim,1 657 | haliah,hluhim,1 658 | ham,homim,1 659 | har,hroth,1 660 | harif,hurafa,1 661 | hat,hot,1 662 | haykh,huyookh,1 
663 | he,hai,1 664 | he,hi,1 665 | heder,hadorim,1 666 | hegetz,hkotzim,1 667 | heikh,huyookh,1 668 | hein,hanim,1 669 | hel,halim,1 670 | hel,hlach,1 671 | helek,halakim,1 672 | hem,homim,1 673 | her,harim,1 674 | herero,ovaherero,1 675 | hesh,hashoth,1 676 | hetrum,hhertum,1 677 | hf,hvs,1 678 | hizb,ahzab,1 679 | ho,heaux,1 680 | hok,hky,1 681 | hok,hukim,1 682 | hokhol,hakhly,1 683 | hol,hly,1 684 | hoop,heep,1 685 | hos,hea,1 686 | hus,ha,1 687 | i,iim,1 688 | i,iit,1 689 | i,ik,1 690 | i,iu,1 691 | i,iyat,1 692 | ia,ies,1 693 | ia,ii,1 694 | ia,iot,1 695 | iach,ichim,1 696 | iah,ioth,1 697 | ian,i,1 698 | ic,i,1 699 | ic,ix,1 700 | ido,ibe,1 701 | ie,ice,1 702 | ie,ii,1 703 | ie,iya,1 704 | if,ivim,1 705 | ifeful,ivesful,1 706 | iful,isful,1 707 | ifuna,inagu,1 708 | igqirha,amagqirha,1 709 | ika,iok,1 710 | ile,irren,1 711 | imbizo,izimbizo,1 712 | imbongi,iimbongi,1 713 | imbongi,izimbongi,1 714 | in,i,1 715 | induna,izinduna,1 716 | inkhosi,amakhosi,1 717 | inkosi,amakhosi,1 718 | inyanga,izinyanga,1 719 | is,ii,1 720 | is,ires,1 721 | is,ithes,1 722 | is,itot,1 723 | is,ix,1 724 | is,izes,1 725 | isangoma,izangoma,1 726 | isibongo,izibongo,1 727 | isicoco,izicoco,1 728 | it,iot,1 729 | it,is,1 730 | ith,iyoth,1 731 | itongo,amatongo,1 732 | ium,ien,1 733 | ius,i,1 734 | ix,iges,1 735 | iya,i,1 736 | iya,ii,1 737 | iyot,iik,1 738 | ja,je,1 739 | jann,jinn,1 740 | jann,jinnah,1 741 | jann,jinnan,1 742 | jilbab,jalabib,1 743 | jinn,jawan,1 744 | jo,je,1 745 | jsa,jjes,1 746 | juz,ajza,1 747 | k,kie,1 748 | k,kil,1 749 | k,kke,1 750 | k,ku,1 751 | k,kun,1 752 | ka,kes,1 753 | ka,kor,1 754 | ka,koth,1 755 | kafir,kuffar,1 756 | kah,kos,1 757 | kah,kot,1 758 | ke,kae,1 759 | ke,kai,1 760 | kel,kalim,1 761 | ket,kot,1 762 | kgosana,dikgosana,1 763 | kgosi,dikgosi,1 764 | kgotla,dikgotla,1 765 | ki,kar,1 766 | kibanja,bibanja,1 767 | kko,kot,1 768 | klal,kalil,1 769 | ko,ka,1 770 | kos,kai,1 771 | ksaw,kasaw,1 772 | kungwi,makungwi,1 773 | 
kwerekwere,amakwerekwere,1 774 | kyful,kiesful,1 775 | l,lau,1 776 | l,leanna,1 777 | l,lekh,1 778 | l,len,1 779 | l,llen,1 780 | l,lob,1 781 | l,loch,1 782 | l,ly,1 783 | la,les,1 784 | la,lin,1 785 | la,lur,1 786 | lace,li,1 787 | laf,lefim,1 788 | lah,lin,1 789 | ldesac,lsdesac,1 790 | le,llun,1 791 | leach,lich,1 792 | leh,lim,1 793 | les,laisim,1 794 | lets,ltsy,1 795 | lex,licia,1 796 | lf,lvs,1 797 | lfful,lvesful,1 798 | lfull,lsfull,1 799 | lhomme,lshommes,1 800 | likuta,makuta,1 801 | lilangeni,emalangeni,1 802 | lin,la,1 803 | lipsis,leipses,1 804 | lis,leisim,1 805 | lis,lu,1 806 | lit,laisim,1 807 | lit,leysim,1 808 | lj,lljj,1 809 | lo,la,1 810 | lolwapa,malwapa,1 811 | lon,li,1 812 | los,le,1 813 | los,lea,1 814 | los,li,1 815 | loti,maloti,1 816 | lr,lir,1 817 | lus,lae,1 818 | lus,lera,1 819 | lus,loi,1 820 | luy,lluim,1 821 | ly,lia,1 822 | lyful,liesful,1 823 | m,mae,1 824 | m,mas,1 825 | m,me,1 826 | m,messrs,1 827 | m,mm,1 828 | m,mme,1 829 | m,mni,1 830 | m,mu,1 831 | m,mun,1 832 | m,mut,1 833 | ma,mek,1 834 | ma,my,1 835 | madam,mesdames,1 836 | madame,mesdames,1 837 | mademoiselle,mesdemoiselles,1 838 | manfriend,menfriends,1 839 | mann,menn,1 840 | mdah,mada,1 841 | me,mae,1 842 | me,mi,1 843 | med,mdim,1 844 | mer,morim,1 845 | mer,mrim,1 846 | mes,mae,1 847 | mes,mosim,1 848 | mex,mices,1 849 | mex,miges,1 850 | mey,mies,1 851 | mful,msfuls,1 852 | mfull,msfull,1 853 | mganga,waganga,1 854 | mintaqah,manatiq,1 855 | misr,amsar,1 856 | miss,mlles,1 857 | mmon,myn,1 858 | mokoro,mekoro,1 859 | moloi,baloi,1 860 | monseigneur,messeigneurs,1 861 | monsieur,messieurs,1 862 | monsr,messrs,1 863 | mophato,mephato,1 864 | morafe,merafe,1 865 | moran,ilmoran,1 866 | morena,marena,1 867 | mos,mi,1 868 | mosarwa,basarwa,1 869 | mosotho,basotho,1 870 | motshelo,metshelo,1 871 | motswana,batswana,1 872 | mr,messrs,1 873 | mrs,mesdames,1 874 | mrs,mmes,1 875 | msoa,mosi,1 876 | mun,men,1 877 | muramba,miramba,1 878 | mus,ma,1 879 | mus,mera,1 880 | 
mushaf,masahif,1 881 | muwashshah,tawashih,1 882 | muzungu,wazungu,1 883 | mwalimu,walimu,1 884 | myometritis,endometritides,1 885 | mzee,wazee,1 886 | mzungu,wazungu,1 887 | n,na,1 888 | n,nah,1 889 | n,nan,1 890 | n,nat,1 891 | n,neh,1 892 | n,ngun,1 893 | n,nie,1 894 | n,nion,1 895 | n,nn,1 896 | n,nna,1 897 | n,nov,1 898 | na,n,1 899 | na,nes,1 900 | na,nor,1 901 | na,not,1 902 | na,noth,1 903 | na,nur,1 904 | nak,nci,1 905 | naki,neke,1 906 | naphil,nephilim,1 907 | nard,nnards,1 908 | nasheed,anasheed,1 909 | nces,nses,1 910 | ndarme,nsdarmes,1 911 | ndi,noud,1 912 | ne,nia,1 913 | ne,noi,1 914 | ne,ntes,1 915 | ne,ntini,1 916 | nee,n,1 917 | nek,niky,1 918 | nets,ntsiv,1 919 | ngaka,dingaka,1 920 | ni,n,1 921 | nie,nce,1 922 | nie,ns,1 923 | nis,neis,1 924 | nkishi,minkishi,1 925 | nkisi,minkisi,1 926 | nmot,nsmots,1 927 | nnie,nce,1 928 | nnie,ns,1 929 | nnill,nillion,1 930 | no,na,1 931 | no,ne,1 932 | no,nya,1 933 | nofabitch,nsabitches,1 934 | nofabitch,nsofbitches,1 935 | nok,nki,1 936 | novabitch,nsovbitches,1 937 | ns,ndes,1 938 | ntar,ndarka,1 939 | nti,npol,1 940 | nus,nae,1 941 | nus,neres,1 942 | nus,nes,1 943 | nuvabitch,nsuvbitches,1 944 | ny,nce,1 945 | ny,ns,1 946 | nzil,nazil,1 947 | o,oi,1 948 | o,on,1 949 | o,oos,1 950 | o,ot,1 951 | o,oth,1 952 | o,oz,1 953 | ocimibundu,ovimbundu,1 954 | of,ovim,1 955 | oh,os,1 956 | oh,ot,1 957 | okama,ocama,1 958 | om,ons,1 959 | omanhene,amanhene,1 960 | oos,oi,1 961 | os,odes,1 962 | os,ora,1 963 | os,ores,1 964 | os,otes,1 965 | othorax,ohoraces,1 966 | ous,oi,1 967 | ox,octes,1 968 | oz,oces,1 969 | p,pes,1 970 | p,pi,1 971 | p,ple,1 972 | p,ps,1 973 | p,pth,1 974 | pa,por,1 975 | pa,pur,1 976 | pas,pones,1 977 | pe,pi,1 978 | pelet,plot,1 979 | pelet,ploth,1 980 | peole,polae,1 981 | pes,pites,1 982 | pis,pes,1 983 | pnik,pari,1 984 | pnik,pota,1 985 | pon,pa,1 986 | poritz,pritzim,1 987 | posanto,pisanti,1 988 | psak,piskei,1 989 | pus,pera,1 990 | qel,qalim,1 991 | r,rar,1 992 | r,rau,1 993 | 
r,reen,1 994 | r,ria,1 995 | r,rja,1 996 | r,rjev,1 997 | r,rji,1 998 | r,rn,1 999 | r,rok,1 1000 | r,rov,1 1001 | r,rtsy,1 1002 | r,ru,1 1003 | r,run,1 1004 | r,rz,1 1005 | ra,ros,1 1006 | rah,rim,1 1007 | rak,rci,1 1008 | rashah,rshioth,1 1009 | rasul,rusul,1 1010 | ratal,irtal,1 1011 | rby,rsby,1 1012 | rdi,radim,1 1013 | re,rai,1 1014 | red,rdim,1 1015 | reh,rat,1 1016 | req,raqim,1 1017 | resh,rashim,1 1018 | resh,rshayim,1 1019 | ret,rot,1 1020 | rets,rtsy,1 1021 | ri,li,1 1022 | rin,re,1 1023 | ris,reis,1 1024 | rotl,artal,1 1025 | rs,r,1 1026 | ru,ri,1 1027 | rum,rae,1 1028 | rus,ra,1 1029 | rus,rina,1 1030 | rus,rsim,1 1031 | ry,rata,1 1032 | ry,res,1 1033 | ry,ria,1 1034 | s,sar,1 1035 | s,seanna,1 1036 | s,sir,1 1037 | s,sm,1 1038 | s,soi,1 1039 | s,ssa,1 1040 | s,sse,1 1041 | s,ste,1 1042 | s,sy,1 1043 | sach,schaot,1 1044 | safek,sfekot,1 1045 | sapanca,sepanche,1 1046 | saraph,seraphim,1 1047 | se,sia,1 1048 | se,sides,1 1049 | seach,sigh,1 1050 | sebsi,sbassa,1 1051 | seder,sidarim,1 1052 | seder,siddarim,1 1053 | sek,skim,1 1054 | sel,slach,1 1055 | sente,lisente,1 1056 | seq,sqim,1 1057 | ser,srim,1 1058 | ses,sides,1 1059 | sgid,sagid,1 1060 | sharif,ashraf,1 1061 | shisname,stheirnames,1 1062 | shuk,suit,1 1063 | sinego,ssinegoes,1 1064 | singhalese,cingalese,1 1065 | sjid,sajid,1 1066 | son,sa,1 1067 | srani,sara,1 1068 | st,ss,1 1069 | sus,sii,1 1070 | sya,si,1 1071 | t,tae,1 1072 | t,tai,1 1073 | t,tean,1 1074 | t,th,1 1075 | t,tir,1 1076 | t,toth,1 1077 | t,tovi,1 1078 | t,tt,1 1079 | t,ttys,1 1080 | ta,t,1 1081 | ta,toth,1 1082 | tachresis,techreses,1 1083 | tah,tim,1 1084 | tah,tot,1 1085 | tah,toth,1 1086 | tambala,matambala,1 1087 | tariqa,turuq,1 1088 | tax,tices,1 1089 | te,ta,1 1090 | te,tia,1 1091 | teg,tagim,1 1092 | ten,tines,1 1093 | ter,tres,1 1094 | tey,ties,1 1095 | tis,tai,1 1096 | titrix,trices,1 1097 | tmartial,tsmartial,1 1098 | to,ta,1 1099 | toof,teef,1 1100 | toothmark,teethmarks,1 1101 | triptotes,nouns,1 1102 | tum,ti,1 
1103 | tus,tii,1 1104 | tus,tora,1 1105 | ty,te,1 1106 | ty,tees,1 1107 | tys,ties,1 1108 | u,uens,1 1109 | u,ukuna,1 1110 | uah,uos,1 1111 | uah,uot,1 1112 | uah,uoth,1 1113 | uan,un,1 1114 | ues,uites,1 1115 | uhil,ujil,1 1116 | uitur,uuntur,1 1117 | umkhwetha,abakhwetha,1 1118 | umkhwetha,abakwetha,1 1119 | umlungu,abelungu,1 1120 | umuzungu,abazungu,1 1121 | umuzungu,bazungu,1 1122 | umzulu,amazulu,1 1123 | ura,uwar,1 1124 | urah,uwar,1 1125 | uwa,ua,1 1126 | v,va,1 1127 | v,vz,1 1128 | va,ve,1 1129 | va,voth,1 1130 | vah,vos,1 1131 | vah,vot,1 1132 | veh,vos,1 1133 | veh,vot,1 1134 | vnia,ven,1 1135 | vok,vki,1 1136 | w,wau,1 1137 | w,wes,1 1138 | w,wis,1 1139 | wakf,awkaf,1 1140 | walian,welsh,1 1141 | waqf,awqaf,1 1142 | wiec,wce,1 1143 | womin,wimin,1 1144 | wommin,wimmin,1 1145 | womyn,wymyn,1 1146 | woose,weese,1 1147 | wr,wyr,1 1148 | wwa,ween,1 1149 | x,xob,1 1150 | x,xxes,1 1151 | xis,xeis,1 1152 | y,ych,1 1153 | y,yem,1 1154 | y,yim,1 1155 | y,yin,1 1156 | y,yos,1 1157 | y,yren,1 1158 | ya,yos,1 1159 | yah,yos,1 1160 | yah,yoth,1 1161 | ye,yce,1 1162 | yen,yonem,1 1163 | yen,yonim,1 1164 | yification,yfications,1 1165 | yk,ycy,1 1166 | yka,yok,1 1167 | yoret,yurot,1 1168 | yote,ya,1 1169 | ytan,yatin,1 1170 | ywoman,yswomen,1 1171 | yya,yat,1 1172 | z,zen,1 1173 | z,zer,1 1174 | z,zok,1 1175 | z,zzim,1 1176 | zayit,zeytim,1 1177 | zek,zki,1 1178 | zek,zky,1 1179 | zimrah,zemirot,1 1180 | zimrah,zemiroth,1 1181 | zindiq,zandaqa,1 1182 | zwa,zawat,1 1183 | -------------------------------------------------------------------------------- /extractor/src/main.rs: -------------------------------------------------------------------------------- 1 | use csv::Writer; 2 | use english_core::*; 3 | use serde::Deserialize; 4 | use std::collections::{HashMap, HashSet}; 5 | use std::env; 6 | use std::error::Error; 7 | use std::fs::File; 8 | use std::io::{BufRead, BufReader, Write}; 9 | mod file_generation; 10 | use file_generation::*; 11 | mod helpers; 12 | use 
csv::{ReaderBuilder, WriterBuilder}; 13 | pub use helpers::*; 14 | 15 | fn main() -> Result<(), Box> { 16 | let args: Vec = env::args().collect(); 17 | 18 | if args.len() != 2 { 19 | eprintln!("Usage: cargo run --release rawwiki.jsonl"); 20 | std::process::exit(1); 21 | } 22 | 23 | let input_path = &args[1]; 24 | 25 | let filtered_json_path = "english_filtered.jsonl"; 26 | 27 | //filter_english_entries(input_path, filtered_json_path); 28 | 29 | //let input_path = "../../english.jsonl"; 30 | check_noun_plurals(filtered_json_path, "noun_plural_check.csv")?; 31 | check_verb_conjugations(filtered_json_path, "verbs_check.csv")?; 32 | check_adjective_forms(filtered_json_path, "adj_check.csv")?; 33 | 34 | extract_verb_conjugations_new(filtered_json_path, "verb_conjugations.csv")?; 35 | extract_irregular_nouns(filtered_json_path, "nouns_with_plurals.csv")?; 36 | 37 | extract_irregular_adjectives(filtered_json_path, "adjectives.csv")?; 38 | 39 | generate_nouns_phf("nouns_with_plurals.csv", "noun_phf.rs"); 40 | generate_adjectives_phf("adjectives.csv", "adj_phf.rs"); 41 | generate_verbs_phf("verb_conjugations.csv", "verb_phf.rs"); 42 | 43 | // analyze_and_write_suffix_rules("nouns_with_plurals.csv", "analyzed_endings.csv"); 44 | Ok(()) 45 | } 46 | 47 | /// Extracts irregular noun plurals and writes them to a CSV. 
48 | fn extract_irregular_nouns(input_path: &str, output_path: &str) -> Result<(), Box> { 49 | let mut forms_map: HashMap> = HashMap::new(); 50 | 51 | let (reader, mut writer) = base_setup(input_path, output_path); 52 | writer.write_record(&["word", "plural"])?; 53 | 54 | for line in reader.lines() { 55 | let line = line?; 56 | let entry: Entry = match serde_json::from_str(&line) { 57 | Ok(e) => e, 58 | Err(e) => { 59 | println!("{:#?}", e); 60 | continue; 61 | } 62 | }; 63 | 64 | if !entry_is_proper(&entry, "noun") { 65 | continue; 66 | } 67 | 68 | let infinitive = entry.word.to_lowercase(); 69 | 70 | if !forms_map.contains_key(&infinitive) { 71 | forms_map.insert(infinitive.clone(), HashSet::new()); 72 | } 73 | 74 | let mut plural_found = false; 75 | if let Some(forms) = entry.forms { 76 | for form in &forms { 77 | let tags = &form.tags; 78 | 79 | let entry_form = form.form.to_lowercase(); 80 | if entry_form == "dubious" { 81 | continue; 82 | } 83 | if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) { 84 | continue; 85 | } 86 | 87 | if tags.contains(&"plural".into()) { 88 | forms_map 89 | .get_mut(&infinitive) 90 | .unwrap() 91 | .insert(entry_form.clone()); 92 | } 93 | } 94 | } 95 | } 96 | 97 | for (inf, setik) in forms_map.iter_mut() { 98 | let predicted_plural = EnglishCore::pluralize_noun(&inf); 99 | if setik.is_empty() { 100 | continue; 101 | } 102 | let alr_cont = setik.remove(&predicted_plural); 103 | let mut index = match alr_cont { 104 | true => 2, 105 | false => 1, 106 | }; 107 | let mut sorted_vec: Vec = setik.clone().into_iter().collect(); 108 | sorted_vec.sort(); // uses Ord for sorting 109 | for thing in sorted_vec.iter() { 110 | let word_key = if index == 1 { 111 | inf.clone() 112 | } else { 113 | format!("{inf}{index}") 114 | }; 115 | let keyd_struct = [word_key.clone(), thing.clone()]; 116 | 117 | if index < 10 { 118 | writer.write_record(&keyd_struct)?; 119 | } 120 | index += 1; 121 | } 122 | } 123 | 124 | writer.flush()?; 125 | 
println!("Done! Output written to {}", output_path); 126 | Ok(()) 127 | } 128 | 129 | fn extract_irregular_adjectives(input_path: &str, output_path: &str) -> Result<(), Box> { 130 | let mut forms_map: HashMap> = HashMap::new(); 131 | let (reader, mut writer) = base_setup(input_path, output_path); 132 | writer.write_record(&["positive", "comparative", "superlative"])?; 133 | 134 | for line in reader.lines() { 135 | let line = line?; 136 | let entry: Entry = match serde_json::from_str(&line) { 137 | Ok(e) => e, 138 | Err(e) => { 139 | println!("{:#?}", e); 140 | continue; 141 | } 142 | }; 143 | if !entry_is_proper(&entry, "adj") { 144 | continue; 145 | } 146 | 147 | let infinitive = entry.word.to_lowercase(); 148 | if !forms_map.contains_key(&infinitive) { 149 | forms_map.insert(infinitive.clone(), HashSet::new()); 150 | } 151 | let mut adjik = AdjParts::default(); 152 | adjik.positive = infinitive.clone(); 153 | 154 | if let Some(forms) = entry.forms { 155 | for form in &forms { 156 | let tags = &form.tags; 157 | let entry_form = form.form.to_lowercase(); 158 | if entry_form == "dubious" { 159 | continue; 160 | } 161 | if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) { 162 | continue; 163 | } 164 | 165 | if tags.contains(&"comparative".into()) && adjik.comparative == "" { 166 | adjik.comparative = entry_form.clone(); 167 | } 168 | 169 | if tags.contains(&"superlative".into()) && adjik.superlative == "" { 170 | adjik.superlative = entry_form.clone(); 171 | } 172 | } 173 | } 174 | 175 | let predicted_comparative = EnglishCore::comparative(&infinitive); 176 | let predicted_superlative = EnglishCore::superlative(&infinitive); 177 | if adjik.comparative == "" { 178 | adjik.comparative = predicted_comparative.clone(); 179 | } 180 | if adjik.superlative == "" { 181 | adjik.superlative = predicted_superlative.clone(); 182 | } 183 | 184 | forms_map 185 | .get_mut(&infinitive) 186 | .unwrap() 187 | .insert(adjik.clone()); 188 | } 189 | for (inf, setik) in 
forms_map.iter_mut() { 190 | let predicted_comparative = EnglishCore::comparative(&inf); 191 | let predicted_superlative = EnglishCore::superlative(&inf); 192 | 193 | let mut predicted_adj = AdjParts::default(); 194 | predicted_adj.positive = inf.clone(); 195 | predicted_adj.comparative = predicted_comparative.clone(); 196 | predicted_adj.superlative = predicted_superlative.clone(); 197 | if setik.is_empty() { 198 | continue; 199 | } 200 | 201 | let mut index = match setik.remove(&predicted_adj) { 202 | true => 2, 203 | false => 1, 204 | }; 205 | let mut sorted_vec: Vec = setik.clone().into_iter().collect(); 206 | sorted_vec.sort(); // uses Ord for sorting 207 | for thing in sorted_vec.iter() { 208 | let word_key = if index == 1 { 209 | inf.clone() 210 | } else { 211 | format!("{inf}{index}") 212 | }; 213 | //positive,comparative,superlative 214 | let keyd_struct = [ 215 | word_key.clone(), 216 | thing.comparative.clone(), 217 | thing.superlative.clone(), 218 | ]; 219 | index += 1; 220 | writer.write_record(&keyd_struct)?; 221 | } 222 | } 223 | writer.flush()?; 224 | println!("Done! Output written to {}", output_path); 225 | Ok(()) 226 | } 227 | 228 | /// Extracts verb conjugations and writes them to a CSV. 
229 | fn extract_verb_conjugations_new( 230 | input_path: &str, 231 | output_path: &str, 232 | ) -> Result<(), Box> { 233 | let mut forms_map: HashMap> = HashMap::new(); 234 | let (reader, mut writer) = base_setup(input_path, output_path); 235 | writer.write_record(&[ 236 | "infinitive", 237 | "third_person_singular", 238 | "past", 239 | "present_participle", 240 | "past_participle", 241 | ])?; 242 | 243 | for line in reader.lines() { 244 | let line = line?; 245 | let entry: Entry = match serde_json::from_str(&line) { 246 | Ok(e) => e, 247 | Err(e) => { 248 | println!("{:#?}", e); 249 | continue; 250 | } 251 | }; 252 | if !entry_is_proper(&entry, "verb") { 253 | continue; 254 | } 255 | 256 | let mut has_third = false; 257 | let infinitive = entry.word.to_lowercase(); 258 | if !forms_map.contains_key(&infinitive) { 259 | forms_map.insert(infinitive.clone(), HashSet::new()); 260 | } 261 | let mut verbik = VerbParts::default(); 262 | verbik.inf = infinitive.clone(); 263 | 264 | if verbik.inf == "be" { 265 | continue; 266 | } 267 | 268 | if let Some(forms) = entry.forms { 269 | for form in &forms { 270 | let tags = &form.tags; 271 | let entry_form = form.form.to_lowercase(); 272 | if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) { 273 | continue; 274 | } 275 | 276 | if tags.contains(&"third-person".into()) 277 | && tags.contains(&"singular".into()) 278 | && tags.contains(&"present".into()) 279 | && !has_third 280 | { 281 | has_third = true; 282 | verbik.third = entry_form.clone(); 283 | } 284 | 285 | if tags.contains(&"past".into()) 286 | && !tags.contains(&"participle".into()) 287 | && verbik.past == "" 288 | { 289 | verbik.past = entry_form.clone(); 290 | } 291 | 292 | if tags.contains(&"participle".into()) 293 | && tags.contains(&"present".into()) 294 | && verbik.present_part == "" 295 | { 296 | verbik.present_part = entry_form.clone(); 297 | } 298 | 299 | if tags.contains(&"participle".into()) 300 | && tags.contains(&"past".into()) 301 | && 
verbik.past_part == "" 302 | { 303 | verbik.past_part = entry_form.clone(); 304 | } 305 | } 306 | } 307 | 308 | let predicted_past = EnglishCore::verb( 309 | &infinitive, 310 | &Person::Third, 311 | &Number::Singular, 312 | &Tense::Past, 313 | &Form::Finite, 314 | ); 315 | let predicted_participle = EnglishCore::verb( 316 | &infinitive, 317 | &Person::Third, 318 | &Number::Singular, 319 | &Tense::Present, 320 | &Form::Participle, 321 | ); 322 | 323 | if verbik.past == "" { 324 | verbik.past = predicted_past.clone(); 325 | } 326 | if verbik.past_part == "" { 327 | verbik.past_part = verbik.past.clone(); 328 | } 329 | if verbik.present_part == "" { 330 | verbik.present_part = predicted_participle.clone(); 331 | } 332 | 333 | if has_third { 334 | forms_map 335 | .get_mut(&infinitive) 336 | .unwrap() 337 | .insert(verbik.clone()); 338 | } 339 | } 340 | for (inf, setik) in forms_map.iter_mut() { 341 | let predicted_third = EnglishCore::verb( 342 | &inf, 343 | &Person::Third, 344 | &Number::Singular, 345 | &Tense::Present, 346 | &Form::Finite, 347 | ); 348 | let predicted_past = EnglishCore::verb( 349 | &inf, 350 | &Person::Third, 351 | &Number::Singular, 352 | &Tense::Past, 353 | &Form::Finite, 354 | ); 355 | let predicted_participle = EnglishCore::verb( 356 | &inf, 357 | &Person::Third, 358 | &Number::Singular, 359 | &Tense::Present, 360 | &Form::Participle, 361 | ); 362 | 363 | let mut predicted_verb = VerbParts::default(); 364 | predicted_verb.inf = inf.clone(); 365 | predicted_verb.third = predicted_third.clone(); 366 | predicted_verb.past = predicted_past.clone(); 367 | predicted_verb.past_part = predicted_past.clone(); 368 | predicted_verb.present_part = predicted_participle.clone(); 369 | if setik.is_empty() { 370 | continue; 371 | } 372 | 373 | let mut index = match setik.remove(&predicted_verb) { 374 | true => 2, 375 | false => 1, 376 | }; 377 | let mut sorted_vec: Vec = setik.clone().into_iter().collect(); 378 | sorted_vec.sort(); // uses Ord for sorting 379 | 
for thing in sorted_vec.iter() {
    // NOTE(review): everything down to the first `Ok(())` below is the tail of a function
    // whose beginning lies before this chunk. It is reproduced unchanged (layout only).
    let word_key = if index == 1 {
        inf.clone()
    } else {
        format!("{inf}{index}")
    };
    // Column order: infinitive, third_person_singular, past, present_participle, past_participle
    let keyd_struct = [
        word_key.clone(),
        thing.third.clone(),
        thing.past.clone(),
        thing.present_part.clone(),
        thing.past_part.clone(),
    ];
    index += 1;
    writer.write_record(&keyd_struct)?;
}
}

writer.flush()?;
println!("Done! Output written to {}", output_path);
Ok(())
}

/// Builds the lookup keys tried by the checkers below: `base` itself followed by
/// `base2` through `base9`.
///
/// Words with several inflection patterns are keyed with a numeric suffix, so a
/// Wiktionary form counts as matched if any of these keys produces it.
fn numbered_variants(base: &str) -> Vec<String> {
    std::iter::once(base.to_owned())
        .chain((2..=9).map(|i| format!("{base}{i}")))
        .collect()
}

/// Checks `English::noun` plurals against the plural forms listed in a Wiktionary
/// JSONL dump and writes every mismatch to a CSV file.
///
/// Each line of `input_path` is parsed as an `Entry`. Lines that fail to parse, entries
/// rejected by `entry_is_proper(.., "noun")`, and entries without any form tagged
/// `plural` are skipped. Every Wiktionary plural is lowercased and compared with the
/// plural generated for the lowercased word and its numbered variants. A plural that no
/// variant reproduces is written as a `wiki_single,wiktionary_plural` row.
///
/// The reader and CSV writer come from `base_setup` (defined elsewhere in this file).
///
/// # Errors
///
/// Returns an error if reading a line or writing/flushing the CSV fails.
pub fn check_noun_plurals(
    input_path: &str,
    output_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    use english::*;
    let (reader, mut writer) = base_setup(input_path, output_path);
    writer.write_record(&["wiki_single", "wiktionary_plural"])?;

    let mut total_counter = 0usize;
    let mut match_counter = 0usize;

    for line in reader.lines() {
        let line = line?;
        // Lines that are not valid `Entry` JSON are skipped rather than treated as fatal.
        let Ok(entry) = serde_json::from_str::<Entry>(&line) else {
            continue;
        };

        // Only proper English nouns
        if !entry_is_proper(&entry, "noun") {
            continue;
        }
        let lowercased_entry = entry.word.to_lowercase();

        // Gather all plural forms from Wiktionary (already lowercased here).
        // NOTE: the bad-tag / proper-word filters applied to verbs are not applied to
        // nouns; they were commented out in the original code.
        let mut wiktionary_plurals = Vec::new();
        if let Some(forms) = entry.forms {
            for form in forms {
                if form.tags.contains(&"plural".into()) {
                    wiktionary_plurals.push(form.form.to_lowercase());
                }
            }
        }
        if wiktionary_plurals.is_empty() {
            continue;
        }

        // Try base word and numbered variants
        let variants = numbered_variants(&lowercased_entry);

        for wiki_plural in &wiktionary_plurals {
            total_counter += 1;
            let matched = variants
                .iter()
                .any(|variant| English::noun(variant, &Number::Plural) == *wiki_plural);
            if matched {
                match_counter += 1;
            } else {
                writer.write_record(&[lowercased_entry.as_str(), wiki_plural.as_str()])?;
            }
        }
    }

    writer.flush()?;
    println!("Done! Output written to {}", output_path);
    println!("total match amount: {} / {}", match_counter, total_counter);
    Ok(())
}

/// Checks `English::verb` conjugations against the verb forms listed in a Wiktionary
/// JSONL dump and writes every mismatch to a CSV file.
///
/// Only third-person singular forms are checked. Forms tagged `first-person`,
/// `second-person`, `plural` or `infinitive` are skipped, as are forms whose text is
/// `"dubious"`, that carry a bad tag (`contains_bad_tag`), or that fail `word_is_proper`.
/// A form tagged `past` (and not `present`) is treated as past tense, anything else as
/// present. A form tagged `participle` is a participle, anything else is finite.
///
/// Each mismatch is written as a `wiktionary_form,person,number,tense,form` row, with
/// the grammatical values in their `Debug` rendering.
///
/// # Errors
///
/// Returns an error if reading a line or writing/flushing the CSV fails.
pub fn check_verb_conjugations(
    input_path: &str,
    output_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    use english::*;
    let (reader, mut writer) = base_setup(input_path, output_path);

    writer.write_record(&["wiktionary_form", "person", "number", "tense", "form"])?;

    let mut total_counter = 0usize;
    let mut match_counter = 0usize;

    for line in reader.lines() {
        let line = line?;
        let Ok(entry) = serde_json::from_str::<Entry>(&line) else {
            continue;
        };

        // Only proper English verbs
        if !entry_is_proper(&entry, "verb") {
            continue;
        }

        let lowercased_entry = entry.word.to_lowercase();

        // Collect (form, person, number, tense, form_type) from Wiktionary
        let mut wiktionary_forms = Vec::new();
        if let Some(forms) = entry.forms {
            for form in forms {
                let tags: Vec<String> = form.tags.iter().map(|t| t.to_lowercase()).collect();
                // Tag probe that does not allocate a `String` per lookup.
                let has_tag = |tag: &str| tags.iter().any(|t| t == tag);
                let form_str = form.form.to_lowercase();

                // Skip bad data
                if form_str == "dubious"
                    || contains_bad_tag(form.tags.clone())
                    || !word_is_proper(&form.form)
                {
                    continue;
                }

                // Only check third person; first and second are always the infinitive.
                if has_tag("first-person") || has_tag("second-person") {
                    continue;
                }
                let person = Person::Third;

                // Only check singular; plural is always the same as singular except for
                // third singular present.
                if has_tag("plural") {
                    continue;
                }
                let number = Number::Singular;

                // "present" wins over "past"; untagged forms default to present.
                let tense = if !has_tag("present") && has_tag("past") {
                    Tense::Past
                } else {
                    Tense::Present
                };

                // "participle" is tested before "infinitive", so a form carrying both
                // tags is kept as a participle.
                let form_type = if has_tag("participle") {
                    Form::Participle
                } else if has_tag("infinitive") {
                    continue;
                } else {
                    Form::Finite
                };

                wiktionary_forms.push((form_str, person, number, tense, form_type));
            }
        }

        if wiktionary_forms.is_empty() {
            continue;
        }

        // Try base and numbered variants
        let variants = numbered_variants(&lowercased_entry);

        for (wiki_form, person, number, tense, form_type) in wiktionary_forms {
            total_counter += 1;

            let matched = variants.iter().any(|variant| {
                English::verb(variant, &person, &number, &tense, &form_type) == wiki_form
            });
            if matched {
                match_counter += 1;
                continue;
            }

            writer.write_record(&[
                wiki_form,
                format!("{:?}", person),
                format!("{:?}", number),
                format!("{:?}", tense),
                format!("{:?}", form_type),
            ])?;
        }
    }

    writer.flush()?;
    println!("Done! Output written to {}", output_path);
    println!("total match amount: {} / {}", match_counter, total_counter);
    Ok(())
}

/// Checks `English::adj` comparatives and superlatives against the forms listed in a
/// Wiktionary JSONL dump and writes every mismatch to a CSV file.
///
/// For each entry accepted by `entry_is_proper(.., "adj")`, the form tagged
/// `comparative` and the form tagged `superlative` are collected. If an entry lists
/// several forms with the same tag, only the last one is kept. Each collected form is
/// compared with the form generated for the lowercased word and its numbered variants.
/// A mismatch is written as a `wiktionary_form,degree` row.
///
/// # Errors
///
/// Returns an error if reading a line or writing/flushing the CSV fails.
pub fn check_adjective_forms(
    input_path: &str,
    output_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    use english::*;
    let (reader, mut writer) = base_setup(input_path, output_path);
    writer.write_record(&["wiktionary_form", "degree"])?;

    let mut total_counter = 0usize;
    let mut match_counter = 0usize;

    for line in reader.lines() {
        let line = line?;
        let Ok(entry) = serde_json::from_str::<Entry>(&line) else {
            continue;
        };

        // Only proper English adjectives
        if !entry_is_proper(&entry, "adj") {
            continue;
        }

        let lowercased_entry = entry.word.to_lowercase();

        // Gather the adjective forms from Wiktionary (lowercased).
        let mut wiki_comparative: Option<String> = None;
        let mut wiki_superlative: Option<String> = None;

        if let Some(forms) = entry.forms {
            for form in forms {
                let form_str = form.form.to_lowercase();
                let tags_lower: Vec<String> =
                    form.tags.iter().map(|t| t.to_lowercase()).collect();

                // A form tagged with both degrees counts as a comparative only.
                if tags_lower.iter().any(|t| t == "comparative") {
                    wiki_comparative = Some(form_str);
                } else if tags_lower.iter().any(|t| t == "superlative") {
                    wiki_superlative = Some(form_str);
                }
            }
        }

        // If Wiktionary has no comparative or superlative, skip
        if wiki_comparative.is_none() && wiki_superlative.is_none() {
            continue;
        }

        // Try base and numbered variants
        let variants = numbered_variants(&lowercased_entry);

        // Comparative first, then superlative, so mismatch rows keep that order.
        let checks = [
            (wiki_comparative, Degree::Comparative, "Comparative"),
            (wiki_superlative, Degree::Superlative, "Superlative"),
        ];
        for (wiki_form, degree, label) in checks {
            let Some(wiki_form) = wiki_form else {
                continue;
            };
            total_counter += 1;

            let matched = variants
                .iter()
                .any(|variant| English::adj(variant, &degree) == wiki_form);
            if matched {
                match_counter += 1;
            } else {
                writer.write_record(&[wiki_form.as_str(), label])?;
            }
        }
    }

    writer.flush()?;
    println!("Done! Output written to {}", output_path);
    println!("total match amount: {} / {}", match_counter, total_counter);
    Ok(())
}

/// Copies the lines of a JSONL dump whose `lang_code` is `"en"` into a new file.
///
/// Lines that do not parse as an `Entry` are dropped. Kept lines are written back
/// verbatim (the original text, not a re-serialization), one per line.
///
/// # Errors
///
/// Returns an error if either file cannot be opened/created or if reading, writing or
/// flushing fails.
pub fn filter_english_entries(
    input_path: &str,
    output_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    let input = File::open(input_path)?;
    let reader = BufReader::new(input);

    // Buffer the output: writing to a bare `File` costs one system call per kept line.
    let mut output = std::io::BufWriter::new(File::create(output_path)?);

    for line in reader.lines() {
        let line = line?;
        let Ok(entry) = serde_json::from_str::<Entry>(&line) else {
            continue; // skip bad lines
        };

        // Keep only English words
        if entry.lang_code != "en" {
            continue;
        }

        // Write valid entry back as JSON
        writeln!(output, "{}", line)?;
    }

    // Flush explicitly: an error during the implicit flush on drop would be lost.
    output.flush()?;

    println!("Filtered dataset saved to {}", output_path);
    Ok(())
}

/// Strips every trailing ASCII digit from a word (e.g., "walrus2" -> "walrus").
///
/// A word with no trailing digits is returned unchanged; a word made only of digits
/// becomes the empty string.
pub fn strip_trailing_number(word: &str) -> &str {
    word.trim_end_matches(|c: char| c.is_ascii_digit())
}

/// Reads a CSV of singular/plural pairs and writes suffix-rule frequencies to a CSV.
///
/// The first column of each record is the singular, with any trailing digits removed
/// via [`strip_trailing_number`]. The second column is the plural. Each pair is reduced
/// to a key by `suffix_rule` (defined elsewhere; presumably the differing suffixes,
/// judging by the output column names) and the occurrences of each key are counted.
///
/// The output has the header `singular_suffix,plural_suffix,count` and is sorted by
/// count descending, then by singular suffix, then by plural suffix.
///
/// # Errors
///
/// Returns an error if a file cannot be opened/created, a record cannot be read, a
/// record has fewer than two columns, or writing/flushing fails.
pub fn analyze_and_write_suffix_rules(
    input_path: &str,
    output_path: &str,
) -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open(input_path)?;
    let mut rdr = ReaderBuilder::new().from_reader(BufReader::new(file));

    let mut freq: HashMap<(String, String), usize> = HashMap::new();

    for (row, result) in rdr.records().enumerate() {
        let record = result?;
        // Report a short row as an error instead of panicking on `unwrap()`.
        let (Some(singular_raw), Some(plural)) = (record.get(0), record.get(1)) else {
            return Err(format!(
                "record {} of {} has fewer than two columns",
                row + 1,
                input_path
            )
            .into());
        };

        let singular = strip_trailing_number(singular_raw);

        let pair = suffix_rule(singular, plural);
        *freq.entry(pair).or_default() += 1;
    }

    // Sort by frequency descending, then singular suffix, then plural suffix.
    // Keys are unique, so the comparator never reports a tie.
    let mut freq_vec: Vec<_> = freq.into_iter().collect();
    freq_vec.sort_unstable_by(|a, b| {
        b.1.cmp(&a.1)
            .then_with(|| a.0.0.cmp(&b.0.0))
            .then_with(|| a.0.1.cmp(&b.0.1))
    });

    let mut wtr = WriterBuilder::new().from_path(output_path)?;
    wtr.write_record(&["singular_suffix", "plural_suffix", "count"])?;

    for ((sing_suf, plur_suf), count) in freq_vec {
        wtr.write_record(&[sing_suf, plur_suf, count.to_string()])?;
    }

    wtr.flush()?;
    Ok(())
}