├── .gitattributes
├── extractor
    ├── .DS_Store
    ├── Cargo.toml
    ├── src
    │   ├── helpers.rs
    │   ├── file_generation.rs
    │   ├── scratch.rs
    │   └── main.rs
    ├── adj_check.csv
    ├── noun_plural_check.csv
    ├── insane_noun.csv
    └── analyzed_endings.csv
├── english-core
    ├── .DS_Store
    ├── src
    │   ├── lib.rs
    │   ├── utils.rs
    │   ├── grammar.rs
    │   ├── noun.rs
    │   ├── verb.rs
    │   └── adj.rs
    └── Cargo.toml
├── src
    ├── snippets.rs
    ├── adj.rs
    ├── noun.rs
    ├── verb.rs
    └── lib.rs
├── .gitignore
├── Cargo.toml
├── LICENSE
├── examples
    ├── speedmark.rs
    ├── test.rs
    └── test2.rs
└── README.md


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/extractor/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gold-silver-copper/english/HEAD/extractor/.DS_Store


--------------------------------------------------------------------------------
/english-core/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gold-silver-copper/english/HEAD/english-core/.DS_Store


--------------------------------------------------------------------------------
/english-core/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod adj;
2 | pub mod grammar;
3 | mod noun;
4 | 
5 | mod utils;
6 | mod verb;
7 | pub use crate::grammar::*;
8 | /// Namespace type: the inflection routines of this crate are associated functions on it.
9 | pub struct EnglishCore {}
9 | 


--------------------------------------------------------------------------------
/extractor/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "extractor"
 3 | version = "0.1.0"
 4 | edition = "2024"
 5 | 
 6 | [dependencies]
 7 | csv = "1.3.1"
 8 | serde = { version = "1.0.219", features = ["derive"] }
 9 | serde_json = "1.0.140"
10 | english-core = { path = "../english-core" }
11 | # english is only used for benchmarking
12 | english = { path = ".." }
13 | 


--------------------------------------------------------------------------------
/src/snippets.rs:
--------------------------------------------------------------------------------
 1 | use crate::*;
 2 | 
 3 | impl English {
 4 |     /// Renders a `subject verb object.` sentence.
 5 |     ///
 6 |     /// The verb keeps its own person, tense and form but takes its grammatical
 7 |     /// number from `subject`. Note the parameter order: the object comes first,
 8 |     /// the subject second.
 9 |     pub fn simple_sentence(object: &Noun, subject: &Noun, verb: &Verb) -> String {
10 |         let number = &subject.number;
11 |         let conjugated = English::verb(&verb.word, &verb.person, number, &verb.tense, &verb.form);
12 | 
13 |         format!("{subject} {conjugated} {object}.")
14 |     }
15 | }
16 | 


--------------------------------------------------------------------------------
/english-core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "english-core"
 3 | version = "0.1.0"
 4 | authors = ["gold-silver-copper"]
 5 | edition = "2024"
 6 | include = ["LICENSE-APACHE", "LICENSE-MIT", "**/*.rs", "Cargo.toml"]
 7 | 
 8 | description = "English language inflector"
 9 | license = "MIT OR Apache-2.0"
10 | repository = "https://github.com/gold-silver-copper/english"
11 | 
12 | 
13 | [lib]
14 | crate-type = ["cdylib", "rlib"]
15 | 
16 | [profile.release]
17 | opt-level = 2 # fast and small wasm
18 | 
19 | # Optimize all dependencies even in debug builds:
20 | [profile.dev.package."*"]
21 | opt-level = 2
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Generated by Cargo
 2 | # will have compiled files and executables
 3 | debug/
 4 | target/
 5 | .DS_Store
 6 | 
 7 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 8 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 9 | Cargo.lock
10 | 
11 | # These are backup files generated by rustfmt
12 | **/*.rs.bk
13 | 
14 | # MSVC Windows builds of rustc generate these, which store debugging information
15 | *.pdb
16 | extractor/english_filtered.jsonl
17 | english-core/.DS_Store
18 | extractor/adjectives.csv
19 | extractor/nouns_with_plurals.csv
20 | extractor/verb_conjugations.csv
21 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "english"
 3 | version = "0.1.3"
 4 | authors = ["gold-silver-copper"]
 5 | edition = "2024"
 6 | include = ["LICENSE-APACHE", "LICENSE-MIT", "**/*.rs", "Cargo.toml"]
 7 | 
 8 | description = "English inflector decliner conjugator from wiktionary data"
 9 | license = "MIT OR Apache-2.0"
10 | repository = "https://github.com/gold-silver-copper/english"
11 | 
12 | 
13 | [lib]
14 | crate-type = ["cdylib", "rlib"]
15 | 
16 | 
17 | [profile.release]
18 | opt-level = 2 # fast and small wasm
19 | 
20 | # Optimize all dependencies even in debug builds:
21 | [profile.dev.package."*"]
22 | opt-level = 2
23 | 
24 | [dependencies]
25 | #english-core = { path = "english-core" }
26 | english-core = "0.1.0"
27 | phf = { version = "0.12", default-features = false, features = ["macros"] }
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 zombkit
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/english-core/src/utils.rs:
--------------------------------------------------------------------------------
 1 | use crate::EnglishCore;
 2 | use crate::grammar::*;
 3 | impl EnglishCore {
 4 |     /// Looks `word` up in a table of `(key, value)` pairs.
 5 |     ///
 6 |     /// Returns the value of the first pair whose key equals `word` exactly,
 7 |     /// or `None` when no key matches.
 8 |     pub fn pair_match(word: &str, listik: &[(&str, &str)]) -> Option<String> {
 9 |         listik
10 |             .iter()
11 |             .find(|(sing, _)| *sing == word)
12 |             .map(|(_, plur)| plur.to_string())
13 |     }
14 | 
15 |     /// Replaces the last occurrence of `pattern` in `input` with `replacement`.
16 |     ///
17 |     /// Everything before and after the match is kept. When `pattern` does not
18 |     /// occur, `input` is returned unchanged.
19 |     pub fn replace_last_occurence(input: &str, pattern: &str, replacement: &str) -> String {
20 |         match input.rfind(pattern) {
21 |             Some(start) => {
22 |                 // `rfind` yields a byte offset on a char boundary, so both slices are valid.
23 |                 let head = &input[..start];
24 |                 let tail = &input[start + pattern.len()..];
25 |                 format!("{head}{replacement}{tail}")
26 |             }
27 |             None => input.to_string(),
28 |         }
29 |     }
30 | 
31 |     /// Rewrites the ending of `word` with the first matching `(suffix, replacement)` pair.
32 |     ///
33 |     /// Pairs are tried in slice order, so more specific suffixes must come first.
34 |     /// Returns `None` when `word` ends with none of the suffixes.
35 |     pub fn iter_replace_last(word: &str, pairs: &[(&str, &str)]) -> Option<String> {
36 |         pairs.iter().find_map(|(suffix, replacement)| {
37 |             word.strip_suffix(*suffix)
38 |                 .map(|stem| format!("{stem}{replacement}"))
39 |         })
40 |     }
41 | 
42 |     /// Returns `true` when the first character of `word` is uppercase (`false` for "").
43 |     pub fn starts_with_uppercase(word: &str) -> bool {
44 |         word.chars().next().is_some_and(char::is_uppercase)
45 |     }
46 | }
35 | 


--------------------------------------------------------------------------------
/src/adj.rs:
--------------------------------------------------------------------------------
 1 | use crate::*;
 2 | 
 3 | /// The `Adj` struct groups the adjective helper functions.
 4 | #[derive(Debug, Clone, PartialEq, Eq)]
 5 | pub struct Adj {}
 6 | 
 7 | impl Adj {
 8 |     // ---------------------------
 9 |     // ADJECTIVE HELPERS
10 |     // ---------------------------
11 | 
12 |     /// Returns the comparative form of an adjective.
13 |     ///
14 |     /// # Examples
15 |     /// ```
16 |     /// assert_eq!(Adj::comparative("fast2"), "faster");
17 |     /// assert_eq!(Adj::comparative("fun"), "more fun");
18 |     /// ```
19 |     pub fn comparative(word: &str) -> String {
20 |         English::adj(word, &Degree::Comparative)
21 |     }
22 | 
23 |     /// Returns the superlative form of an adjective.
24 |     ///
25 |     /// # Examples
26 |     /// ```
27 |     /// assert_eq!(Adj::superlative("fast2"), "fastest");
28 |     /// assert_eq!(Adj::superlative("fun"), "most fun");
29 |     /// ```
30 |     pub fn superlative(word: &str) -> String {
31 |         English::adj(word, &Degree::Superlative)
32 |     }
33 | 
34 |     /// Returns the positive (base) form of an adjective.
35 |     ///
36 |     /// # Examples
37 |     /// ```
38 |     /// assert_eq!(Adj::positive("fast2"), "fast");
39 |     /// ```
40 |     pub fn positive(word: &str) -> String {
41 |         English::adj(word, &Degree::Positive)
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/english-core/src/grammar.rs:
--------------------------------------------------------------------------------
 1 | /// Grammatical number.
 2 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 3 | pub enum Number {
 4 |     Singular,
 5 |     Plural,
 6 | }
 7 | 
 8 | /// Grammatical case of a pronoun.
 9 | ///
10 | /// `Possessive` is the standalone form ("mine"); `PersonalPossesive` is the
11 | /// determiner form ("my"). The variant spelling is part of the public API.
12 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
13 | pub enum Case {
14 |     Nominative,
15 |     Accusative,
16 |     Reflexive,
17 |     Possessive,
18 |     PersonalPossesive,
19 | }
20 | 
21 | /// Verb tense.
22 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
23 | pub enum Tense {
24 |     Present,
25 |     Past,
26 |     // Future could be added too
27 | }
28 | 
29 | /// Verb form.
30 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
31 | pub enum Form {
32 |     Finite,
33 |     Participle,
34 |     Infinitive,
35 |     // Transgressive, Supine, etc., depending on language
36 | }
37 | 
38 | /// Grammatical person.
39 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
40 | pub enum Person {
41 |     First,
42 |     Second,
43 |     Third,
44 | }
45 | 
46 | /// Grammatical gender, used to pick third-person singular pronouns.
47 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
48 | pub enum Gender {
49 |     Masculine,
50 |     Feminine,
51 |     Neuter,
52 | }
53 | 
54 | /// Degree of comparison of an adjective.
55 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
56 | pub enum Degree {
57 |     Positive,
58 |     Comparative,
59 |     Superlative,
60 | }
47 | 
48 | /*#[derive(Debug, PartialEq, Clone)]
49 | pub enum Mood {
50 |     Indicative,
51 |     Subjunctive,
52 |     Imperative,
53 |     // Conditional, Interrogative, etc.
54 | } */
55 | /*#[derive(Debug, PartialEq, Clone)]
56 | pub enum Det {
57 |     Definite,
58 |     Indefinite,
59 | }
60 |  */
61 | /*#[derive(Debug, PartialEq, Clone)]
62 | pub enum Voice {
63 |     Active,
64 |     Passive,
65 |     // Middle, Reflexive, etc.
66 | }
67 | */
68 | 


--------------------------------------------------------------------------------
/english-core/src/noun.rs:
--------------------------------------------------------------------------------
 1 | use crate::EnglishCore;
 2 | use crate::grammar::*;
 3 | 
 4 | impl EnglishCore {
 5 |     /// Inflects `word` for `number`; the singular is returned unchanged.
 6 |     pub fn noun(word: &str, number: &Number) -> String {
 7 |         match number {
 8 |             Number::Plural => Self::pluralize_noun(word),
 9 |             Number::Singular => word.to_owned(),
10 |         }
11 |     }
12 | 
13 |     /// Appends the possessive marker: a bare apostrophe after a final `s`, `'s` otherwise.
14 |     pub fn add_possessive(word: &str) -> String {
15 |         // dogs -> dogs', children -> children's
16 |         let marker = if word.ends_with('s') { "'" } else { "'s" };
17 |         format!("{word}{marker}")
18 |     }
19 | 
20 |     /// Pluralizes `word` by rule: the first matching entry of `IRREGULAR_SUFFIXES`
21 |     /// rewrites the ending, otherwise a plain `s` is appended.
22 |     pub fn pluralize_noun(word: &str) -> String {
23 |         Self::iter_replace_last(word, IRREGULAR_SUFFIXES).unwrap_or_else(|| format!("{word}s"))
24 |     }
25 | }
26 | 
27 | //Most of the irregular plural suffixes as (singular ending, plural ending) pairs.
28 | //pluralize_noun tries them top to bottom and applies the first match, so longer endings
29 | //must stay above shorter ones. Not counted so far: wolves, potatoes, compound words.
30 | //Some pairs are commented out because they did not improve performance/size.
31 | const IRREGULAR_SUFFIXES: &[(&str, &str)] = &[
32 |     //  ("chassis", "chassis"),
33 |     //  ("sheep", "sheep"),
34 |     ("mouse", "mice"),
35 |     // ("louse", "lice"),
36 |     ("tooth", "teeth"),
37 |     ("goose", "geese"),
38 |     ("trix", "trices"),
39 |     ("fish", "fish"),
40 |     ("deer", "deer"),
41 |     // ("itis", "itis"),
42 |     ("foot", "feet"),
43 |     ("zoon", "zoa"),
44 |     ("ese", "ese"),
45 |     ("man", "men"),
46 |     //("pox", "pox"),
47 |     // ("ois", "ois"),
48 |     //  ("cis", "ces"),
49 |     ("sis", "ses"),
50 |     ("xis", "xes"),
51 |     //("eau", "eaux"),
52 |     // ("ieu", "ieux"),
53 |     // ("inx", "inges"),
54 |     // ("anx", "anges"),
55 |     //  ("ynx", "ynges"),
56 |     ("um", "a"),
57 |     ("ch", "ches"),
58 |     ("sh", "shes"),
59 |     ("ay", "ays"),
60 |     //  ("uy", "uys"),
61 |     ("oy", "oys"),
62 |     ("ey", "eys"),
63 |     ("x", "xes"),
64 |     //  ("a", "ae"),
65 |     ("s", "ses"),
66 |     ("y", "ies"),
67 |     ("f", "ves"),
68 | ];
67 | 


--------------------------------------------------------------------------------
/examples/speedmark.rs:
--------------------------------------------------------------------------------
 1 | use english::*;
 2 | 
 3 | // Prints one sample plural, then runs the verb, noun and adjective benchmarks.
 4 | fn main() {
 5 |     println!("{}", English::noun("thyridium", &Number::Plural));
 6 |     benchmark_verb();
 7 |     benchmark_noun();
 8 |     benchmark_adj();
 9 | }
 9 | use std::hint::black_box;
10 | use std::time::Instant;
11 | 
12 | /// Times `English::verb` for the third-person singular present finite form.
13 | pub fn benchmark_verb() {
14 |     // Only the first word is active; the commented entries are alternative inputs.
15 |     let words = [
16 |         "zzzzzzzzzzzzzzzzz",
17 |         //   "xxxxxxxxxxxx",
18 |         //  "yyyyyyyyyyyy",
19 |         //   "aaaaaaaaaaaaaaaaaaaaa",
20 |         //  "wwwwwwwwwww",
21 |         //   "lllllllllll",
22 |     ];
23 |     let person = Person::Third;
24 |     let number = Number::Singular;
25 |     let tense = Tense::Present;
26 |     let form = Form::Finite;
27 | 
28 |     run_benchmark("verb", &words, |w| {
29 |         English::verb(w, &person, &number, &tense, &form)
30 |     });
31 | }
30 | 
31 | /// Times `English::noun` for the plural form.
32 | pub fn benchmark_noun() {
33 |     // Tests the generally worst-case scenario: a long word that is outside the array.
34 |     let words = [
35 |         "zzzzzzzzzzzzzzzzz",
36 |         //   "xxxxxxxxxxxx",
37 |         //  "yyyyyyyyyyyy",
38 |         //   "aaaaaaaaaaaaaaaaaaaaa",
39 |         //  "wwwwwwwwwww",
40 |         //   "lllllllllll",
41 |     ];
42 | 
43 |     run_benchmark("noun", &words, |w| English::noun(w, &Number::Plural));
44 | }
44 | 
45 | /// Times `English::adj` for the comparative degree.
46 | pub fn benchmark_adj() {
47 |     // Only the first word is active; the commented entries are alternative inputs.
48 |     let words = [
49 |         "zzzzzzzzzzzzzzzzz",
50 |         //   "xxxxxxxxxxxx",
51 |         //  "yyyyyyyyyyyy",
52 |         //   "aaaaaaaaaaaaaaaaaaaaa",
53 |         //  "wwwwwwwwwww",
54 |         //   "lllllllllll",
55 |     ];
56 | 
57 |     run_benchmark("adjective", &words, |w| {
58 |         English::adj(w, &Degree::Comparative)
59 |     });
60 | }
59 | 
60 | /// Calls `f` one million times per entry of `words` and prints timing statistics.
61 | ///
62 | /// Each output line is prefixed with `[label]`: the last returned string, the
63 | /// elapsed time with the total call count, and throughput plus time per call.
64 | fn run_benchmark<F>(label: &str, words: &[&str], mut f: F)
65 | where
66 |     F: FnMut(&str) -> String,
67 | {
68 |     let iterations = 1_000_000;
69 |     let total_calls = iterations * words.len();
70 | 
71 |     let start = Instant::now();
72 |     let mut last_result = String::new();
73 | 
74 |     for _ in 0..iterations {
75 |         for word in words.iter().copied() {
76 |             // black_box on input and output keeps the optimizer from eliding the call
77 |             let produced = f(black_box(word));
78 |             last_result = black_box(produced);
79 |         }
80 |     }
81 | 
82 |     let duration = start.elapsed();
83 |     let nanos = duration.as_nanos() as f64;
84 |     let call_count = total_calls as f64;
85 |     let calls_per_sec = call_count / (nanos / 1e9);
86 |     let nanos_per_call = nanos / call_count;
87 | 
88 |     println!("[{label}] Last result: {last_result}");
89 |     println!(
90 |         "[{label}] Completed in {:?} → {} calls",
91 |         duration, total_calls
92 |     );
93 |     println!(
94 |         "[{label}] Throughput: {:.2} calls/sec | Time per call: {:.2} ns",
95 |         calls_per_sec, nanos_per_call
96 |     );
97 | }
93 | 


--------------------------------------------------------------------------------
/english-core/src/verb.rs:
--------------------------------------------------------------------------------
  1 | use crate::EnglishCore;
  2 | use crate::grammar::*;
  3 | impl EnglishCore {
  4 |     /// Conjugates `word` by rule for the given person, number, tense and form.
  5 |     ///
  6 |     /// "be" is delegated to [`EnglishCore::to_be`]. Any other verb is returned
  7 |     /// unchanged for the infinitive and for the present finite outside the third
  8 |     /// person singular. The remaining forms take the first matching suffix
  9 |     /// rewrite from the form's table, or else the regular ending ("s", "ing", "ed").
 10 |     pub fn verb(
 11 |         word: &str,
 12 |         person: &Person,
 13 |         number: &Number,
 14 |         tense: &Tense,
 15 |         form: &Form,
 16 |     ) -> String {
 17 |         if word == "be" {
 18 |             return Self::to_be(person, number, tense, form).to_string();
 19 |         }
 20 | 
 21 |         // Pick the suffix table and the regular ending; forms that never change return early.
 22 |         let (irregulars, ending) = match (person, number, tense, form) {
 23 |             (_, _, _, Form::Infinitive) => return word.to_string(),
 24 |             (Person::Third, Number::Singular, Tense::Present, Form::Finite) => {
 25 |                 (IRREGULAR_THIRD, "s")
 26 |             }
 27 |             (_, _, Tense::Present, Form::Finite) => return word.to_string(),
 28 |             (_, _, Tense::Present, Form::Participle) => (IRREGULAR_PRES_PART, "ing"),
 29 |             (_, _, Tense::Past, _) => (IRREGULAR_PAST, "ed"),
 30 |         };
 31 | 
 32 |         Self::iter_replace_last(word, irregulars).unwrap_or_else(|| format!("{word}{ending}"))
 33 |     }
 34 | 
 35 |     /// Returns the form of "to be" for the given person, number, tense and form.
 36 |     ///
 37 |     /// The infinitive is "be" in either tense; participles ignore person and number.
 38 |     pub fn to_be(person: &Person, number: &Number, tense: &Tense, form: &Form) -> &'static str {
 39 |         match (tense, form, number, person) {
 40 |             (_, Form::Infinitive, _, _) => "be",
 41 |             (Tense::Present, Form::Participle, _, _) => "being",
 42 |             (Tense::Past, Form::Participle, _, _) => "been",
 43 |             (Tense::Present, Form::Finite, Number::Singular, Person::First) => "am",
 44 |             (Tense::Present, Form::Finite, Number::Singular, Person::Third) => "is",
 45 |             // second person singular and every plural
 46 |             (Tense::Present, Form::Finite, _, _) => "are",
 47 |             (Tense::Past, Form::Finite, Number::Singular, Person::First | Person::Third) => "was",
 48 |             // second person singular and every plural
 49 |             (Tense::Past, Form::Finite, _, _) => "were",
 50 |         }
 51 |     }
 52 | }
 70 | 
 71 | // (ending, replacement) rewrites for the present participle; the first match wins,
 72 | // so "ng" stays above "g". Verbs matching no entry get a plain "ing".
 73 | static IRREGULAR_PRES_PART: &[(&str, &str)] = &[
 74 |     ("e", "ing"),
 75 |     ("p", "pping"),
 76 |     ("ng", "nging"),
 77 |     ("g", "gging"),
 78 |     //  ("b", "bbing"),
 79 |     //   ("d", "dding"),
 80 |     //  ("t", "tting"),
 81 | ];
 82 | 
 83 | // (ending, replacement) rewrites for the past tense and past participle; the first match
 84 | // wins, so "buy" stays above "y" and "ng" above "g". Verbs matching no entry get "ed".
 85 | static IRREGULAR_PAST: &[(&str, &str)] = &[
 86 |     ("fight", "fought"),
 87 |     ("buy", "bought"),
 88 |     ("e", "ed"),
 89 |     ("p", "pped"),
 90 |     ("y", "ied"),
 91 |     ("ng", "nged"),
 92 |     ("g", "gged"),
 93 |     // ("b", "bbed"),
 94 |     //("d", "dded"),
 95 |     //  ("t", "tted"),
 96 | ];
 97 | 
 98 | // (ending, replacement) rewrites for the third person singular present; the first match
 99 | // wins, so "buy" stays above "y". Verbs matching no entry get a plain "s".
100 | static IRREGULAR_THIRD: &[(&str, &str)] = &[
101 |     ("sh", "shes"),
102 |     ("ch", "ches"),
103 |     ("s", "ses"),
104 |     ("z", "zes"),
105 |     ("x", "xes"),
106 |     ("buy", "buys"),
107 |     ("y", "ies"),
108 | ];
103 | 


--------------------------------------------------------------------------------
/examples/test.rs:
--------------------------------------------------------------------------------
 1 | use english::*;
 2 | fn main() {
 3 |     // --- Mixed sentence example: a sentence assembled from separately inflected parts ---
 4 |     let subject_number = Number::Plural;
 5 |     let run = Verb::present_participle("run"); // running
 6 |     let child = Noun::from("child").with_specifier(run); // running child
 7 |     let subject = English::noun(child, &subject_number); // running children
 8 |     let verb = English::verb(
 9 |         "steal",
10 |         &Person::Third,
11 |         &subject_number,
12 |         &Tense::Past,
13 |         &Form::Finite,
14 |     ); // stole
15 |     let object = Noun::count_with_number("potato", 7); // 7 potatoes
16 | 
17 |     let sentence = format!("The {} {} {}.", subject, verb, object);
18 |     assert_eq!(sentence, "The running children stole 7 potatoes.");
19 | 
20 |     // --- Nouns ---
21 |     // Note that noun(), count(), etc. accept both strings and the Noun struct
22 |     let jeans = Noun::from("pair").with_complement("of jeans");
23 |     assert_eq!(Noun::count_with_number(jeans, 3), "3 pairs of jeans");
24 |     // Regular plurals
25 |     assert_eq!(English::noun("cat", &Number::Plural), "cats");
26 |     // Append a digit 2-9 to a word to select one of its alternative forms.
27 |     // Can use plural()
28 |     assert_eq!(Noun::plural("die2"), "dice");
29 |     // Use the count function for better ergonomics if needed
30 |     assert_eq!(Noun::count("man", 2), "men");
31 |     // Use the count_with_number function to keep the number in the output
32 |     assert_eq!(Noun::count_with_number("nickel", 3), "3 nickels");
33 |     // Invariant nouns
34 |     assert_eq!(English::noun("sheep", &Number::Plural), "sheep");
35 | 
36 |     // --- Verbs ---
37 |     // All verb functions accept either strings or the Verb struct
38 |     let pick_up = Verb::from("pick").with_particle("up");
39 |     // Helper functions: past(), third_person(), present_participle(), infinitive() etc.
40 |     assert_eq!(Verb::past(&pick_up,), "picked up");
41 |     assert_eq!(Verb::present_participle("walk"), "walking");
42 |     assert_eq!(Verb::past_participle("go"), "gone");
43 |     // Append a digit 2-9 to a word to select one of its alternative forms.
44 |     assert_eq!(Verb::past("lie"), "lay");
45 |     assert_eq!(Verb::past("lie2"), "lied");
46 |     // "to be" has the most verb forms in English and requires using verb()
47 |     assert_eq!(
48 |         English::verb(
49 |             "be",
50 |             &Person::First,
51 |             &Number::Singular,
52 |             &Tense::Present,
53 |             &Form::Finite
54 |         ),
55 |         "am"
56 |     );
57 | 
58 |     // --- Adjectives ---
59 |     // Append a digit 2-9 to a word to select an alternative form. ("bad" has the most, 3.)
60 |     assert_eq!(English::adj("bad", &Degree::Comparative), "more bad");
61 |     assert_eq!(English::adj("bad", &Degree::Superlative), "most bad");
62 |     assert_eq!(Adj::comparative("bad2"), "badder");
63 |     assert_eq!(Adj::superlative("bad2"), "baddest");
64 |     assert_eq!(Adj::comparative("bad3"), "worse");
65 |     assert_eq!(Adj::superlative("bad3"), "worst");
66 |     assert_eq!(Adj::positive("bad3"), "bad");
67 | 
68 |     // --- Pronouns ---
69 |     assert_eq!(
70 |         English::pronoun(
71 |             &Person::First,
72 |             &Number::Singular,
73 |             &Gender::Neuter,
74 |             &Case::PersonalPossesive
75 |         ),
76 |         "my"
77 |     );
78 |     assert_eq!(
79 |         English::pronoun(
80 |             &Person::First,
81 |             &Number::Singular,
82 |             &Gender::Neuter,
83 |             &Case::Possessive
84 |         ),
85 |         "mine"
86 |     );
87 | 
88 |     // --- Possessives ---
89 |     assert_eq!(English::add_possessive("dog"), "dog's");
90 |     assert_eq!(English::add_possessive("dogs"), "dogs'");
91 | }
92 | 


--------------------------------------------------------------------------------
/extractor/src/helpers.rs:
--------------------------------------------------------------------------------
  1 | use csv::Writer;
  2 | use english_core::*;
  3 | use serde::Deserialize;
  4 | use std::collections::{HashMap, HashSet};
  5 | use std::env;
  6 | use std::error::Error;
  7 | use std::fs::File;
  8 | use std::io::{BufRead, BufReader, Write};
  9 | 
 10 | /// Tags that `contains_bad_tag` reports as disqualifying.
 11 | pub static BAD_TAGS: &[&str] = &[
 12 |     "obsolete",
 13 |     "error-unknown-tag",
 14 |     "dialectal",
 15 |     "alternative",
 16 |     "nonstandard",
 17 |     "archaic",
 18 |     "humorous",
 19 |     "feminine",
 20 |     "pronunciation-spelling",
 21 |     "rare",
 22 |     "dated",
 23 |     "informal",
 24 |     "sometimes",
 25 |     "colloquial",
 26 | ];
 27 | /// Substrings that `contains_bad_chars` reports as disqualifying.
 28 | pub static BAD_CHARS: &[&str] = &[
 29 |     ".", "/", "&", " ", "'", "-", "#", "@", "`", "*", "%", "(", "!",
 30 | ];
 29 | 
 30 | /// Returns `true` when at least one of `words` is listed in `BAD_TAGS`.
 31 | pub fn contains_bad_tag(words: Vec<String>) -> bool {
 32 |     words.iter().any(|tag| BAD_TAGS.contains(&tag.as_str()))
 33 | }
 38 | 
 39 | /// Returns `true` if `input` contains a `BAD_CHARS` entry or any non-alphabetic character.
 40 | pub fn contains_bad_chars(input: &str) -> bool {
 41 |     let has_listed_char = BAD_CHARS.iter().any(|bad| input.contains(*bad));
 42 |     has_listed_char || input.chars().any(|c| !c.is_alphabetic())
 43 | }
 48 | 
 49 | /// Returns `true` if any character of `s` is numeric.
 50 | pub fn contains_number(s: &str) -> bool {
 51 |     s.chars().any(char::is_numeric)
 52 | }
 52 | 
 53 | /// One inflected form of an entry together with its tags.
 54 | #[derive(Debug, Deserialize)]
 55 | pub struct Forms {
 56 |     pub form: String,
 57 |     pub tags: Vec<String>,
 58 | }
 59 | 
 60 | /// One deserialized dictionary record.
 61 | // NOTE(review): presumably one line of the filtered JSONL input — confirm against the reader.
 62 | #[derive(Debug, Deserialize)]
 63 | pub struct Entry {
 64 |     pub word: String,
 65 |     // compared against the requested part of speech in `entry_is_proper`
 66 |     pub pos: String,
 67 |     pub forms: Option<Vec<Forms>>,
 68 |     // `entry_is_proper` only accepts "en"
 69 |     pub lang_code: String,
 70 | }
 71 | 
 72 | /// The three degree forms of one adjective.
 73 | #[derive(Debug, Default, Eq, Hash, PartialEq, Clone, Ord, PartialOrd)]
 74 | pub struct AdjParts {
 75 |     pub positive: String,
 76 |     pub comparative: String,
 77 |     pub superlative: String,
 78 | }
 79 | 
 80 | /// The principal forms of one verb.
 81 | #[derive(Debug, Default, Eq, Hash, PartialEq, Clone, Ord, PartialOrd)]
 82 | pub struct VerbParts {
 83 |     pub inf: String,
 84 |     pub third: String,
 85 |     pub past: String,
 86 |     pub present_part: String,
 87 |     pub past_part: String,
 88 | }
 82 | 
 83 | /// Returns `true` for an English (`lang_code == "en"`) entry whose part of speech
 84 | /// equals `pos` and whose word passes `word_is_proper`.
 85 | pub fn entry_is_proper(entry: &Entry, pos: &str) -> bool {
 86 |     entry.lang_code == "en" && entry.pos == pos && word_is_proper(&entry.word)
 87 | }
 93 | 
 94 | /// Returns `true` when `word` is pure ASCII and contains neither bad characters
 95 | /// (see `contains_bad_chars`) nor numeric characters.
 96 | pub fn word_is_proper(word: &str) -> bool {
 97 |     let rejected = contains_bad_chars(word) || !word.is_ascii() || contains_number(word);
 98 |     !rejected
 99 | }
100 | 
101 | /// Opens `input_path` for buffered reading and creates a CSV writer at `output_path`.
102 | ///
103 | /// # Panics
104 | /// Panics, naming the offending path, if the input file cannot be opened or
105 | /// the output file cannot be created.
106 | pub fn base_setup(input_path: &str, output_path: &str) -> (BufReader<File>, Writer<File>) {
107 |     let input = File::open(input_path)
108 |         .unwrap_or_else(|err| panic!("cannot open input file {input_path}: {err}"));
109 |     let writer = Writer::from_path(output_path)
110 |         .unwrap_or_else(|err| panic!("cannot create output file {output_path}: {err}"));
111 |     (BufReader::new(input), writer)
112 | }
107 | 
108 | /// Counts the leading characters (not bytes) that `a` and `b` have in common.
109 | pub fn common_prefix_len(a: &str, b: &str) -> usize {
110 |     let mut shared = 0;
111 |     for (left, right) in a.chars().zip(b.chars()) {
112 |         if left != right {
113 |             break;
114 |         }
115 |         shared += 1;
116 |     }
117 |     shared
118 | }
115 | 
116 | /// Given singular & plural, extracts their suffix transformation.
117 | ///
118 | /// Both returned suffixes start at the last character the two words share, so
119 | /// one character of context is kept; if they share no prefix, the whole words
120 | /// are returned. The offset is a byte offset taken from `char_indices`, so a
121 | /// multi-byte character in the shared prefix can neither be split nor skew it.
122 | pub fn suffix_rule(singular: &str, plural: &str) -> (String, String) {
123 |     // Byte offset of the last shared prefix character. It is the same in both
124 |     // strings because all bytes before it are equal. 0 when nothing is shared.
125 |     let start = singular
126 |         .char_indices()
127 |         .zip(plural.chars())
128 |         .take_while(|((_, s), p)| s == p)
129 |         .last()
130 |         .map_or(0, |((offset, _), _)| offset);
131 | 
132 |     (singular[start..].to_string(), plural[start..].to_string())
133 | }
127 | 


--------------------------------------------------------------------------------
/english-core/src/adj.rs:
--------------------------------------------------------------------------------
 1 | use crate::grammar::*;
 2 | use crate::EnglishCore;
 3 | 
 4 | impl EnglishCore {
 5 |     /// Builds the periphrastic form of an adjective for `degree`: the bare
 6 |     /// word, "more <word>" (comparative) or "most <word>" (superlative).
 7 |     pub fn adjective(word: &str, degree: &Degree) -> String {
 8 |         match degree {
 9 |             Degree::Positive => word.to_string(),
10 |             Degree::Comparative => Self::comparative(word),
11 |             Degree::Superlative => Self::superlative(word),
12 |         }
13 |     }
14 |     /// Returns "most <word>".
15 |     pub fn superlative(word: &str) -> String {
16 |         format!("most {}", word)
17 |     }
18 |     /// Returns "more <word>".
19 |     pub fn comparative(word: &str) -> String {
20 |         format!("more {}", word)
21 |     }
22 |     /// Returns the pronoun for the given person, number, gender and case.
23 |     ///
24 |     /// `gender` only matters in the third person singular; it is ignored elsewhere.
25 |     pub fn pronoun(person: &Person, number: &Number, gender: &Gender, case: &Case) -> &'static str {
26 |         match number {
27 |             Number::Singular => match person {
28 |                 Person::First => match case {
29 |                     Case::Nominative => "I",
30 |                     Case::Accusative => "me",
31 |                     Case::Reflexive => "myself",
32 |                     Case::Possessive => "mine",
33 |                     Case::PersonalPossesive => "my",
34 |                 },
35 |                 Person::Second => match case {
36 |                     Case::Nominative => "you",
37 |                     Case::Accusative => "you",
38 |                     Case::Reflexive => "yourself",
39 |                     Case::Possessive => "yours",
40 |                     Case::PersonalPossesive => "your",
41 |                 },
42 |                 Person::Third => match gender {
43 |                     Gender::Masculine => match case {
44 |                         Case::Nominative => "he",
45 |                         Case::Accusative => "him",
46 |                         Case::Reflexive => "himself",
47 |                         Case::Possessive => "his",
48 |                         Case::PersonalPossesive => "his",
49 |                     },
50 |                     Gender::Feminine => match case {
51 |                         Case::Nominative => "she",
52 |                         Case::Accusative => "her",
53 |                         Case::Reflexive => "herself",
54 |                         Case::Possessive => "hers",
55 |                         Case::PersonalPossesive => "her",
56 |                     },
57 |                     Gender::Neuter => match case {
58 |                         Case::Nominative => "it",
59 |                         Case::Accusative => "it",
60 |                         Case::Reflexive => "itself",
61 |                         Case::Possessive => "its",
62 |                         Case::PersonalPossesive => "its",
63 |                     },
64 |                 },
65 |             },
66 |             Number::Plural => match person {
67 |                 Person::First => match case {
68 |                     Case::Nominative => "we",
69 |                     Case::Accusative => "us",
70 |                     Case::Reflexive => "ourselves",
71 |                     Case::Possessive => "ours",
72 |                     Case::PersonalPossesive => "our",
73 |                 },
74 |                 Person::Second => match case {
75 |                     Case::Nominative => "you",
76 |                     Case::Accusative => "you",
77 |                     Case::Reflexive => "yourselves",
78 |                     Case::Possessive => "yours",
79 |                     Case::PersonalPossesive => "your",
80 |                 },
81 |                 Person::Third => match case {
82 |                     Case::Nominative => "they",
83 |                     Case::Accusative => "them",
84 |                     Case::Reflexive => "themselves",
85 |                     Case::Possessive => "theirs",
86 |                     Case::PersonalPossesive => "their",
87 |                 },
88 |             },
89 |         }
90 |     }
91 |     //dog's -> dogs', child's -> children's, Mary's -> Marys'
92 |     //  pub fn genitive_adjective(word: &str, number: &Number) -> String {}
93 | }
87 | 


--------------------------------------------------------------------------------
/src/noun.rs:
--------------------------------------------------------------------------------
  1 | use crate::*;
  2 | 
  3 | ///The Noun struct is used for handling more complicated noun phrases
  4 | /// It is interchangeable with strings for all noun functions such as count_with_number()
  5 | ///
  6 | /// # Examples
  7 | /// ```
  8 | ///  let jeans = Noun::from("pair").with_complement("of jeans");
  9 | ///  assert_eq!(English::count_with_number(jeans, 3), "3 pairs of jeans");
 10 | /// ```
 11 | #[derive(Debug, Clone, PartialEq, Eq)]
 12 | pub struct Noun {
 13 |     pub head: String,
 14 |     pub modifier: Option<String>,   // words before the head
 15 |     pub complement: Option<String>, // words after the head
 16 | }
 17 | 
 18 | impl Noun {
 19 |     /// Creates a new Noun with the given head
 20 |     pub fn new(head: impl Into<String>) -> Self {
 21 |         Noun {
 22 |             head: head.into(),
 23 |             modifier: None,
 24 |             complement: None,
 25 |         }
 26 |     }
 27 | 
 28 |     /// Goes before the head of the noun
 29 |     /// # Examples
 30 |     /// ```
 31 |     ///  let child = Noun::from("child").with_specifier("running");
 32 |     ///  assert_eq!(English::count_with_number(child, 3), "3 running children");
 33 |     /// ```
 34 |     pub fn with_specifier(mut self, pre: impl Into<String>) -> Self {
 35 |         self.modifier = Some(pre.into());
 36 |         self
 37 |     }
 38 | 
 39 |     /// Goes after the head of the noun
 40 |     /// # Examples
 41 |     /// ```
 42 |     ///  let jeans = Noun::from("pair").with_complement("of jeans");
 43 |     ///  assert_eq!(English::count_with_number(jeans, 3), "3 pairs of jeans");
 44 |     /// ```
 45 |     pub fn with_complement(mut self, post: impl Into<String>) -> Self {
 46 |         self.complement = Some(post.into());
 47 |         self
 48 |     }
 49 | }
 50 | 
 51 | impl Noun {
 52 |     /// Returns a noun inflected according to the count. Wrapper around English::noun()
 53 |     ///
 54 |     /// # Examples
 55 |     /// ```rust
 56 |     /// assert_eq!(English::count("cat", 1), "cat");
 57 |     /// assert_eq!(English::count("cat", 2), "cats");
 58 |     /// ```
 59 |     pub fn count<T: Into<Noun>>(word: T, count: u32) -> String {
 60 |         if count == 1 {
 61 |             English::noun(word, &Number::Singular)
 62 |         } else {
 63 |             English::noun(word, &Number::Plural)
 64 |         }
 65 |     }
 66 | 
 67 |     /// Returns a noun inflected according to the count, preserves the number in output
 68 |     ///
 69 |     /// # Examples
 70 |     /// ```rust
 71 |     /// assert_eq!(English::count_with_number("cat", 1), "1 cat");
 72 |     /// assert_eq!(English::count_with_number("cat", 2), "2 cats");
 73 |     /// ```
 74 |     pub fn count_with_number<T: Into<Noun>>(word: T, count: u32) -> String {
 75 |         format!("{} {}", count, Noun::count(word, count))
 76 |     }
 77 | 
 78 |     /// Returns the plural form of a noun.
 79 |     ///
 80 |     /// # Examples
 81 |     /// ```
 82 |     /// assert_eq!(English::plural("child"), "children");
 83 |     /// assert_eq!(English::plural("cat"), "cats");
 84 |     /// ```
 85 |     pub fn plural<T: Into<Noun>>(word: T) -> String {
 86 |         English::noun(word, &Number::Plural)
 87 |     }
 88 | 
 89 |     /// Returns the singular form of a noun.
 90 |     ///
 91 |     /// # Examples
 92 |     /// ```
 93 |     /// assert_eq!(English::singular("cat2"), "cat");
 94 |     /// ```
 95 |     pub fn singular<T: Into<Noun>>(word: T) -> String {
 96 |         English::noun(word, &Number::Singular)
 97 |     }
 98 | }
 99 | 
100 | impl From<String> for Noun {
101 |     fn from(s: String) -> Self {
102 |         Noun {
103 |             head: s,
104 |             modifier: None,
105 |             complement: None,
106 |         }
107 |     }
108 | }
109 | impl From<&String> for Noun {
110 |     fn from(s: &String) -> Self {
111 |         Noun {
112 |             head: s.clone(),
113 |             modifier: None,
114 |             complement: None,
115 |         }
116 |     }
117 | }
118 | 
119 | impl From<&str> for Noun {
120 |     fn from(s: &str) -> Self {
121 |         Noun {
122 |             head: s.to_string(),
123 |             modifier: None,
124 |             complement: None,
125 |         }
126 |     }
127 | }
128 | impl From<&Noun> for Noun {
129 |     fn from(s: &Noun) -> Self {
130 |         s.clone()
131 |     }
132 | }
133 | 


--------------------------------------------------------------------------------
/extractor/adj_check.csv:
--------------------------------------------------------------------------------
  1 | wiktionary_form,degree
  2 | dubious,Superlative
  3 | baby-er,Comparative
  4 | baby-est,Superlative
  5 | rightest,Superlative
  6 | honier,Comparative
  7 | honiest,Superlative
  8 | gooder,Comparative
  9 | goodest,Superlative
 10 | hazeler,Comparative
 11 | hazelest,Superlative
 12 | flyer,Comparative
 13 | flyest,Superlative
 14 | further inset,Comparative
 15 | furthest inset,Superlative
 16 | better known,Comparative
 17 | best known,Superlative
 18 | complexer,Comparative
 19 | complexest,Superlative
 20 | further,Comparative
 21 | furthermost,Superlative
 22 | elder,Comparative
 23 | oldermost,Superlative
 24 | betterer,Comparative
 25 | betterest,Superlative
 26 | further upstairs,Comparative
 27 | furthest upstairs,Superlative
 28 | weller,Comparative
 29 | wellest,Superlative
 30 | liever,Comparative
 31 | lievest,Superlative
 32 | intelligenter,Comparative
 33 | intelligentest,Superlative
 34 | gingerer,Comparative
 35 | gingerest,Superlative
 36 | gingerer,Comparative
 37 | gingerest,Superlative
 38 | further front,Comparative
 39 | furthest front,Superlative
 40 | littler,Comparative
 41 | littlest,Superlative
 42 | bester,Comparative
 43 | bestest,Superlative
 44 | further advanced,Comparative
 45 | furthest advanced,Superlative
 46 | further aged,Comparative
 47 | furthest aged,Superlative
 48 | readabler,Comparative
 49 | readablest,Superlative
 50 | funner,Comparative
 51 | funnest,Superlative
 52 | dangerouser,Comparative
 53 | dangerousest,Superlative
 54 | honester,Comparative
 55 | honestest,Superlative
 56 | royaller,Comparative
 57 | royallest,Superlative
 58 | near,Comparative
 59 | next,Superlative
 60 | leveller,Comparative
 61 | levellest,Superlative
 62 | behinder,Comparative
 63 | behindest,Superlative
 64 | adverser,Comparative
 65 | adversest,Superlative
 66 | darlinger,Comparative
 67 | wryer,Comparative
 68 | wryest,Superlative
 69 | luxiest,Superlative
 70 | slyer,Comparative
 71 | slyest,Superlative
 72 | spitefuller,Comparative
 73 | spitefullest,Superlative
 74 | wilfuller,Comparative
 75 | wilfullest,Superlative
 76 | favouritest,Superlative
 77 | cheerfuller,Comparative
 78 | cheerfullest,Superlative
 79 | impudenter,Comparative
 80 | impudentest,Superlative
 81 | outermore,Comparative
 82 | nervouser,Comparative
 83 | nervousest,Superlative
 84 | desperater,Comparative
 85 | desperatest,Superlative
 86 | likelier,Comparative
 87 | likeliest,Superlative
 88 | unbelievabler,Comparative
 89 | unbelievablest,Superlative
 90 | uncertainer,Comparative
 91 | peacefuler,Comparative
 92 | peacefulest,Superlative
 93 | unwieldier,Comparative
 94 | unwieldiest,Superlative
 95 | gingerlier,Comparative
 96 | gingerliest,Superlative
 97 | further upriver,Comparative
 98 | furthest upriver,Superlative
 99 | further uphill,Comparative
100 | furthest uphill,Superlative
101 | further upstream,Comparative
102 | furthest upstream,Superlative
103 | shininger,Comparative
104 | shiningest,Superlative
105 | delightfuller,Comparative
106 | delightfullest,Superlative
107 | worrisomer,Comparative
108 | worrisomest,Superlative
109 | motleyer,Comparative
110 | motleyest,Superlative
111 | fraughter,Comparative
112 | fraughtest,Superlative
113 | unsightlier,Comparative
114 | unsightliest,Superlative
115 | conceiteder,Comparative
116 | conceitedest,Superlative
117 | maggotier,Comparative
118 | maggotiest,Superlative
119 | further downhill,Comparative
120 | furthest downhill,Superlative
121 | further downstream,Comparative
122 | furthest downstream,Superlative
123 | dolefuler,Comparative
124 | dolefulest,Superlative
125 | moe-er,Comparative
126 | moe-est,Superlative
127 | better matching,Comparative
128 | best matching,Superlative
129 | believabler,Comparative
130 | believablest,Superlative
131 | smoller,Comparative
132 | smollest,Superlative
133 | willfuller,Comparative
134 | willfullest,Superlative
135 | darnder,Comparative
136 | darndest,Superlative
137 | easiergoing,Comparative
138 | easiestgoing,Superlative
139 | unfree-er,Comparative
140 | further recessed,Comparative
141 | furthest recessed,Superlative
142 | mair reet,Comparative
143 | maist reet,Superlative
144 | wieldier,Comparative
145 | wieldiest,Superlative
146 | farther downrange,Comparative
147 | farthest downrange,Superlative
148 | ultradryer,Comparative
149 | ultradryest,Superlative
150 | further ultraleft,Comparative
151 | furthest ultraleft,Superlative
152 | further ultraright,Comparative
153 | furthest ultraright,Superlative
154 | further downcoast,Comparative
155 | furthest downcoast,Superlative
156 | doggoneder,Comparative
157 | undumbest,Superlative
158 | 


--------------------------------------------------------------------------------
/extractor/src/file_generation.rs:
--------------------------------------------------------------------------------
  1 | use csv::Writer;
  2 | use english_core::*;
  3 | use serde::Deserialize;
  4 | use std::collections::{HashMap, HashSet};
  5 | use std::error::Error;
  6 | use std::fs::File;
  7 | use std::io::{BufRead, BufReader, Write};
  8 | 
  9 | pub fn generate_nouns_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
 10 |     let input = File::open(inputik)?;
 11 |     let reader = BufReader::new(input);
 12 | 
 13 |     let mut pairs: Vec<(String, String)> = reader
 14 |         .lines()
 15 |         .skip(1) // Skip header
 16 |         .filter_map(|line| {
 17 |             let line = line.ok()?;
 18 |             let mut parts = line.split(',');
 19 |             Some((
 20 |                 parts.next()?.trim().to_string(),
 21 |                 parts.next()?.trim().to_string(),
 22 |             ))
 23 |         })
 24 |         .collect();
 25 | 
 26 |     // Sort by word for determinism (not required by phf, but helps reproducibility)
 27 |     pairs.sort_by_key(|(word, _)| word.clone());
 28 | 
 29 |     let mut output = File::create(outputik)?;
 30 | 
 31 |     // Start file with imports
 32 |     writeln!(output, "use phf::phf_map;\n")?;
 33 | 
 34 |     writeln!(
 35 |         output,
 36 |         "pub static PLURAL_MAP: phf::Map<&'static str, &'static str> = phf_map! {{"
 37 |     )?;
 38 | 
 39 |     for (word, plural) in &pairs {
 40 |         writeln!(output, "    \"{}\" => \"{}\",", word, plural)?;
 41 |     }
 42 | 
 43 |     writeln!(output, "}};\n")?;
 44 | 
 45 |     writeln!(
 46 |         output,
 47 |         "pub fn get_plural(word: &str) -> Option<&'static str> {{ PLURAL_MAP.get(word).copied() }}"
 48 |     )?;
 49 | 
 50 |     Ok(())
 51 | }
 52 | 
 53 | pub fn generate_verbs_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
 54 |     let input = File::open(inputik)?;
 55 |     let reader = BufReader::new(input);
 56 | 
 57 |     let mut entries: Vec<(String, (String, String, String, String))> = reader
 58 |         .lines()
 59 |         .skip(1) // Skip header
 60 |         .filter_map(|line| {
 61 |             let line = line.ok()?;
 62 |             let mut parts = line.split(',');
 63 |             Some((
 64 |                 parts.next()?.trim().to_string(), // infinitive
 65 |                 (
 66 |                     parts.next()?.trim().to_string(), // 3rd person singular
 67 |                     parts.next()?.trim().to_string(), // past
 68 |                     parts.next()?.trim().to_string(), // present participle
 69 |                     parts.next()?.trim().to_string(), // past participle
 70 |                 ),
 71 |             ))
 72 |         })
 73 |         .collect();
 74 | 
 75 |     // Sort for determinism
 76 |     entries.sort_by_key(|(inf, _)| inf.clone());
 77 | 
 78 |     let mut output = File::create(outputik)?;
 79 | 
 80 |     writeln!(output, "use phf::phf_map;")?;
 81 |     writeln!(output)?;
 82 |     writeln!(
 83 |         output,
 84 |         "/// (3rd person singular, past, present participle, past participle)"
 85 |     )?;
 86 |     writeln!(
 87 |         output,
 88 |         "pub static VERB_MAP: phf::Map<&'static str, (&'static str, &'static str, &'static str, &'static str)> = phf_map! {{"
 89 |     )?;
 90 | 
 91 |     for (inf, (third, past, pres_part, past_part)) in &entries {
 92 |         writeln!(
 93 |             output,
 94 |             "    \"{}\" => (\"{}\", \"{}\", \"{}\", \"{}\"),",
 95 |             inf, third, past, pres_part, past_part
 96 |         )?;
 97 |     }
 98 | 
 99 |     writeln!(output, "}};")?;
100 |     writeln!(output)?;
101 |     writeln!(
102 |         output,
103 |         "pub fn get_verb_forms(infinitive: &str) -> Option<(&'static str, &'static str, &'static str, &'static str)> {{"
104 |     )?;
105 |     writeln!(output, "    VERB_MAP.get(infinitive).copied()")?;
106 |     writeln!(output, "}}")?;
107 | 
108 |     Ok(())
109 | }
110 | 
/// Generates a Rust source file containing a `phf` map from adjectives to their
/// comparative and superlative forms.
///
/// `inputik` is the path of a comma-separated file: the first line is a header and is
/// skipped; each following line supplies `positive,comparative,superlative` in its
/// first three columns (surrounding whitespace is trimmed). `outputik` is the path of
/// the Rust file to create; it declares `ADJECTIVE_MAP` and `get_adjective_forms`.
///
/// Lines that cannot be read or that have fewer than three columns are skipped.
///
/// # Errors
/// Returns any I/O error from opening the input, creating the output, or writing it.
pub fn generate_adjectives_phf(inputik: &str, outputik: &str) -> std::io::Result<()> {
    let reader = BufReader::new(File::open(inputik)?);

    let mut entries: Vec<(String, (String, String))> = reader
        .lines()
        .skip(1) // Skip header
        .filter_map(|line| {
            let line = line.ok()?;
            let mut parts = line.split(',');
            Some((
                parts.next()?.trim().to_string(), // positive
                (
                    parts.next()?.trim().to_string(), // comparative
                    parts.next()?.trim().to_string(), // superlative
                ),
            ))
        })
        .collect();

    // Sort for determinism; comparing in place avoids cloning every key.
    entries.sort_by(|a, b| a.0.cmp(&b.0));

    // Buffer the output so each entry is not a separate write on the bare file.
    let mut output = std::io::BufWriter::new(File::create(outputik)?);

    writeln!(output, "use phf::phf_map;")?;
    writeln!(output)?;
    writeln!(output, "/// (comparative, superlative)")?;
    writeln!(
        output,
        "pub static ADJECTIVE_MAP: phf::Map<&'static str, (&'static str, &'static str)> = phf_map! {{"
    )?;

    for (positive, (comparative, superlative)) in &entries {
        // `{:?}` writes quoted string literals with `"` and `\` escaped, so unusual
        // entries cannot break the generated source.
        writeln!(
            output,
            "    {:?} => ({:?}, {:?}),",
            positive, comparative, superlative
        )?;
    }

    writeln!(output, "}};")?;
    writeln!(output)?;
    writeln!(
        output,
        "pub fn get_adjective_forms(positive: &str) -> Option<(&'static str, &'static str)> {{"
    )?;
    writeln!(output, "    ADJECTIVE_MAP.get(positive).copied()")?;
    writeln!(output, "}}")?;

    // Flush explicitly: an error during the implicit flush on drop would be lost.
    output.flush()
}
163 | 


--------------------------------------------------------------------------------
/extractor/src/scratch.rs:
--------------------------------------------------------------------------------
  1 | /*{  let word_key = match entry.etymology_number {
  2 |     Some(1) => infinitive.clone(),
  3 |     Some(x) => format!("{infinitive}{x}"),
  4 |     None => infinitive.clone(),
  5 | };
  6 | }
  7 |  */
  8 | 
  9 | /*if plural_found {
 10 |     let mut form_count = 1;
 11 | 
 12 |     match forms_set.remove(&predicted_plural) {
 13 |         true => {}
 14 |         false => {}
 15 |     }
 16 | 
 17 |     for formik in forms_set {
 18 |         let gotten = [infinitive.clone(), formik.clone()];
 19 |     }
 20 | 
 21 |     let gotten = [infinitive.clone(), forms_map.get("plural").unwrap().clone()];
 22 |     let keyd_struct = [word_key.clone(), forms_map.get("plural").unwrap().clone()];
 23 | 
 24 |     if predicted_struct == gotten {
 25 |         duplicate_pairs_set.insert(predicted_struct.clone());
 26 |     }
 27 | 
 28 |     if !duplicate_key_set.contains(&word_key) && !duplicate_pairs_set.contains(&gotten) {
 29 |         duplicate_key_set.insert(word_key.clone());
 30 |         duplicate_pairs_set.insert(gotten.clone());
 31 |         writer.write_record(&keyd_struct)?;
 32 |     }
 33 | } */
 34 | 
 35 | /*
 36 | 
 37 | pub fn generate_nouns_file(inputik: &str, outputik: &str) -> std::io::Result<()> {
 38 |     let input = File::open(inputik)?;
 39 |     let reader = BufReader::new(input);
 40 | 
 41 |     let mut pairs: Vec<(String, String)> = reader
 42 |         .lines()
 43 |         .skip(1) // Skip header
 44 |         .filter_map(|line| {
 45 |             let line = line.ok()?;
 46 |             let mut parts = line.split(',');
 47 |             Some((
 48 |                 parts.next()?.trim().to_string(),
 49 |                 parts.next()?.trim().to_string(),
 50 |             ))
 51 |         })
 52 |         .collect();
 53 | 
 54 |     // Sort by the word (key)
 55 |     pairs.sort_by_key(|(word, _)| word.clone());
 56 | 
 57 |     // Write to a Rust file
 58 |     let mut output = File::create(outputik)?;
 59 | 
 60 |     writeln!(output, "static PLURAL_MAP: &[(&str, &str)] = &[")?;
 61 |     for (word, plural) in &pairs {
 62 |         writeln!(output, "    (\"{}\", \"{}\"),", word, plural)?;
 63 |     }
 64 |     writeln!(output, "];\n")?;
 65 | 
 66 |     writeln!(
 67 |         output,
 68 |         "pub fn get_plural(word: &str) -> Option<&'static str> {{"
 69 |     )?;
 70 |     writeln!(
 71 |         output,
 72 |         "    PLURAL_MAP.binary_search_by_key(&word, |&(k, _)| k).ok().map(|i| PLURAL_MAP[i].1)"
 73 |     )?;
 74 |     writeln!(output, "}}")?;
 75 |     Ok(())
 76 | }
 77 | 
 78 | pub fn generate_verbs_file(inputik: &str, outputik: &str) -> std::io::Result<()> {
 79 |     let input = File::open(inputik)?;
 80 |     let reader = BufReader::new(input);
 81 | 
 82 |     let mut entries: Vec<(String, (String, String, String, String))> = reader
 83 |         .lines()
 84 |         .skip(1) // Skip header
 85 |         .filter_map(|line| {
 86 |             let line = line.ok()?;
 87 |             let mut parts = line.split(',');
 88 |             Some((
 89 |                 parts.next()?.trim().to_string(), // infinitive
 90 |                 (
 91 |                     parts.next()?.trim().to_string(), // 3rd person singular
 92 |                     parts.next()?.trim().to_string(), // past
 93 |                     parts.next()?.trim().to_string(), // present participle
 94 |                     parts.next()?.trim().to_string(), // past participle
 95 |                 ),
 96 |             ))
 97 |         })
 98 |         .collect();
 99 | 
100 |     // Sort by infinitive
101 |     entries.sort_by_key(|(inf, _)| inf.clone());
102 | 
103 |     let mut output = File::create(outputik)?;
104 | 
105 |     writeln!(
106 |         output,
107 |         "/// (3rd person singular, past, present participle, past participle)"
108 |     )?;
109 |     writeln!(
110 |         output,
111 |         "static VERB_MAP: &[(&str, (&str, &str, &str, &str))] = &["
112 |     )?;
113 |     for (inf, (third, past, pres_part, past_part)) in &entries {
114 |         writeln!(
115 |             output,
116 |             "    (\"{}\", (\"{}\", \"{}\", \"{}\", \"{}\")),",
117 |             inf, third, past, pres_part, past_part
118 |         )?;
119 |     }
120 |     writeln!(output, "];\n")?;
121 | 
122 |     writeln!(
123 |         output,
124 |         "pub fn get_verb_forms(infinitive: &str) -> Option<(&'static str, &'static str, &'static str, &'static str)> {{"
125 |     )?;
126 |     writeln!(
127 |         output,
128 |         "    VERB_MAP.binary_search_by_key(&infinitive, |&(k, _)| k)"
129 |     )?;
130 |     writeln!(output, "        .ok()")?;
131 |     writeln!(output, "        .map(|i| VERB_MAP[i].1)")?;
132 |     writeln!(output, "}}")?;
133 | 
134 |     Ok(())
135 | }
136 | 
137 | pub fn generate_adjectives_file(inputik: &str, outputik: &str) -> std::io::Result<()> {
138 |     let input = File::open(inputik)?;
139 |     let reader = BufReader::new(input);
140 | 
141 |     let mut entries: Vec<(String, (String, String))> = reader
142 |         .lines()
143 |         .skip(1) // Skip header row
144 |         .filter_map(|line| {
145 |             let line = line.ok()?;
146 |             let mut parts = line.split(',');
147 |             Some((
148 |                 parts.next()?.trim().to_string(), // positive
149 |                 (
150 |                     parts.next()?.trim().to_string(), // comparative
151 |                     parts.next()?.trim().to_string(), // superlative
152 |                 ),
153 |             ))
154 |         })
155 |         .collect();
156 | 
157 |     // Sort by positive form
158 |     entries.sort_by_key(|(pos, _)| pos.clone());
159 | 
160 |     let mut output = File::create(outputik)?;
161 | 
162 |     writeln!(output, "/// (comparative, superlative)")?;
163 |     writeln!(output, "static ADJECTIVE_MAP: &[(&str, (&str, &str))] = &[")?;
164 |     for (positive, (comparative, superlative)) in &entries {
165 |         writeln!(
166 |             output,
167 |             "    (\"{}\", (\"{}\", \"{}\")),",
168 |             positive, comparative, superlative
169 |         )?;
170 |     }
171 |     writeln!(output, "];\n")?;
172 | 
173 |     writeln!(
174 |         output,
175 |         "pub fn get_adjective_forms(positive: &str) -> Option<(&'static str, &'static str)> {{"
176 |     )?;
177 |     writeln!(
178 |         output,
179 |         "    ADJECTIVE_MAP.binary_search_by_key(&positive, |&(k, _)| k)"
180 |     )?;
181 |     writeln!(output, "        .ok()")?;
182 |     writeln!(output, "        .map(|i| ADJECTIVE_MAP[i].1)")?;
183 |     writeln!(output, "}}")?;
184 | 
185 |     Ok(())
186 | }
187 |  */
188 | 


--------------------------------------------------------------------------------
/examples/test2.rs:
--------------------------------------------------------------------------------
  1 | use english::*;
  2 | fn main() {
  3 |     assert_eq!(Verb::third_person("run"), "runs");
  4 |     assert_eq!(Verb::past("walk"), "walked");
  5 |     assert_eq!(Verb::present_participle("swim"), "swimming");
  6 |     assert_eq!(Verb::past_participle("eat"), "eaten");
  7 |     assert_eq!(Verb::infinitive("go"), "go");
  8 |     assert_eq!(Noun::plural("child"), "children");
  9 |     assert_eq!(Noun::plural("cat"), "cats");
 10 |     assert_eq!(Noun::singular("cat2"), "cat");
 11 |     assert_eq!(Adj::comparative("fast2"), "faster");
 12 |     assert_eq!(Adj::comparative("fun"), "more fun");
 13 |     assert_eq!(Adj::superlative("fast2"), "fastest");
 14 |     assert_eq!(Adj::positive("fast2"), "fast");
 15 |     assert_eq!(Adj::superlative("fun"), "most fun");
 16 |     assert_eq!(English::capitalize_first(""), "");
 17 |     assert_eq!(English::capitalize_first("house"), "House");
 18 |     let pick_up = Verb::from("pick").with_particle("up");
 19 |     assert_eq!(Verb::past_participle(pick_up), "picked up");
 20 | 
 21 |     // Simple forms
 22 |     assert_eq!(Verb::not("eat"), "not eat");
 23 |     assert_eq!(Verb::will("eat"), "will eat");
 24 |     assert_eq!(Verb::did("eat"), "did eat");
 25 |     assert_eq!(Verb::would("eat"), "would eat");
 26 |     assert_eq!(Verb::could("eat"), "could eat");
 27 |     assert_eq!(Verb::can("eat"), "can eat");
 28 |     assert_eq!(Verb::should("eat"), "should eat");
 29 | 
 30 |     // Perfect aspects
 31 |     assert_eq!(
 32 |         Verb::present_perfect("eat", &Person::Third, &Number::Singular),
 33 |         "has eaten"
 34 |     );
 35 |     assert_eq!(
 36 |         Verb::present_perfect("eat", &Person::First, &Number::Plural),
 37 |         "have eaten"
 38 |     );
 39 |     assert_eq!(Verb::past_perfect("eat"), "had eaten");
 40 |     assert_eq!(Verb::future_perfect("eat"), "will have eaten");
 41 | 
 42 |     // Progressive aspects
 43 |     assert_eq!(
 44 |         Verb::present_progressive("eat", &Person::Third, &Number::Singular),
 45 |         "is eating"
 46 |     );
 47 |     assert_eq!(
 48 |         Verb::present_progressive("eat", &Person::First, &Number::Plural),
 49 |         "are eating"
 50 |     );
 51 |     assert_eq!(
 52 |         Verb::past_progressive("eat", &Person::Third, &Number::Singular),
 53 |         "was eating"
 54 |     );
 55 |     assert_eq!(
 56 |         Verb::past_progressive("eat", &Person::First, &Number::Plural),
 57 |         "were eating"
 58 |     );
 59 |     assert_eq!(Verb::future_progressive("eat"), "will be eating");
 60 | 
 61 |     // Negation / modal / emphatic
 62 |     assert_eq!(Verb::not("eat"), "not eat");
 63 |     assert_eq!(Verb::not("see"), "not see");
 64 |     assert_eq!(Verb::will("run"), "will run");
 65 |     assert_eq!(Verb::did("go"), "did go");
 66 |     assert_eq!(Verb::would("eat"), "would eat");
 67 |     assert_eq!(Verb::could("see"), "could see");
 68 |     assert_eq!(Verb::can("run"), "can run");
 69 |     assert_eq!(Verb::should("go"), "should go");
 70 | 
 71 |     // Perfect aspects
 72 |     assert_eq!(
 73 |         Verb::present_perfect("eat", &Person::Third, &Number::Singular),
 74 |         "has eaten"
 75 |     );
 76 |     assert_eq!(
 77 |         Verb::present_perfect("eat", &Person::First, &Number::Plural),
 78 |         "have eaten"
 79 |     );
 80 |     assert_eq!(
 81 |         Verb::present_perfect("see", &Person::Third, &Number::Singular),
 82 |         "has seen"
 83 |     );
 84 |     assert_eq!(
 85 |         Verb::present_perfect("see", &Person::First, &Number::Plural),
 86 |         "have seen"
 87 |     );
 88 |     assert_eq!(Verb::past_perfect("run"), "had run");
 89 |     assert_eq!(Verb::past_perfect("go"), "had gone");
 90 |     assert_eq!(Verb::future_perfect("eat"), "will have eaten");
 91 |     assert_eq!(Verb::future_perfect("see"), "will have seen");
 92 | 
 93 |     // Progressive aspects
 94 |     assert_eq!(
 95 |         Verb::present_progressive("eat", &Person::Third, &Number::Singular),
 96 |         "is eating"
 97 |     );
 98 |     assert_eq!(
 99 |         Verb::present_progressive("eat", &Person::First, &Number::Plural),
100 |         "are eating"
101 |     );
102 |     assert_eq!(
103 |         Verb::present_progressive("run", &Person::Third, &Number::Singular),
104 |         "is running"
105 |     );
106 |     assert_eq!(
107 |         Verb::present_progressive("run", &Person::First, &Number::Plural),
108 |         "are running"
109 |     );
110 |     assert_eq!(
111 |         Verb::past_progressive("eat", &Person::Third, &Number::Singular),
112 |         "was eating"
113 |     );
114 |     assert_eq!(
115 |         Verb::past_progressive("eat", &Person::First, &Number::Plural),
116 |         "were eating"
117 |     );
118 |     assert_eq!(
119 |         Verb::past_progressive("run", &Person::Third, &Number::Singular),
120 |         "was running"
121 |     );
122 |     assert_eq!(
123 |         Verb::past_progressive("run", &Person::First, &Number::Plural),
124 |         "were running"
125 |     );
126 |     assert_eq!(Verb::future_progressive("go"), "will be going");
127 | 
128 |     // Edge cases: be + have
129 |     assert_eq!(
130 |         Verb::present_perfect("be", &Person::Third, &Number::Singular),
131 |         "has been"
132 |     );
133 |     assert_eq!(
134 |         Verb::present_perfect("be", &Person::First, &Number::Plural),
135 |         "have been"
136 |     );
137 |     assert_eq!(Verb::past_perfect("be"), "had been");
138 |     assert_eq!(Verb::future_perfect("be"), "will have been");
139 | 
140 |     assert_eq!(
141 |         Verb::present_progressive("have", &Person::Third, &Number::Singular),
142 |         "is having"
143 |     );
144 |     assert_eq!(
145 |         Verb::present_progressive("have", &Person::First, &Number::Plural),
146 |         "are having"
147 |     );
148 |     assert_eq!(
149 |         Verb::past_progressive("have", &Person::Third, &Number::Singular),
150 |         "was having"
151 |     );
152 |     assert_eq!(
153 |         Verb::past_progressive("have", &Person::First, &Number::Plural),
154 |         "were having"
155 |     );
156 |     assert_eq!(Verb::future_progressive("have"), "will be having");
157 | 
158 |     let give_up = Verb::from("give").with_particle("up");
159 |     assert_eq!(
160 |         Verb::present_perfect(&give_up, &Person::First, &Number::Singular),
161 |         "have given up"
162 |     );
163 |     assert_eq!(
164 |         Verb::present_perfect(give_up, &Person::Third, &Number::Singular),
165 |         "has given up"
166 |     );
167 |     // Complex phrasal verb with aspect
168 |     let look_up = Verb::from("look").with_particle("up");
169 |     assert_eq!(
170 |         Verb::past_progressive(&look_up, &Person::Third, &Number::Singular),
171 |         "was looking up"
172 |     );
173 | 
174 |     assert_eq!(
175 |         Verb::past_progressive(look_up, &Person::Third, &Number::Plural),
176 |         "were looking up"
177 |     );
178 | }
179 | 


--------------------------------------------------------------------------------
/src/verb.rs:
--------------------------------------------------------------------------------
  1 | use crate::*;
  2 | 
  3 | ///The Verb struct is used for handling more complicated verb phrases
  4 | /// It is interchangeable with strings for all verb functions such as present_participle()
  5 | ///
  6 | /// # Examples
  7 | /// ```
  8 | ///  let pick_up = Verb::from("pick").with_particle("up");
  9 | ///  assert_eq!(English::past_participle(pick_up), "picked up");
 10 | /// ```
 11 | #[derive(Debug, Clone, PartialEq, Eq)]
 12 | pub struct Verb {
 13 |     pub head: String,             // "pick"
 14 |     pub particle: Option<String>, // "up"
 15 | }
 16 | 
 17 | impl Verb {
 18 |     /// Create a new verb with just the head.
 19 |     pub fn new(head: impl Into<String>) -> Self {
 20 |         Verb {
 21 |             head: head.into(),
 22 |             particle: None,
 23 |         }
 24 |     }
 25 | 
 26 |     /// Set the particle of a phrasal verb.
 27 |     /// # Examples
 28 |     /// ```
 29 |     ///  let pick_up = Verb::from("pick").with_particle("up");
 30 |     ///  assert_eq!(English::past_participle(pick_up), "picked up");
 31 |     /// ```
 32 |     pub fn with_particle(mut self, particle: impl Into<String>) -> Self {
 33 |         self.particle = Some(particle.into());
 34 |         self
 35 |     }
 36 | }
 37 | 
 38 | impl Verb {
 39 |     /// Conjugates the verb for a third-person singular subject in the present tense.
 40 |     ///
 41 |     /// Accepts anything that converts into a [`Verb`], such as a `&str` or a `&Verb`.
 42 |     ///
 43 |     /// # Examples
 44 |     /// ```
 45 |     /// use english::*;
 46 |     /// assert_eq!(Verb::third_person("run"), "runs");
 47 |     /// ```
 48 |     pub fn third_person<T: Into<Verb>>(wordish: T) -> String {
 49 |         // This subject is what selects the third-person singular form.
 50 |         let (person, number) = (Person::Third, Number::Singular);
 51 |         let (tense, form) = (Tense::Present, Form::Finite);
 52 |         English::verb(wordish, &person, &number, &tense, &form)
 53 |     }
 54 | 
 55 |     /// Conjugates the verb in the simple past tense.
 56 |     ///
 57 |     /// The subject is fixed to third-person singular, so `"be"` comes out as `"was"`.
 58 |     ///
 59 |     /// # Examples
 60 |     /// ```
 61 |     /// use english::*;
 62 |     /// assert_eq!(Verb::past("walk"), "walked");
 63 |     /// ```
 64 |     pub fn past<T: Into<Verb>>(wordish: T) -> String {
 65 |         // Call `English::verb` directly when the past form must agree with another subject.
 66 |         let (person, number) = (Person::Third, Number::Singular);
 67 |         let (tense, form) = (Tense::Past, Form::Finite);
 68 |         English::verb(wordish, &person, &number, &tense, &form)
 69 |     }
 70 | 
 71 |     /// Builds the present participle ("-ing" form) of the verb.
 72 |     ///
 73 |     /// Takes no subject: participles are not inflected for person or number.
 74 |     ///
 75 |     /// # Examples
 76 |     /// ```
 77 |     /// use english::*;
 78 |     /// assert_eq!(Verb::present_participle("swim"), "swimming");
 79 |     /// ```
 80 |     pub fn present_participle<T: Into<Verb>>(wordish: T) -> String {
 81 |         // Placeholder subject; `English::verb` requires one even for participles.
 82 |         let (person, number) = (Person::First, Number::Singular);
 83 |         let (tense, form) = (Tense::Present, Form::Participle);
 84 |         English::verb(wordish, &person, &number, &tense, &form)
 85 |     }
 86 | 
 87 |     /// Builds the past participle of the verb.
 88 |     ///
 89 |     /// Takes no subject: participles are not inflected for person or number.
 90 |     ///
 91 |     /// # Examples
 92 |     /// ```
 93 |     /// use english::*;
 94 |     /// assert_eq!(Verb::past_participle("eat"), "eaten");
 95 |     /// ```
 96 |     pub fn past_participle<T: Into<Verb>>(wordish: T) -> String {
 97 |         // Placeholder subject; `English::verb` requires one even for participles.
 98 |         let (person, number) = (Person::First, Number::Singular);
 99 |         let (tense, form) = (Tense::Past, Form::Participle);
100 |         English::verb(wordish, &person, &number, &tense, &form)
101 |     }
102 | 
103 |     /// Returns the infinitive (base) form of the verb.
104 |     ///
105 |     /// A trailing sense digit is dropped, so `"lie2"` gives `"lie"`.
106 |     ///
107 |     /// # Examples
108 |     /// ```
109 |     /// use english::*;
110 |     /// assert_eq!(Verb::infinitive("lie2"), "lie");
111 |     /// ```
112 |     pub fn infinitive<T: Into<Verb>>(wordish: T) -> String {
113 |         // Subject and tense are placeholders; only the infinitive form is requested.
114 |         let (person, number) = (Person::First, Number::Singular);
115 |         let (tense, form) = (Tense::Present, Form::Infinitive);
116 |         English::verb(wordish, &person, &number, &tense, &form)
117 |     }
118 | }
119 | 
120 | impl Verb {
121 |     /// Returns the negated base form ("not eat").
122 |     /// Only the word "not" is prefixed; no auxiliary is inserted.
123 |     pub fn not<T: Into<Verb>>(wordish: T) -> String {
124 |         let base = Self::infinitive(wordish);
125 |         format!("not {base}")
126 |     }
127 | 
128 |     /// Returns the simple future tense ("will eat").
129 |     pub fn will<T: Into<Verb>>(wordish: T) -> String {
130 |         let base = Self::infinitive(wordish);
131 |         format!("will {base}")
132 |     }
133 | 
134 |     /// Returns the simple past with auxiliary ("did eat").
135 |     pub fn did<T: Into<Verb>>(wordish: T) -> String {
136 |         let base = Self::infinitive(wordish);
137 |         format!("did {base}")
138 |     }
139 | 
140 |     /// Returns the conditional form ("would eat").
141 |     pub fn would<T: Into<Verb>>(wordish: T) -> String {
142 |         let base = Self::infinitive(wordish);
143 |         format!("would {base}")
144 |     }
145 | 
146 |     /// Returns the modal possibility form ("could eat").
147 |     pub fn could<T: Into<Verb>>(wordish: T) -> String {
148 |         let base = Self::infinitive(wordish);
149 |         format!("could {base}")
150 |     }
151 | 
152 |     /// Returns the modal ability/permission form ("can eat").
153 |     pub fn can<T: Into<Verb>>(wordish: T) -> String {
154 |         let base = Self::infinitive(wordish);
155 |         format!("can {base}")
156 |     }
157 | 
158 |     /// Returns the modal obligation form ("should eat").
159 |     pub fn should<T: Into<Verb>>(wordish: T) -> String {
160 |         let base = Self::infinitive(wordish);
161 |         format!("should {base}")
162 |     }
163 | 
164 |     /// Returns the present perfect form ("has eaten", "have seen").
165 |     /// The auxiliary "have" is conjugated for `subject_person` and `subject_number`.
166 |     pub fn present_perfect<T: Into<Verb>>(
167 |         wordish: T,
168 |         subject_person: &Person,
169 |         subject_number: &Number,
170 |     ) -> String {
171 |         let (tense, form) = (Tense::Present, Form::Finite);
172 |         let auxiliary = English::verb("have", subject_person, subject_number, &tense, &form);
173 |         format!("{auxiliary} {}", Self::past_participle(wordish))
174 |     }
175 | 
176 |     /// Returns the past perfect form ("had eaten").
177 |     /// The auxiliary is always the literal "had", so no subject is taken.
178 |     pub fn past_perfect<T: Into<Verb>>(wordish: T) -> String {
179 |         let participle = Self::past_participle(wordish);
180 |         format!("had {participle}")
181 |     }
182 | 
183 |     /// Returns the future perfect form ("will have eaten").
184 |     pub fn future_perfect<T: Into<Verb>>(wordish: T) -> String {
185 |         let participle = Self::past_participle(wordish);
186 |         format!("will have {participle}")
187 |     }
188 | 
189 |     /// Returns the present progressive aspect ("is eating").
190 |     /// The auxiliary "be" is conjugated for `subject_person` and `subject_number`.
191 |     pub fn present_progressive<T: Into<Verb>>(
192 |         wordish: T,
193 |         subject_person: &Person,
194 |         subject_number: &Number,
195 |     ) -> String {
196 |         let (tense, form) = (Tense::Present, Form::Finite);
197 |         let auxiliary = English::verb("be", subject_person, subject_number, &tense, &form);
198 |         format!("{auxiliary} {}", Self::present_participle(wordish))
199 |     }
200 | 
201 |     /// Returns the past progressive aspect ("was eating", "were eating").
202 |     /// The auxiliary "be" is conjugated for `subject_person` and `subject_number`.
203 |     pub fn past_progressive<T: Into<Verb>>(
204 |         wordish: T,
205 |         subject_person: &Person,
206 |         subject_number: &Number,
207 |     ) -> String {
208 |         let (tense, form) = (Tense::Past, Form::Finite);
209 |         let auxiliary = English::verb("be", subject_person, subject_number, &tense, &form);
210 |         format!("{auxiliary} {}", Self::present_participle(wordish))
211 |     }
212 | 
213 |     /// Returns the future progressive aspect ("will be eating").
214 |     // Needs to be made to work better with negation
215 |     pub fn future_progressive<T: Into<Verb>>(wordish: T) -> String {
216 |         let participle = Self::present_participle(wordish);
217 |         format!("will be {participle}")
218 |     }
219 | }
220 | 
221 | /// Wraps an owned string as the head of a verb with no particle.
222 | impl From<String> for Verb {
223 |     fn from(s: String) -> Self {
224 |         // The string is moved into `head`; nothing is copied.
225 |         let (head, particle) = (s, None);
226 |         Self { head, particle }
227 |     }
228 | }
229 | 
230 | /// Clones a borrowed `String` into the head of a verb with no particle.
231 | impl From<&String> for Verb {
232 |     fn from(s: &String) -> Self {
233 |         // Make the owned copy here, then reuse the `From<String>` conversion.
234 |         let owned: String = s.clone();
235 |         Self::from(owned)
236 |     }
237 | }
238 | 
239 | /// Copies a string slice into the head of a verb with no particle.
240 | impl From<&str> for Verb {
241 |     fn from(s: &str) -> Self {
242 |         // Make the owned copy here, then reuse the `From<String>` conversion.
243 |         let owned = String::from(s);
244 |         Self::from(owned)
245 |     }
246 | }
247 | 
248 | /// Lets verb functions accept a `&Verb`; the borrowed verb is cloned.
249 | impl From<&Verb> for Verb {
250 |     fn from(s: &Verb) -> Self {
251 |         s.to_owned()
252 |     }
253 | }
254 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | use english_core::EnglishCore;
  2 | pub use english_core::grammar::*;
  3 | 
  4 | mod noun;
  5 | pub use noun::*;
  6 | mod verb;
  7 | pub use verb::*;
  8 | mod adj;
  9 | pub use adj::*;
 10 | mod noun_phf;
 11 | use noun_phf::*;
 12 | mod adj_phf;
 13 | use adj_phf::*;
 14 | mod verb_phf;
 15 | use verb_phf::*;
 16 | 
 17 | /// Removes a single trailing ASCII digit from `word`, if there is one.
 18 | ///
 19 | /// Such a digit marks an alternative sense in lookup keys (e.g. "lie2" next to "lie").
 20 | fn strip_trailing_number(word: &str) -> String {
 21 |     // `strip_suffix` drops at most one matching character, so "a12" becomes "a1".
 22 |     let is_digit = |c: char| c.is_ascii_digit();
 23 |     word.strip_suffix(is_digit).unwrap_or(word).to_string()
 24 | }
 25 | 
 26 | /// Entry point for English inflection and morphology.
 27 | ///
 28 | /// A field-less type used as a namespace: every operation is an associated
 29 | /// function (`English::noun`, `English::verb`, `English::adj`, ...), so no value is built.
 30 | /// Irregular forms are looked up in internal tables; anything not found there
 31 | /// falls back on `EnglishCore` for regular inflection rules.
 32 | pub struct English {}
 33 | impl English {
 34 |     /// Inflects a noun into singular or plural form.
 35 |     ///
 36 |     /// Handles irregular nouns (e.g., `"child" -> "children"`) and
 37 |     /// falls back to regular pluralization rules when no override is found.
 38 |     /// Strips trailing numbers used for sense disambiguation (`"die2" -> "dice"`).
 39 |     /// A modifier, if set, goes before the inflected head and a complement after it.
 40 |     ///
 41 |     /// # Examples
 42 |     /// ```rust
 43 |     /// use english::*;
 44 |     /// assert_eq!(English::noun("cat", &Number::Plural), "cats");
 45 |     /// assert_eq!(English::noun("child", &Number::Plural), "children");
 46 |     /// assert_eq!(English::noun("die2", &Number::Plural), "dice");
 47 |     /// ```
 48 |     pub fn noun<T: Into<Noun>>(word: T, number: &Number) -> String {
 49 |         let noun: Noun = word.into();
 50 |         let base_word = strip_trailing_number(&noun.head);
 51 | 
 52 |         let head_inflected = match number {
 53 |             Number::Singular => base_word,
 54 |             // Look up the unstripped head so numbered senses ("die2") can have their own entry.
 55 |             Number::Plural => match get_plural(&noun.head) {
 56 |                 Some(plural) => plural.to_owned(),
 57 |                 None => EnglishCore::noun(&base_word, number),
 58 |             },
 59 |         };
 60 | 
 61 |         let mut result = String::new();
 62 |         if let Some(modifier) = &noun.modifier {
 63 |             result.push_str(modifier);
 64 |             result.push(' ');
 65 |         }
 66 |         result.push_str(&head_inflected);
 67 |         if let Some(complement) = &noun.complement {
 68 |             result.push(' ');
 69 |             result.push_str(complement);
 70 |         }
 71 |         result
 72 |     }
 73 | 
 74 |     /// Inflects an adjective into positive, comparative, or superlative form.
 75 |     ///
 76 |     /// Handles irregular adjectives (e.g., `"good" -> "better"/"best"`)
 77 |     /// and falls back to regular periphrastic forms
 78 |     /// (e.g., `"fun" -> "more fun"/"most fun"`).
 79 |     /// Strips trailing numbers used for disambiguation (`"bad3"` -> `"worse"`).
 80 |     ///
 81 |     /// # Examples
 82 |     /// ```rust
 83 |     /// use english::*;
 84 |     /// assert_eq!(English::adj("fast", &Degree::Comparative), "faster");
 85 |     /// assert_eq!(English::adj("good", &Degree::Superlative), "best");
 86 |     /// assert_eq!(English::adj("fun", &Degree::Comparative), "more fun");
 87 |     /// ```
 88 |     pub fn adj(word: &str, degree: &Degree) -> String {
 89 |         let base_word = strip_trailing_number(word);
 90 |         match degree {
 91 |             Degree::Positive => base_word,
 92 |             Degree::Comparative => match get_adjective_forms(word) {
 93 |                 Some((comp, _)) => comp.to_owned(),
 94 |                 None => EnglishCore::comparative(&base_word),
 95 |             },
 96 |             Degree::Superlative => match get_adjective_forms(word) {
 97 |                 Some((_, sup)) => sup.to_owned(),
 98 |                 None => EnglishCore::superlative(&base_word),
 99 |             },
100 |         }
101 |     }
102 | 
103 |     /// Conjugates a verb into the requested form.
104 |     ///
105 |     /// Handles irregular verbs (e.g., `"go" -> "went"`, `"eat" -> "ate"`)
106 |     /// and falls back to regular conjugation rules when no override is found.
107 |     /// Strips trailing numbers used for sense disambiguation (`"lie2"` -> `"lied"`).
108 |     /// A particle, if the verb has one, is appended after the conjugated head.
109 |     ///
110 |     /// # Examples
111 |     /// ```rust
112 |     /// use english::*;
113 |     /// // Regular verb
114 |     /// assert_eq!(
115 |     ///     English::verb("walk", &Person::Third, &Number::Singular, &Tense::Present, &Form::Finite),
116 |     ///     "walks"
117 |     /// );
118 |     ///
119 |     /// // Irregular verb
120 |     /// assert_eq!(
121 |     ///     English::verb("eat", &Person::Third, &Number::Singular, &Tense::Past, &Form::Finite),
122 |     ///     "ate"
123 |     /// );
124 |     ///
125 |     /// // Participle
126 |     /// assert_eq!(
127 |     ///     English::verb("go", &Person::Third, &Number::Plural, &Tense::Past, &Form::Participle),
128 |     ///     "gone"
129 |     /// );
130 |     /// ```
131 |     pub fn verb<T: Into<Verb>>(
132 |         wordish: T,
133 |         person: &Person,
134 |         number: &Number,
135 |         tense: &Tense,
136 |         form: &Form,
137 |     ) -> String {
138 |         let verb: Verb = wordish.into();
139 |         let base_word = strip_trailing_number(&verb.head);
140 |         // Conjugate the head verb; table hits use the stored forms, misses use the regular rules.
141 |         let conjugated_head = match get_verb_forms(&verb.head) {
142 |             Some(wordik) => match (person, number, tense, form) {
143 |                 (_, _, _, Form::Infinitive) => base_word,
144 |                 (Person::Third, Number::Singular, Tense::Present, Form::Finite) => {
145 |                     wordik.0.to_string()
146 |                 }
147 |                 (_, _, Tense::Present, Form::Finite) => base_word,
148 |                 (_, _, Tense::Present, Form::Participle) => wordik.2.to_owned(),
149 |                 (_, _, Tense::Past, Form::Participle) => wordik.3.to_owned(),
150 |                 (_, _, Tense::Past, Form::Finite) => wordik.1.to_owned(),
151 |             },
152 |             None => EnglishCore::verb(&base_word, person, number, tense, form),
153 |         };
154 |         // Combine with the particle, sizing the buffer once for head + space + particle.
155 |         if let Some(particle) = verb.particle {
156 |             let mut result = String::with_capacity(conjugated_head.len() + 1 + particle.len());
157 |             result.push_str(&conjugated_head);
158 |             result.push(' ');
159 |             result.push_str(&particle);
160 |             result
161 |         } else {
162 |             conjugated_head
163 |         }
164 |     }
165 | 
166 |     /// Returns the correct English pronoun for the given grammatical features.
167 |     ///
168 |     /// # Examples
169 |     /// ```rust
170 |     /// use english::*;
171 |     /// assert_eq!(
172 |     ///     English::pronoun(&Person::First, &Number::Singular, &Gender::Neutral, &Case::Nominative),
173 |     ///     "I"
174 |     /// );
175 |     /// assert_eq!(
176 |     ///     English::pronoun(&Person::Third, &Number::Singular, &Gender::Feminine, &Case::Nominative),
177 |     ///     "she"
178 |     /// );
179 |     /// assert_eq!(
180 |     ///     English::pronoun(&Person::Third, &Number::Plural, &Gender::Neutral, &Case::Nominative),
181 |     ///     "they"
182 |     /// );
183 |     /// ```
184 |     pub fn pronoun(person: &Person, number: &Number, gender: &Gender, case: &Case) -> &'static str {
185 |         EnglishCore::pronoun(person, number, gender, case)
186 |     }
187 | 
188 |     /// Adds an English possessive suffix (`'s` or `'`) to a word.
189 |     ///
190 |     /// # Examples
191 |     /// ```rust
192 |     /// use english::*;
193 |     /// assert_eq!(English::add_possessive("dog"), "dog's");
194 |     /// assert_eq!(English::add_possessive("dogs"), "dogs'");
195 |     /// ```
196 |     pub fn add_possessive(word: &str) -> String {
197 |         EnglishCore::add_possessive(word)
198 |     }
199 | 
200 |     /// Capitalizes the first character of a string, leaving the rest unchanged.
201 |     /// An empty input yields an empty string.
202 |     ///
203 |     /// # Examples
204 |     /// ```rust
205 |     /// use english::*;
206 |     /// assert_eq!(English::capitalize_first("house"), "House");
207 |     /// ```
208 |     pub fn capitalize_first(s: &str) -> String {
209 |         let mut c = s.chars();
210 |         match c.next() {
211 |             None => String::new(),
212 |             Some(first) => first.to_uppercase().collect::<String>() + c.as_str(),
213 |         }
214 |     }
215 | }
216 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # english
  2 | 
  3 | [![Crates.io](https://img.shields.io/crates/v/english)](https://crates.io/crates/english)
  4 | [![Docs.rs](https://docs.rs/english/badge.svg)](https://docs.rs/english)
  5 | ![License](https://img.shields.io/crates/l/english)
  6 | [![Discord](https://img.shields.io/discord/123456789012345678.svg?logo=discord&logoColor=white&color=5865F2)](https://discord.gg/tDBPkdgApN)
  7 | 
  8 | 
  9 | **english** is a blazing-fast, lightweight English inflection library written in Rust. Total bundled data size is less than 1 MB. It provides extremely accurate verb conjugation and noun/adjective declension based on highly processed Wiktionary data, making it ideal for real-time procedural text generation.
 10 | 
 11 | ## ⚡ Speed and Accuracy
 12 | 
 13 | Evaluation of the English inflector (the `check_*` functions in `extractor/src/main.rs`) and performance benchmarking (`examples/speedmark.rs`) shows:
 14 | 
 15 | | Part of Speech | Correct / Total | Accuracy  | Throughput (calls/sec) | Time per Call |
 16 | |----------------|----------------|-----------|-----------------------|---------------|
 17 | | **Nouns**      | 238106 / 238549 | 99.81%   | 5,228,300             | 191 ns        |
 18 | | **Verbs**      | 158056 / 161643 | 97.78%   | 8,473,248             | 118 ns        |
 19 | | **Adjectives** | 119200 / 119356 | 99.86%   | 11,999,052             | 83 ns        |
 20 | 
 21 | *Note: Benchmarking was done under a worst-case scenario; typical real-world usage is ~50 nanoseconds faster.*
 22 | 
 23 | ## 📦 Installation
 24 | 
 25 | ```
 26 | cargo add english
 27 | ```
 28 | 
 29 | Then in your code:
 30 | 
 31 | ```rust
 32 | use english::*;
 33 | fn main() {
 34 |     // --- Mixed Sentence Example ---
 35 |     let subject_number = Number::Plural;
 36 |     let run = Verb::present_participle("run"); // running
 37 |     let child = Noun::from("child").with_specifier(run); //running child
 38 |     let subject = English::noun(child, &subject_number); //running children
 39 |     let verb = English::verb(
 40 |         "steal",
 41 |         &Person::Third,
 42 |         &subject_number,
 43 |         &Tense::Past,
 44 |         &Form::Finite,
 45 |     ); //stole
 46 |     let object = Noun::count_with_number("potato", 7); //7 potatoes
 47 | 
 48 |     let sentence = format!("The {} {} {}.", subject, verb, object);
 49 |     assert_eq!(sentence, "The running children stole 7 potatoes.");
 50 | 
 51 |     // --- Nouns ---
 52 |     // Note that noun(), count(), etc can work on both strings and Noun struct
 53 |     let jeans = Noun::from("pair").with_complement("of jeans");
 54 |     assert_eq!(Noun::count_with_number(jeans, 3), "3 pairs of jeans");
 55 |     // Regular plurals
 56 |     assert_eq!(English::noun("cat", &Number::Plural), "cats");
 57 |     // Add a number 2-9 to the end of the word to try different forms.
 58 |     // Can use plural()
 59 |     assert_eq!(Noun::plural("die2"), "dice");
 60 |     // Use count function for better ergonomics if needed
 61 |     assert_eq!(Noun::count("man", 2), "men");
 62 |     // Use count_with_number function to preserve the number
 63 |     assert_eq!(Noun::count_with_number("nickel", 3), "3 nickels");
 64 |     // Invariant nouns
 65 |     assert_eq!(English::noun("sheep", &Number::Plural), "sheep");
 66 | 
 67 |     // --- Verbs ---
 68 |     // All verb functions can use either strings or Verb struct
 69 |     let pick_up = Verb::from("pick").with_particle("up");
 70 |     // Helper functions: past() , third_person(), present_participle(), infinitive() etc.
 71 |     assert_eq!(Verb::past(&pick_up), "picked up");
 72 |     assert_eq!(Verb::present_participle("walk"), "walking");
 73 |     assert_eq!(Verb::past_participle("go"), "gone");
 74 |     // Add a number 2-9 to the end of the word to try different forms.
 75 |     assert_eq!(Verb::past("lie"), "lay");
 76 |     assert_eq!(Verb::past("lie2"), "lied");
 77 |     // "to be" has the most verb forms in english and requires using verb()
 78 |     assert_eq!(
 79 |         English::verb(
 80 |             "be",
 81 |             &Person::First,
 82 |             &Number::Singular,
 83 |             &Tense::Present,
 84 |             &Form::Finite
 85 |         ),
 86 |         "am"
 87 |     );
 88 | 
 89 |     // --- Adjectives ---
 90 |     // Add a number 2-9 to the end of the word to try different forms. (Bad has the most forms at 3)
 91 |     assert_eq!(English::adj("bad", &Degree::Comparative), "more bad");
 92 |     assert_eq!(English::adj("bad", &Degree::Superlative), "most bad");
 93 |     assert_eq!(Adj::comparative("bad2"), "badder");
 94 |     assert_eq!(Adj::superlative("bad2"), "baddest");
 95 |     assert_eq!(Adj::comparative("bad3"), "worse");
 96 |     assert_eq!(Adj::superlative("bad3"), "worst");
 97 |     assert_eq!(Adj::positive("bad3"), "bad");
 98 | 
 99 |     // --- Pronouns ---
100 |     assert_eq!(
101 |         English::pronoun(
102 |             &Person::First,
103 |             &Number::Singular,
104 |             &Gender::Neuter,
105 |             &Case::PersonalPossesive
106 |         ),
107 |         "my"
108 |     );
109 |     assert_eq!(
110 |         English::pronoun(
111 |             &Person::First,
112 |             &Number::Singular,
113 |             &Gender::Neuter,
114 |             &Case::Possessive
115 |         ),
116 |         "mine"
117 |     );
118 | 
119 |     // --- Possessives ---
120 |     assert_eq!(English::add_possessive("dog"), "dog's");
121 |     assert_eq!(English::add_possessive("dogs"), "dogs'");
122 | }
123 | ```
124 | 
125 | ---
126 | 
127 | ## 🔧 Crate Overview
128 | 
129 | ### `english`
130 | 
131 | > The public API for verb conjugation and noun/adjective declension.
132 | 
133 | * Combines optimized data generated from `extractor` with inflection logic from `english-core`
134 | * Pure Rust, no external dependencies
135 | * Fast Binary search over pre-sorted arrays: `O(log n)` lookup.
136 | * Code generation ensures no runtime penalty.
137 | 
138 | ### `english-core`
139 | 
140 | > The core engine for English inflection — pure algorithmic logic.
141 | 
142 | * Implements the core rules for conjugation/declension
143 | * Used to classify forms as regular or irregular for the extractor
144 | * Has no data dependency — logic-only
145 | * Can be used standalone for an even smaller footprint (at the cost of some accuracy)
146 | 
147 | ### `extractor`
148 | 
149 | > A tool to process and refine Wiktionary data.
150 | 
151 | * Parses large English Wiktionary dumps
152 | * Extracts all verb, noun, and adjective forms
153 | * Uses `english-core` to filter out regular forms, preserving only irregulars
154 | * Generates sorted static arrays for use in `english`
155 | 
156 | ---
157 | 
158 | ## 📦 Obtaining Wiktionary Data & Running the Extractor
159 | 
160 | This project relies on raw data extracted from Wiktionary. Current version built with data from 8/17/2025.
161 | 
162 | - [Wiktextract (GitHub)](https://github.com/tatuylonen/wiktextract)
163 | - [Kaikki.org raw data](https://kaikki.org/dictionary/rawdata.html)
164 | 
165 | ### Steps
166 | 
167 | 1. Download the **raw Wiktextract JSONL dump** (~20 GB) from [Kaikki.org](https://kaikki.org/dictionary/rawdata.html).
168 | 2. Place the file somewhere accessible (e.g. `../rawwiki.jsonl`).
169 | 3. From the `extractor` folder, run: `cargo run --release ../rawwiki.jsonl`
170 | 4. Move the generated files adj_array.rs, noun_array.rs, verb_array.rs into the /src of english
171 | 
172 | ## Benchmarks
173 | Performance benchmarks were run on my M2 MacBook.
174 | 
175 | Writing benchmarks and tests for such a project is rather difficult and requires opinionated decisions. Many words may have alternative inflections, and the data in Wiktionary is not perfect. Many words might be both countable and uncountable, and the tagging of words may be inconsistent. This library includes a few uncountable words in its dataset, but not all. Uncountable words require special handling anyway. Take all benchmarks with a pound of salt, and write your own tests for your own use cases. Any suggestions to improve the benchmarking are highly appreciated.
176 | 
177 | ## Disclaimer
178 | Wiktionary data is often unstable and subject to weird changes. This means that the provided inflections may change unexpectedly. You can look at the diffs of *_array.rs files for a source of truth.
179 | 
180 | ## Inspirations and Thanks
181 | - Ole in the Bevy Discord suggested I use `phf` instead of sorted arrays; this resulted in up to 40% speedups
182 | - https://github.com/atteo/evo-inflector
183 | - https://github.com/plurals/pluralize
184 | 
185 | 
186 | ## 📄 License
187 | 
188 | - Code: Dual licensed under MIT and Apache © 2024 [gold-silver-copper](https://github.com/gold-silver-copper)
189 |   - [MIT](https://opensource.org/licenses/MIT)
190 |   - [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0)
191 | 
192 | - Data: Wiktionary content is dual-licensed under
193 |   - [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)
194 |   - [GNU FDL](https://www.gnu.org/licenses/fdl-1.3.html)
195 | 


--------------------------------------------------------------------------------
/extractor/noun_plural_check.csv:
--------------------------------------------------------------------------------
  1 | wiki_single,wiktionary_plural
  2 | encyclopedia,encyclopediæ
  3 | pound,pound
  4 | month,month
  5 | abscissa,abscissæ
  6 | lens,lens
  7 | lens,lentes
  8 | year,year
  9 | irc,irc's
 10 | stock,stocken
 11 | comma,commata
 12 | comma,commaes
 13 | turkey,turkies
 14 | portuguese,portugueses
 15 | index,index's
 16 | cow,kye
 17 | citrus,citrusses
 18 | penny,pens
 19 | deer,deers
 20 | tree,treen
 21 | boy,boyz
 22 | brother,brethren
 23 | sister,sistren
 24 | child,childer
 25 | daughter,daughtren
 26 | bee,been
 27 | honey,honies
 28 | how,how's
 29 | khaki,khakies
 30 | pupa,pupæ
 31 | vertebra,vertebræ
 32 | moose,meese
 33 | equinox,equinoctes
 34 | replica,repliche
 35 | mc,mc's
 36 | eye,eyen
 37 | eye,eyne
 38 | ok,ok's
 39 | acre,acre
 40 | foot,foot
 41 | knee,kneen
 42 | pizza,pizze
 43 | do,do's
 44 | maori,maories
 45 | larva,larvæ
 46 | shoe,shoon
 47 | shoe,shoen
 48 | why,why's
 49 | apparatus,apparatûs
 50 | apparatus,apparatūs
 51 | apparatus,apparati
 52 | camera,cameræ
 53 | camera,camerae
 54 | cheese,cheesen
 55 | complex,complices
 56 | cloth,clothes
 57 | box,boxen
 58 | basis,baseis
 59 | basis,basises
 60 | polynya,polynyi
 61 | piano,pianoes
 62 | octopus,octopusses
 63 | octopus,octopi
 64 | octopus,octopii
 65 | octopus,octopodes
 66 | octopus,octopus
 67 | beaver,beaver
 68 | donkey,donkies
 69 | fox,foxen
 70 | house,housen
 71 | house,hice
 72 | jew,jewes
 73 | subpoena,subpoenæ
 74 | c,c's
 75 | k,k's
 76 | q,q's
 77 | v,v's
 78 | calf,calfs
 79 | integer,integri
 80 | hyperbola,hyperbolæ
 81 | attorney,attornies
 82 | rhinoceros,rhinoceri
 83 | rhinoceros,rhinoceroi
 84 | rhinoceros,rhinocerotes
 85 | extremity,extremitys
 86 | granary,granarys
 87 | abc,abc's
 88 | elf,elfs
 89 | idea,ideæ
 90 | phylum,phylums
 91 | genus,genusses
 92 | genus,genii
 93 | uva,uvæ
 94 | area,areæ
 95 | mile,mile
 96 | flea,fleen
 97 | inch,inch
 98 | omnibus,omnibi
 99 | philistine,philistim
100 | real,réis
101 | seraph,seraphims
102 | pair,pair
103 | testa,testæ
104 | nexus,nexus
105 | sandwich,sandwichs
106 | candela,candelae
107 | lux,luces
108 | fatwa,fatawa
109 | axis,axiis
110 | penumbra,penumbræ
111 | panacea,panaceæ
112 | stepbrother,stepbrethren
113 | oryx,oryges
114 | wildebeest,wildebeesten
115 | lion,lion
116 | monkey,monkies
117 | valley,vallies
118 | hernia,herniæ
119 | tsar,tsari
120 | rex,reges
121 | mohawk,"""mohawks"" in all other senses"
122 | bacterium,bacteriums
123 | scenario,scenari
124 | nebula,nebulæ
125 | datum,datums
126 | formula,formulæ
127 | fungus,fungusses
128 | oasis,oasises
129 | vita,vitæ
130 | sagitta,sagittae
131 | hydra,hydræ
132 | zona,zonæ
133 | metropolis,metropolisses
134 | metropolis,metropolis's
135 | fenestra,fenestræ
136 | sow,swine
137 | chick,chicken
138 | lamb,lamber
139 | lamb,lambren
140 | pea,pease
141 | portcullis,portculli
142 | portcullis,portscullis
143 | agenda,agendae
144 | candelabrum,candelabrums
145 | yard,yard
146 | hobbit,hobbitses
147 | cornucopia,cornucopiæ
148 | furlong,furlong
149 | sock,sox
150 | gyros,gyroi
151 | dachshund,dachshunde
152 | toga,togæ
153 | bolus,boli
154 | pasta,paste
155 | bonus,boni
156 | ala,alæ
157 | catapulta,catapultæ
158 | braggadocio,braggadocioes
159 | braggadocio,braggadocii
160 | fax,faxxes
161 | virus,virusses
162 | virus,vira
163 | palpebra,palpebræ
164 | vagina,vaginæ
165 | axolotl,axolots
166 | cicada,cicadæ
167 | phobia,phobiæ
168 | chorus,chorusses
169 | chorus,chori
170 | fresco,freschi
171 | chimney,chimnies
172 | storey,stories
173 | minutia,minutiæ
174 | trachea,tracheæ
175 | corgi,corgwn
176 | amir,umara
177 | boa,boæ
178 | corona,coronæ
179 | fovea,foveæ
180 | lacuna,lacunæ
181 | tuber,tubera
182 | umbra,umbræ
183 | ballista,ballistæ
184 | fascia,fasciæ
185 | fauna,faunæ
186 | galley,gallies
187 | glans,glans
188 | punk,punx
189 | saliva,salivæ
190 | amnesia,amnesiæ
191 | inertia,inertiæ
192 | supernova,supernovæ
193 | sos,sos's
194 | platypus,platypodes
195 | asbestos,asbesti
196 | rabbi,rabbies
197 | persona,personæ
198 | viola,viole
199 | walrus,walri
200 | walrus,walrii
201 | molecule,moleculæ
202 | garda,gardaí
203 | sarcophagus,sarcophagusses
204 | pulley,pullies
205 | parabola,parabolæ
206 | arena,arenæ
207 | shits,the shits
208 | telly,tellys
209 | cassia,cassiæ
210 | retina,retinæ
211 | spatula,spatulæ
212 | aura,auræ
213 | ambrosia,ambrosiae
214 | hysteria,hysteriæ
215 | climax,climaces
216 | macron,macra
217 | orchestra,orchestrae
218 | paranoia,paranoiæ
219 | detritus,detrita
220 | cannula,cannulæ
221 | gaeltacht,gaeltachtaí
222 | rebus,rebusses
223 | rebus,rebi
224 | flora,floræ
225 | slr,slr's
226 | stria,striæ
227 | nausea,nauseæ
228 | uvula,uvulæ
229 | vulva,vulvæ
230 | lek,lekë
231 | lb,lbs.
232 | anemia,anemiæ
233 | tiara,tiarae
234 | tiara,tiaræ
235 | furcula,furculæ
236 | cc,cc's
237 | cornea,corneæ
238 | da,da's
239 | monomania,monomaniæ
240 | cello,celli
241 | giallo,gialli
242 | rpm,rpm's
243 | circus,circusses
244 | circus,circi
245 | fibula,fibulæ
246 | maxilla,maxillæ
247 | a,a's
248 | a,aes
249 | garda,gardaí
250 | i,i's
251 | j,j's
252 | o,o's
253 | slav,slavi
254 | v,v's
255 | z,z's
256 | bison,bisontes
257 | frappuccino,frappuccini
258 | macchiato,macchiati
259 | zeitgeist,zeitgeisten
260 | tabula,tabulæ
261 | willies,the willies
262 | pneumonia,pneumoniæ
263 | injuria,injuriæ
264 | areola,areolæ
265 | emir,umara
266 | rv,rv's
267 | ko,ko's
268 | acs,acs's
269 | fistula,fistulæ
270 | caesar,caesares
271 | kindergarten,kindergärten
272 | sclera,scleræ
273 | seta,setæ
274 | amaryllis,amaryllides
275 | specimen,specimina
276 | bandit,banditti
277 | chrysalis,chrysalisses
278 | cantata,cantate
279 | oss,oss's
280 | aficionado,aficionadi
281 | marquis,marquisses
282 | hots,the hots
283 | aula,aulæ
284 | airbus,airbii
285 | kobold,kobolde
286 | tropics,the tropics
287 | lamina,laminæ
288 | cbd,cbd's
289 | medusa,medusæ
290 | scrofula,scrofulæ
291 | brennschluss,brennschlüsse
292 | medulla,medullæ
293 | mentula,mentulæ
294 | stations,the stations
295 | corolla,corollæ
296 | sultan,salateen
297 | mastiff,mastives
298 | concha,conchæ
299 | chorea,choreæ
300 | sequela,sequelæ
301 | stewardess,stewardii
302 | sniffles,the sniffles
303 | fackeltanz,fackeltänze
304 | skins,the skins
305 | zugzwang,zugzwänge
306 | litas,litų
307 | aureus,aureii
308 | kleenex,kleenices
309 | kleenex,kleenices
310 | nineties,the nineties
311 | fossa,fossæ
312 | essentials,the essentials
313 | mk,mk's
314 | fach,fächer
315 | rhinocerot,rhinocerotes
316 | creeps,the creeps
317 | putto,puttoes
318 | membrana,membranæ
319 | shakes,the shakes
320 | rts,rts's
321 | conceptus,conceptūs
322 | cowan,cowanis
323 | phy,phy's
324 | spins,the spins
325 | splits,the splits
326 | blahs,the blahs
327 | wanderwort,wanderwörter
328 | bursa,bursæ
329 | fortepiano,fortepianoes
330 | coryza,coryzæ
331 | cursus,cursus
332 | cursus,cursūs
333 | cursus,cursi
334 | fursona,fursonae
335 | tupuna,tūpuna
336 | lunula,lunulæ
337 | pan,pen
338 | underhanded,the underhanded
339 | crista,cristæ
340 | faqih,fuqaha'
341 | orchis,orchisses
342 | fbo,fbo's
343 | stipula,stipulæ
344 | understory,understorys
345 | landammann,landammänner
346 | habenula,habenulæ
347 | wirtshaus,wirtshäuser
348 | stoa,stoæ
349 | cithara,citharæ
350 | rotavirus,rotaviri
351 | dwindles,the dwindles
352 | tuchus,tuchi
353 | collywobbles,the collywobbles
354 | auslaut,auslaute
355 | mirepoix,mirepoixs
356 | cyma,cymæ
357 | fossula,fossulæ
358 | fimbria,fimbriæ
359 | telangiectasia,telangiectasiæ
360 | rothschild,rothschildren
361 | userbox,userboxen
362 | fqih,fuqaha'
363 | mashgiach,mashgihim
364 | vizsla,vizslák
365 | oms,oms's
366 | dysesthesia,dysesthesiae
367 | kupuna,na kupuna
368 | ais,ais's
369 | vibratiuncula,vibratiunculæ
370 | scrophula,scrophulæ
371 | hartebeest,hartebeesten
372 | foveola,foveolæ
373 | nvocc,nvocc's
374 | cestui,cestuies
375 | lebensraum,lebensräume
376 | prester,presteres
377 | gwerz,gwerzioù
378 | sads,the sads
379 | sxs,sxs's
380 | antechinus,antechini
381 | lagena,lagenæ
382 | clavicula,claviculæ
383 | tps,tps's
384 | hydroecium,hydrœcia
385 | brisky,uncertain: briskys
386 | pacs,pacs's
387 | golding,goldinges
388 | patellula,patellulæ
389 | hoodman,hoodmans
390 | jotun,jötnar
391 | conferva,confervæ
392 | asbestus,asbesti
393 | sprachbund,sprachbünde
394 | temperates,the temperates
395 | primigravida,primigravidæ
396 | actinula,actinulæ
397 | sprachbund,sprachbünde
398 | aquafauna,aquafaunæ
399 | azat,azatk‘
400 | shu,shu's
401 | scu,scu's
402 | lallwort,lallwörter
403 | endotesta,endotestæ
404 | nachlass,nachlässe
405 | subcontinuum,subcontinuums
406 | dts,the dts
407 | tx,tx's
408 | leonardeschi,the leonardeschi
409 | wasteman,wastemans
410 | hoa,hoa's
411 | gaeilgeoir,gaeilgeoirí
412 | stubborns,the stubborns
413 | amebula,amebulæ
414 | dismals,the dismals
415 | urstromtal,urstromtäler
416 | sleepies,the sleepies
417 | chaoskampf,chaoskampfs
418 | uap,uap's
419 | paprikahuhn,paprikahühner
420 | ulx,ulx's
421 | lacc,lacc's
422 | sxs,sxs's
423 | pasuk,p'sukim
424 | lgbo,lgbo's
425 | lionesses,the lionesses
426 | residenzstadt,residenzstädte
427 | dphil,dphil's
428 | dolefuls,the dolefuls
429 | ozt,ozt's
430 | sandnigga,sandniggaz
431 | eaas,eaas's
432 | handie,handsies
433 | blahaj,blahajar
434 | his,his's
435 | kb,kb's
436 | ytpmv,ytpmv's
437 | fto,fto's
438 | meemies,the meemies
439 | hisatsinom,the hisatsinom
440 | amoebula,amoebulæ
441 | pwn,pwne
442 | sturzstrom,sturzströme
443 | insp,insp's
444 | vax,vaxen
445 | 


--------------------------------------------------------------------------------
/extractor/insane_noun.csv:
--------------------------------------------------------------------------------
  1 | word,plural,frequency
  2 | ,s,199688
  3 | y,ies,12214
  4 | ,es,8708
  5 | ,,4767
  6 | um,a,1790
  7 | an,en,1722
  8 | is,es,1714
  9 | ,e,983
 10 | us,i,917
 11 | on,a,607
 12 | ,ta,502
 13 | ful,sful,376
 14 | o,i,279
 15 | s,des,235
 16 | x,ces,216
 17 | ,im,186
 18 | s,i,173
 19 | rson,ople,166
 20 | ,i,154
 21 | f,ves,137
 22 | a,e,105
 23 | ,ses,84
 24 | fe,ves,76
 25 | ex,ices,71
 26 | ah,ot,70
 27 | e,i,66
 28 | ,en,64
 29 | ,x,63
 30 | a,i,60
 31 | ,n,51
 32 | ah,oth,51
 33 | x,ges,50
 34 | ,a,48
 35 | en,ina,41
 36 | a,ot,39
 37 | oot,eet,37
 38 | n,,37
 39 | ,y,37
 40 | ouse,ice,35
 41 | ,ren,35
 42 | a,y,32
 43 | ,z,27
 44 | ey,ies,26
 45 | ,zes,26
 46 | s,tes,25
 47 | e,ia,24
 48 | e,ae,22
 49 | a,oth,21
 50 | ff,ves,20
 51 | ,nes,20
 52 | is,eis,19
 53 | e,ai,19
 54 | us,era,18
 55 | o,ines,17
 56 | o,,17
 57 | ,m,16
 58 | ,ae,16
 59 | ,er,14
 60 | ooth,eeth,14
 61 | ful,esful,12
 62 | a,os,12
 63 | a,es,12
 64 | ,ot,11
 65 | ah,os,11
 66 | oose,eese,11
 67 | s,,10
 68 | yful,iesful,10
 69 | ,t,10
 70 | ,oth,10
 71 | es,ai,10
 72 | o,hi,10
 73 | o,a,10
 74 | x,kes,8
 75 | s,es,8
 76 | us,a,8
 77 | e,a,7
 78 | ets,tsy,7
 79 | ,un,7
 80 | ,ob,7
 81 | ,j,7
 82 | ,in,7
 83 | ,au,6
 84 | h,t,6
 85 | s,ta,6
 86 | es,ites,6
 87 | s,tia,6
 88 | ok,ki,6
 89 | z,ces,5
 90 | um,i,5
 91 | os,e,5
 92 | er,arim,5
 93 | ,g,5
 94 | ,k,5
 95 | ,ach,5
 96 | ,ia,5
 97 | ,tes,4
 98 | ed,dim,4
 99 | us,ii,4
100 | ,at,4
101 | ,een,4
102 | ,u,4
103 | us,ora,4
104 | ,ie,4
105 | a,,4
106 | el,lach,4
107 | ,ech,4
108 | ,ata,4
109 | us,es,4
110 | en,ines,4
111 | ,it,4
112 | ,r,4
113 | ur,ora,4
114 | us,odes,4
115 | er,rim,4
116 | q,t,4
117 | ,ar,3
118 | us,is,3
119 | os,i,3
120 | ,lar,3
121 | um,ae,3
122 | k,it,3
123 | ,ion,3
124 | ,ok,3
125 | ,is,3
126 | ,den,3
127 | f,vim,3
128 | ut,ita,3
129 | s,ra,3
130 | es,ae,3
131 | l,ux,3
132 | et,tim,3
133 | l,is,3
134 | a,or,3
135 | eps,ipes,3
136 | ad,den,3
137 | el,alim,3
138 | i,a,3
139 | a,ur,3
140 | ,os,3
141 | an,onim,2
142 | us,ae,2
143 | i,,2
144 | sm,,2
145 | os,sim,2
146 | erfamilias,resfamilias,2
147 | us,des,2
148 | ,ai,2
149 | aon,eonim,2
150 | a,in,2
151 | full,sfull,2
152 | m,ns,2
153 | en,anim,2
154 | ia,es,2
155 | ot,ten,2
156 | n,i,2
157 | k,t,2
158 | ar,roth,2
159 | ,se,2
160 | u,i,2
161 | et,ot,2
162 | ek,ky,2
163 | ,q,2
164 | onseigneur,esseigneurs,2
165 | ,des,2
166 | adam,esdames,2
167 | f,ven,2
168 | o,e,2
169 | ,oi,2
170 | e,ies,2
171 | s,res,2
172 | ah,im,2
173 | t,s,2
174 | ya,i,2
175 | e,oi,2
176 | ,ov,2
177 | eh,oth,2
178 | fful,vesful,2
179 | each,igh,2
180 | il,ux,2
181 | e,ce,2
182 | fidi,wafid,2
183 | anservant,enservants,2
184 | ademoiselle,esdemoiselles,2
185 | ak,ci,2
186 | f,vs,2
187 | ,eanna,2
188 | anchild,enchildren,2
189 | el,lech,2
190 | eder,adarim,2
191 | ,th,2
192 | ka,ok,2
193 | a,een,2
194 | ny,ce,2
195 | li,tin,2
196 | s,e,2
197 | ar,our,2
198 | e,s,2
199 | os,atim,2
200 | er,ri,2
201 | tis,des,2
202 | q,it,2
203 | ,ean,2
204 | ia,e,2
205 | e,ata,2
206 | y,ia,2
207 | adame,esdames,2
208 | onsieur,essieurs,2
209 | os,ea,2
210 | a,he,2
211 | i,een,2
212 | ,ir,2
213 | sibi,wasib,2
214 | x,ches,2
215 | eps,ipites,2
216 | ets,tsi,2
217 | juz,ajza,1
218 | ah,iyos,1
219 | rijite,warij,1
220 | em,omim,1
221 | oof,eef,1
222 | okhol,akhly,1
223 | umlungu,abelungu,1
224 | ee,,1
225 | ein,anim,1
226 | saw,asaw,1
227 | ,le,1
228 | misr,amsar,1
229 | le,rren,1
230 | orafe,erafe,1
231 | eg,agim,1
232 | uramba,iramba,1
233 | sa,jes,1
234 | t,ot,1
235 | ax,ices,1
236 | ,essrs,1
237 | nduna,zinduna,1
238 | an,ni,1
239 | est,sunt,1
240 | khara,akhir,1
241 | nasheed,anasheed,1
242 | y,e,1
243 | c,,1
244 | amenukal,imenukalen,1
245 | add,udud,1
246 | w,ois,1
247 | zil,azil,1
248 | ann,enn,1
249 | di,oud,1
250 | ovabitch,sovbitches,1
251 | en,onim,1
252 | ek,iky,1
253 | mganga,waganga,1
254 | y,ce,1
255 | ,xes,1
256 | ra,war,1
257 | es,osim,1
258 | h,ot,1
259 | tl,,1
260 | h,s,1
261 | nia,en,1
262 | moran,ilmoran,1
263 | umkhwetha,abakwetha,1
264 | inego,sinegoes,1
265 | ah,ioth,1
266 | by,sby,1
267 | us,,1
268 | at,ot,1
269 | i,onin,1
270 | es,ides,1
271 | rs,esdames,1
272 | kgotla,dikgotla,1
273 | ,ovi,1
274 | riya,wari,1
275 | amazigh,imazighen,1
276 | ri,li,1
277 | ,tsy,1
278 | ,ja,1
279 | ,bi,1
280 | umuzungu,abazungu,1
281 | y,ees,1
282 | e,tes,1
283 | soa,osi,1
284 | ngaka,dingaka,1
285 | r,yr,1
286 | erg,areg,1
287 | kibanja,bibanja,1
288 | umkhwetha,abakhwetha,1
289 | ,yat,1
290 | h,e,1
291 | ie,ce,1
292 | aphil,ephilim,1
293 | ,tys,1
294 | on,i,1
295 | s,is,1
296 | oste,atim,1
297 | umuzungu,bazungu,1
298 | ,ens,1
299 | es,aisim,1
300 | eikh,uyookh,1
301 | adol,edolim,1
302 | itur,untur,1
303 | er,res,1
304 | us,ina,1
305 | o,is,1
306 | ie,s,1
307 | uvabitch,suvbitches,1
308 | ashah,shioth,1
309 | a,s,1
310 | s,x,1
311 | onsr,essrs,1
312 | hisname,theirnames,1
313 | j,ljj,1
314 | da,ren,1
315 | sangoma,zangoma,1
316 | indiq,andaqa,1
317 | oote,eete,1
318 | woman,swomen,1
319 | ,ah,1
320 | mzungu,wazungu,1
321 | e,ren,1
322 | ongolawi,anagla,1
323 | wa,awat,1
324 | tambala,matambala,1
325 | lma,walim,1
326 | tar,darka,1
327 | uy,luim,1
328 | mbongi,zimbongi,1
329 | sibongo,zibongo,1
330 | omanhene,amanhene,1
331 | als,ulus,1
332 | eder,adorim,1
333 | martial,smartial,1
334 | esh,ashim,1
335 | ,ons,1
336 | aliah,elihim,1
337 | ootful,eetful,1
338 | id,eda,1
339 | ,il,1
340 | ils,ulus,1
341 | zeeyeh,wazee,1
342 | myometritis,endometritides,1
343 | it,aisim,1
344 | tur,ntur,1
345 | imrah,emirot,1
346 | it,eysim,1
347 | sente,lisente,1
348 | e,ides,1
349 | s,isim,1
350 | ,as,1
351 | x,ctes,1
352 | mbongi,imbongi,1
353 | ification,fications,1
354 | ,hang,1
355 | ol,ly,1
356 | hizb,ahzab,1
357 | ya,at,1
358 | oothmark,eethmarks,1
359 | afiz,uffaz,1
360 | umzulu,amazulu,1
361 | al,is,1
362 | waqf,awqaf,1
363 | eq,aqim,1
364 | ,na,1
365 | an,n,1
366 | ,eh,1
367 | ,em,1
368 | afek,fekot,1
369 | is,u,1
370 | ah,iyoth,1
371 | ushaf,asahif,1
372 | er,ren,1
373 | ,zim,1
374 | achresis,echreses,1
375 | r,ar,1
376 | s,thes,1
377 | mosarwa,basarwa,1
378 | nn,wan,1
379 | ars,uroos,1
380 | en,onem,1
381 | ach,chim,1
382 | ow,attle,1
383 | nyanga,zinyanga,1
384 | esh,shayim,1
385 | tl,meh,1
386 | ed,de,1
387 | ilbab,alabib,1
388 | ok,ukim,1
389 | mbizo,zimbizo,1
390 | ars,urus,1
391 | araph,eraphim,1
392 | eole,olae,1
393 | wa,een,1
394 | intaqah,anatiq,1
395 | ah,iyot,1
396 | sir,asir,1
397 | r,essrs,1
398 | ces,ses,1
399 | ek,kim,1
400 | lme,walim,1
401 | nie,ce,1
402 | ace,i,1
403 | ootpaw,eetpaws,1
404 | do,be,1
405 | mon,yn,1
406 | i,anim,1
407 | ah,in,1
408 | ann,inn,1
409 | c,x,1
410 | alim,ulama,1
411 | ful,sfuls,1
412 | rani,ara,1
413 | is,eisim,1
414 | itongo,amatongo,1
415 | ote,a,1
416 | wmwd,ymydau,1
417 | eyrir,aurar,1
418 | lilangeni,emalangeni,1
419 | ,la,1
420 | hadith,ahadith,1
421 | sharif,ashraf,1
422 | kungwi,makungwi,1
423 | ,me,1
424 | wakf,awkaf,1
425 | anfriend,enfriends,1
426 | y,ata,1
427 | e,lun,1
428 | un,en,1
429 | us,sim,1
430 | s,ysim,1
431 | ,ch,1
432 | aliah,elichim,1
433 | omyn,ymyn,1
434 | diptotes,nouns,1
435 | mus,wamis,1
436 | kwerekwere,amakwerekwere,1
437 | nie,s,1
438 | y,sies,1
439 | ek,ki,1
440 | i,ah,1
441 | am,omim,1
442 | jid,ajid,1
443 | aykh,uyookh,1
444 | esh,ashoth,1
445 | s,tot,1
446 | oret,urot,1
447 | us,oi,1
448 | moloi,baloi,1
449 | ipsis,eipses,1
450 | lolwapa,malwapa,1
451 | gepik,qepik,1
452 | et,itten,1
453 | singhalese,cingalese,1
454 | ariqa,uruq,1
455 | o,eaux,1
456 | ,ut,1
457 | ,iv,1
458 | apanca,epanche,1
459 | ofabitch,sabitches,1
460 | oop,eep,1
461 | inkhosi,amakhosi,1
462 | ebsi,bassa,1
463 | ekwele,bipkwele,1
464 | omin,imin,1
465 | otshelo,etshelo,1
466 | mosotho,basotho,1
467 | an,,1
468 | ,thes,1
469 | eder,iddarim,1
470 | muzungu,wazungu,1
471 | os,ai,1
472 | is,asa,1
473 | triptotes,nouns,1
474 | rs,mes,1
475 | ,jev,1
476 | beast,,1
477 | ,sa,1
478 | sicoco,zicoco,1
479 | inn,awan,1
480 | l,aux,1
481 | s,zes,1
482 | asul,usul,1
483 | ets,tsiv,1
484 | mwalimu,walimu,1
485 | add,udood,1
486 | ayit,eytim,1
487 | elet,lot,1
488 | eh,im,1
489 | ,ada,1
490 | iss,lles,1
491 | lmah,walim,1
492 | aki,eke,1
493 | feful,vesful,1
494 | ophato,ephato,1
495 | ard,nards,1
496 | iec,ce,1
497 | as,es,1
498 | i,anin,1
499 | ,gun,1
500 | er,orim,1
501 | ofabitch,sofbitches,1
502 | aliach,luchim,1
503 | mzee,wazee,1
504 | ommin,immin,1
505 | er,,1
506 | us,eres,1
507 | wa,a,1
508 | ,h,1
509 | i,ar,1
510 | ratal,irtal,1
511 | ko,ot,1
512 | eq,qim,1
513 | homme,shommes,1
514 | n,sa,1
515 | s,ces,1
516 | di,adim,1
517 | ,ekh,1
518 | okoro,ekoro,1
519 | rotl,artal,1
520 | cow,kine,1
521 | nik,ota,1
522 | aliah,luchim,1
523 | i,onim,1
524 | ex,iges,1
525 | tan,atin,1
526 | desac,sdesac,1
527 | nkisi,minkisi,1
528 | e,tini,1
529 | aliah,lihim,1
530 | ,an,1
531 | in,a,1
532 | kama,cama,1
533 | ann,innan,1
534 | cimibundu,vimbundu,1
535 | o,ya,1
536 | osanto,isanti,1
537 | ach,chaot,1
538 | af,efim,1
539 | eder,idarim,1
540 | lmeh,walim,1
541 | elet,loth,1
542 | loti,maloti,1
543 | in,e,1
544 | afir,uffar,1
545 | thorax,horaces,1
546 | s,ntes,1
547 | et,ittim,1
548 | wthyn,ythynnod,1
549 | ,kuna,1
550 | hanedd,aneddion,1
551 | rah,war,1
552 | mot,smots,1
553 | um,en,1
554 | is,eson,1
555 | arif,urafa,1
556 | yot,ik,1
557 | ut,eet,1
558 | th,yoth,1
559 | aliah,lichim,1
560 | ,ni,1
561 | orena,arena,1
562 | ,len,1
563 | hadeeth,ahadeeth,1
564 | is,ai,1
565 | r,ir,1
566 | anne,enne,1
567 | d,ood,1
568 | eh,ot,1
569 | nik,ari,1
570 | eh,os,1
571 | gid,agid,1
572 | each,ich,1
573 | o,mus,1
574 | el,loch,1
575 | imrah,emiroth,1
576 | ,ims,1
577 | as,ones,1
578 | ,te,1
579 | ,och,1
580 | achua,wochua,1
581 | d,ud,1
582 | ex,icia,1
583 | huk,uit,1
584 | aliah,luhim,1
585 | ,ke,1
586 | ya,,1
587 | oritz,ritzim,1
588 | motswana,batswana,1
589 | k,cy,1
590 | an,ns,1
591 | likuta,makuta,1
592 | muwashshah,tawashih,1
593 | nkishi,minkishi,1
594 | y,es,1
595 | ys,ies,1
596 | ok,ky,1
597 | darme,sdarmes,1
598 | egetz,kotzim,1
599 | dah,ada,1
600 | ah,iot,1
601 | it,tin,1
602 | rasah,aris,1
603 | alian,elsh,1
604 | e,ya,1
605 | nill,illion,1
606 | elek,alakim,1
607 | ti,pol,1
608 | al,len,1
609 | ann,innah,1
610 | kgosana,dikgosana,1
611 | ,od,1
612 | ,ji,1
613 | eh,at,1
614 | kgosi,dikgosi,1
615 | sak,iskei,1
616 | etrum,hertum,1
617 | igqirha,amagqirha,1
618 | itrix,rices,1
619 | hil,jil,1
620 | lal,alil,1
621 | funa,nagu,1
622 | y,s,1
623 | a,ek,1
624 | herero,ovaherero,1
625 | alim,ulema,1
626 | inkosi,amakhosi,1
627 | 


--------------------------------------------------------------------------------
/extractor/analyzed_endings.csv:
--------------------------------------------------------------------------------
   1 | singular_suffix,plural_suffix,count
   2 | a,as,9629
   3 | m,ms,1276
   4 | f,fs,918
   5 | i,i,724
   6 | s,s,688
   7 | a,a,568
   8 | o,oes,522
   9 | a,ata,499
  10 | h,hs,375
  11 | n,n,364
  12 | e,e,339
  13 | h,hes,324
  14 | o,o,245
  15 | y,ys,227
  16 | lus,li,226
  17 | is,ides,222
  18 | u,u,222
  19 | ron,ra,199
  20 | ion,ia,189
  21 | n,ns,180
  22 | s,ss,179
  23 | os,oi,166
  24 | erson,eople,165
  25 | z,zes,164
  26 | t,t,134
  27 | k,k,119
  28 | h,h,110
  29 | x,xes,107
  30 | g,g,98
  31 | r,r,97
  32 | eus,ei,91
  33 | s,ses,90
  34 | to,ti,85
  35 | e,es,84
  36 | cus,ci,83
  37 | l,l,82
  38 | s,sses,81
  39 | tis,tes,74
  40 | ius,ii,72
  41 | ife,ives,70
  42 | tus,ti,70
  43 | no,ni,67
  44 | rus,ri,65
  45 | eful,esful,64
  46 | d,d,63
  47 | u,us,59
  48 | tful,tsful,58
  49 | m,m,57
  50 | sus,si,53
  51 | gus,gi,52
  52 | k,ki,52
  53 | nful,nsful,52
  54 | r,res,50
  55 | y,y,49
  56 | rful,rsful,46
  57 | z,z,46
  58 | lo,li,45
  59 | i,ies,43
  60 | hus,hi,42
  61 | lon,la,42
  62 | c,c,41
  63 | men,mina,41
  64 | nus,ni,40
  65 | ix,ices,39
  66 | n,nim,39
  67 | n,nes,38
  68 | non,na,38
  69 | ton,ta,38
  70 | ka,ki,37
  71 | an,a,36
  72 | e,en,36
  73 | x,x,36
  74 | mus,mi,35
  75 | t,ts,35
  76 | d,dren,34
  77 | r,rim,32
  78 | x,xs,30
  79 | a,ai,29
  80 | bus,bi,29
  81 | h,him,29
  82 | kful,ksful,29
  83 | lis,les,29
  84 | tex,tices,28
  85 | lful,lsful,27
  86 | p,p,27
  87 | l,les,25
  88 | ne,ni,25
  89 | w,w,25
  90 | r,ri,24
  91 | z,zzes,24
  92 | pus,pi,23
  93 | j,jes,22
  94 | pful,psful,22
  95 | ax,aces,21
  96 | aff,aves,20
  97 | do,di,20
  98 | o,ones,20
  99 | ta,te,20
 100 | fan,fen,19
 101 | dful,dsful,17
 102 | f,f,17
 103 | io,i,17
 104 | la,le,17
 105 | le,lia,17
 106 | dus,di,16
 107 | gful,gsful,16
 108 | ns,ntes,16
 109 | b,b,15
 110 | con,ca,15
 111 | eon,ea,15
 112 | n,ne,15
 113 | na,ne,15
 114 | r,rs,15
 115 | d,des,14
 116 | i,ia,14
 117 | lex,lices,14
 118 | lis,leis,14
 119 | ra,re,14
 120 | rah,rot,14
 121 | t,ten,14
 122 | d,dae,13
 123 | n,ni,13
 124 | xon,xa,13
 125 | zo,zi,13
 126 | d,dim,12
 127 | h,hen,12
 128 | i,im,12
 129 | ia,ie,12
 130 | so,si,12
 131 | go,gines,11
 132 | ka,ky,11
 133 | mful,msful,11
 134 | nus,nera,11
 135 | ris,res,11
 136 | ro,ri,11
 137 | g,ges,10
 138 | j,j,10
 139 | k,kim,10
 140 | l,lim,10
 141 | na,ny,10
 142 | rah,roth,10
 143 | t,tim,10
 144 | tes,tai,10
 145 | wful,wsful,10
 146 | co,chi,9
 147 | h,hi,9
 148 | pex,pices,9
 149 | t,te,9
 150 | t,tes,9
 151 | u,ux,9
 152 | zah,zot,9
 153 | a,at,8
 154 | dex,dices,8
 155 | g,ge,8
 156 | g,gim,8
 157 | hful,hsful,8
 158 | is,i,8
 159 | le,lae,8
 160 | le,li,8
 161 | mo,mi,8
 162 | mon,ma,8
 163 | nah,not,8
 164 | ney,nies,8
 165 | q,q,8
 166 | re,ri,8
 167 | ta,ti,8
 168 | vus,vi,8
 169 | y,yes,8
 170 | yx,yces,8
 171 | za,ze,8
 172 | a,az,7
 173 | bful,bsful,7
 174 | k,ky,7
 175 | lah,lot,7
 176 | r,ra,7
 177 | te,ti,7
 178 | vis,ves,7
 179 | z,zy,7
 180 | as,ata,6
 181 | bis,bes,6
 182 | do,dines,6
 183 | ha,hot,6
 184 | hah,hot,6
 185 | hon,ha,6
 186 | lah,loth,6
 187 | m,ma,6
 188 | ma,me,6
 189 | n,ny,6
 190 | ns,ntia,6
 191 | pah,pot,6
 192 | ra,rot,6
 193 | s,sen,6
 194 | ta,tes,6
 195 | u,ua,6
 196 | v,v,6
 197 | z,zim,6
 198 | za,zot,6
 199 | a,aim,5
 200 | ah,at,5
 201 | as,ades,5
 202 | as,ai,5
 203 | ax,akes,5
 204 | co,ci,5
 205 | g,gen,5
 206 | ga,gi,5
 207 | h,hy,5
 208 | ha,hos,5
 209 | hful,hesful,5
 210 | l,lach,5
 211 | le,lai,5
 212 | me,mai,5
 213 | nis,nes,5
 214 | o,oj,5
 215 | ra,roth,5
 216 | ryful,riesful,5
 217 | s,sim,5
 218 | se,si,5
 219 | sful,sesful,5
 220 | yful,ysful,5
 221 | yx,yges,5
 222 | zah,zoth,5
 223 | as,ates,4
 224 | bah,both,4
 225 | ce,ci,4
 226 | es,etes,4
 227 | ex,eces,4
 228 | ex,eges,4
 229 | ge,gi,4
 230 | gon,ga,4
 231 | hah,hoth,4
 232 | io,ia,4
 233 | key,kies,4
 234 | l,lech,4
 235 | l,li,4
 236 | l,lz,4
 237 | la,los,4
 238 | la,lot,4
 239 | lum,li,4
 240 | m,mata,4
 241 | m,mim,4
 242 | mes,mites,4
 243 | mis,mes,4
 244 | mur,mora,4
 245 | n,ntes,4
 246 | nah,noth,4
 247 | oon,oa,4
 248 | pa,pot,4
 249 | pus,podes,4
 250 | sa,se,4
 251 | t,ti,4
 252 | ux,uces,4
 253 | vo,vi,4
 254 | x,xen,4
 255 | ys,yes,4
 256 | z,zi,4
 257 | za,zoth,4
 258 | aad,aden,3
 259 | al,aux,3
 260 | bex,bices,3
 261 | ceps,cipes,3
 262 | d,da,3
 263 | d,dau,3
 264 | d,der,3
 265 | d,dy,3
 266 | dah,dot,3
 267 | der,darim,3
 268 | e,ees,3
 269 | e,er,3
 270 | eus,eis,3
 271 | fah,fot,3
 272 | fex,fices,3
 273 | h,ha,3
 274 | h,he,3
 275 | h,hot,3
 276 | h,hoth,3
 277 | ha,hoth,3
 278 | hok,hki,3
 279 | ix,ikes,3
 280 | k,ken,3
 281 | kah,koth,3
 282 | kon,ka,3
 283 | l,lia,3
 284 | l,ln,3
 285 | la,li,3
 286 | m,mi,3
 287 | me,ma,3
 288 | n,nob,3
 289 | n,not,3
 290 | n,noth,3
 291 | n,nz,3
 292 | na,ni,3
 293 | ne,na,3
 294 | nets,ntsy,3
 295 | o,og,3
 296 | pah,poth,3
 297 | pe,pae,3
 298 | po,pi,3
 299 | ps,pes,3
 300 | pus,pora,3
 301 | put,pita,3
 302 | q,qim,3
 303 | r,re,3
 304 | r,ren,3
 305 | ra,ry,3
 306 | re,ria,3
 307 | rex,rices,3
 308 | s,se,3
 309 | sa,sy,3
 310 | sey,sies,3
 311 | t,ter,3
 312 | t,ty,3
 313 | u,ues,3
 314 | u,uit,3
 315 | uk,uit,3
 316 | v,vim,3
 317 | va,vot,3
 318 | vah,voth,3
 319 | vey,vies,3
 320 | ya,yot,3
 321 | yfe,yves,3
 322 | yon,ya,3
 323 | z,ze,3
 324 | aful,asful,2
 325 | ail,aux,2
 326 | ak,at,2
 327 | al,ais,2
 328 | aq,at,2
 329 | as,ae,2
 330 | az,aces,2
 331 | b,bz,2
 332 | bah,bot,2
 333 | ban,ben,2
 334 | bos,batim,2
 335 | ce,cies,2
 336 | ceps,cipites,2
 337 | cus,cera,2
 338 | d,dden,2
 339 | d,de,2
 340 | d,deen,2
 341 | d,dz,2
 342 | da,des,2
 343 | da,dy,2
 344 | di,deen,2
 345 | e,eg,2
 346 | e,eis,2
 347 | e,ek,2
 348 | e,ez,2
 349 | es,edes,2
 350 | ez,eces,2
 351 | fer,frim,2
 352 | fo,fi,2
 353 | fus,fi,2
 354 | g,ga,2
 355 | g,gi,2
 356 | ga,ge,2
 357 | ged,gdim,2
 358 | gey,gies,2
 359 | gia,ge,2
 360 | h,hie,2
 361 | h,hz,2
 362 | han,honim,2
 363 | he,hae,2
 364 | heder,hadarim,2
 365 | hen,hanim,2
 366 | het,htim,2
 367 | hex,hices,2
 368 | hos,he,2
 369 | i,iz,2
 370 | ia,i,2
 371 | iah,iot,2
 372 | io,ii,2
 373 | iq,iit,2
 374 | ism,i,2
 375 | itis,ides,2
 376 | ius,ia,2
 377 | ix,iches,2
 378 | j,jj,2
 379 | k,ke,2
 380 | k,ker,2
 381 | l,lion,2
 382 | l,llar,2
 383 | la,loth,2
 384 | lah,los,2
 385 | ley,lies,2
 386 | lfe,lves,2
 387 | lli,ltin,2
 388 | lum,lae,2
 389 | lx,lces,2
 390 | m,mat,2
 391 | m,my,2
 392 | mah,mot,2
 393 | mah,moth,2
 394 | manchild,menchildren,2
 395 | manservant,menservants,2
 396 | me,mata,2
 397 | mel,mlech,2
 398 | men,mines,2
 399 | n,nen,2
 400 | n,ner,2
 401 | ne,nae,2
 402 | ne,nai,2
 403 | nets,ntsi,2
 404 | nny,nce,2
 405 | nos,ne,2
 406 | nx,nces,2
 407 | oe,oae,2
 408 | oe,oai,2
 409 | on,oi,2
 410 | oot,oten,2
 411 | ous,odes,2
 412 | ous,oes,2
 413 | ox,oces,2
 414 | p,pen,2
 415 | p,py,2
 416 | pa,pe,2
 417 | pe,pai,2
 418 | q,qq,2
 419 | q,qun,2
 420 | r,rot,2
 421 | r,roth,2
 422 | rah,ros,2
 423 | rey,ries,2
 424 | rf,rven,2
 425 | ros,rsim,2
 426 | rus,rii,2
 427 | s,sa,2
 428 | sa,si,2
 429 | sar,sour,2
 430 | sia,ses,2
 431 | sis,seis,2
 432 | t,ta,2
 433 | t,tn,2
 434 | t,tot,2
 435 | ta,tot,2
 436 | ta,ty,2
 437 | te,tae,2
 438 | terfamilias,tresfamilias,2
 439 | tes,tae,2
 440 | tus,ta,2
 441 | tus,tera,2
 442 | tyful,tiesful,2
 443 | u,uen,2
 444 | uis,ues,2
 445 | uq,ut,2
 446 | us,udes,2
 447 | us,ura,2
 448 | uus,ui,2
 449 | v,ve,2
 450 | v,vin,2
 451 | veh,voth,2
 452 | vets,vtsy,2
 453 | vo,va,2
 454 | w,wen,2
 455 | wa,ween,2
 456 | xful,xesful,2
 457 | xus,xi,2
 458 | ya,yoth,2
 459 | yah,yot,2
 460 | ys,ydes,2
 461 | zis,zes,2
 462 | a,aa,1
 463 | a,ades,1
 464 | a,aes,1
 465 | a,ak,1
 466 | a,ala,1
 467 | a,alar,1
 468 | a,am,1
 469 | a,ar,1
 470 | a,athes,1
 471 | aal,alen,1
 472 | aan,ani,1
 473 | aan,ans,1
 474 | achua,wochua,1
 475 | ae,ai,1
 476 | afful,avesful,1
 477 | afidi,awafid,1
 478 | ah,ae,1
 479 | ais,aeson,1
 480 | alim,ulama,1
 481 | alim,ulema,1
 482 | alma,awalim,1
 483 | almah,awalim,1
 484 | alme,awalim,1
 485 | almeh,awalim,1
 486 | amazigh,imazighen,1
 487 | amenukal,imenukalen,1
 488 | amus,awamis,1
 489 | an,asa,1
 490 | ann,awan,1
 491 | arijite,awarij,1
 492 | ariya,awari,1
 493 | as,aces,1
 494 | as,antes,1
 495 | asibi,awasib,1
 496 | atl,a,1
 497 | atur,antur,1
 498 | azeeyeh,awazee,1
 499 | b,be,1
 500 | b,bim,1
 501 | b,bims,1
 502 | b,bin,1
 503 | b,bob,1
 504 | ba,be,1
 505 | bel,blach,1
 506 | ber,bri,1
 507 | bi,ba,1
 508 | bi,bah,1
 509 | bi,banim,1
 510 | bi,banin,1
 511 | bi,bonim,1
 512 | bi,bonin,1
 513 | bit,btin,1
 514 | bkhara,bakhir,1
 515 | bok,bki,1
 516 | boste,batim,1
 517 | bs,bes,1
 518 | bwthyn,bythynnod,1
 519 | c,cada,1
 520 | c,ces,1
 521 | c,ci,1
 522 | ca,che,1
 523 | cah,cos,1
 524 | cah,cot,1
 525 | cah,coth,1
 526 | ce,cs,1
 527 | cek,cky,1
 528 | cen,cines,1
 529 | cful,csful,1
 530 | cow,cattle,1
 531 | cow,kine,1
 532 | cwmwd,cymydau,1
 533 | d,dai,1
 534 | d,den,1
 535 | d,di,1
 536 | d,dia,1
 537 | d,din,1
 538 | d,dn,1
 539 | d,dun,1
 540 | da,de,1
 541 | da,din,1
 542 | da,dot,1
 543 | dah,dos,1
 544 | dah,doth,1
 545 | dars,duroos,1
 546 | dars,durus,1
 547 | dd,dood,1
 548 | dd,dud,1
 549 | dda,dren,1
 550 | de,dia,1
 551 | de,dren,1
 552 | del,dlach,1
 553 | del,dloch,1
 554 | der,dren,1
 555 | di,d,1
 556 | di,da,1
 557 | diptotes,nouns,1
 558 | do,dis,1
 559 | dongolawi,danagla,1
 560 | drasah,daris,1
 561 | dy,dsies,1
 562 | dyful,diesful,1
 563 | e,ebi,1
 564 | e,edes,1
 565 | e,em,1
 566 | e,eob,1
 567 | e,ese,1
 568 | e,eta,1
 569 | ea,ee,1
 570 | ea,es,1
 571 | eal,eis,1
 572 | ebeast,e,1
 573 | ee,es,1
 574 | eed,ede,1
 575 | eest,esunt,1
 576 | ef,evim,1
 577 | eh,eot,1
 578 | eis,easa,1
 579 | ekwele,bipkwele,1
 580 | el,eaux,1
 581 | el,eis,1
 582 | em,ens,1
 583 | eo,ei,1
 584 | eo,emus,1
 585 | erg,areg,1
 586 | es,e,1
 587 | es,eis,1
 588 | es,eisim,1
 589 | es,eysim,1
 590 | etl,emeh,1
 591 | eu,ei,1
 592 | eus,ea,1
 593 | ew,eois,1
 594 | eyrir,aurar,1
 595 | f,fe,1
 596 | f,fi,1
 597 | f,fim,1
 598 | fals,fulus,1
 599 | fanne,fenne,1
 600 | far,froth,1
 601 | fer,f,1
 602 | fer,farim,1
 603 | fet,ftim,1
 604 | fid,feda,1
 605 | fils,fulus,1
 606 | foote,feete,1
 607 | footful,feetful,1
 608 | footpaw,feetpaws,1
 609 | fsir,fasir,1
 610 | fut,feet,1
 611 | g,gar,1
 612 | g,ghang,1
 613 | g,gin,1
 614 | g,gn,1
 615 | ga,ghe,1
 616 | gadol,gedolim,1
 617 | gaon,geonim,1
 618 | gas,ges,1
 619 | ge,goi,1
 620 | gepik,qepik,1
 621 | ger,gri,1
 622 | get,gitten,1
 623 | get,gittim,1
 624 | ghanedd,ganeddion,1
 625 | go,ghi,1
 626 | gr,gar,1
 627 | gus,ges,1
 628 | h,hae,1
 629 | h,hat,1
 630 | h,hean,1
 631 | h,heen,1
 632 | h,hin,1
 633 | h,hiv,1
 634 | h,hod,1
 635 | h,hoi,1
 636 | h,hons,1
 637 | h,hos,1
 638 | h,hun,1
 639 | ha,hi,1
 640 | hadd,hudood,1
 641 | hadd,hudud,1
 642 | hadeeth,ahadeeth,1
 643 | hadith,ahadith,1
 644 | hafiz,huffaz,1
 645 | hah,hiot,1
 646 | hah,hioth,1
 647 | hah,hiyos,1
 648 | hah,hiyot,1
 649 | hah,hiyoth,1
 650 | hah,hos,1
 651 | haliach,hluchim,1
 652 | haliah,helichim,1
 653 | haliah,helihim,1
 654 | haliah,hlichim,1
 655 | haliah,hlihim,1
 656 | haliah,hluchim,1
 657 | haliah,hluhim,1
 658 | ham,homim,1
 659 | har,hroth,1
 660 | harif,hurafa,1
 661 | hat,hot,1
 662 | haykh,huyookh,1
 663 | he,hai,1
 664 | he,hi,1
 665 | heder,hadorim,1
 666 | hegetz,hkotzim,1
 667 | heikh,huyookh,1
 668 | hein,hanim,1
 669 | hel,halim,1
 670 | hel,hlach,1
 671 | helek,halakim,1
 672 | hem,homim,1
 673 | her,harim,1
 674 | herero,ovaherero,1
 675 | hesh,hashoth,1
 676 | hetrum,hhertum,1
 677 | hf,hvs,1
 678 | hizb,ahzab,1
 679 | ho,heaux,1
 680 | hok,hky,1
 681 | hok,hukim,1
 682 | hokhol,hakhly,1
 683 | hol,hly,1
 684 | hoop,heep,1
 685 | hos,hea,1
 686 | hus,ha,1
 687 | i,iim,1
 688 | i,iit,1
 689 | i,ik,1
 690 | i,iu,1
 691 | i,iyat,1
 692 | ia,ies,1
 693 | ia,ii,1
 694 | ia,iot,1
 695 | iach,ichim,1
 696 | iah,ioth,1
 697 | ian,i,1
 698 | ic,i,1
 699 | ic,ix,1
 700 | ido,ibe,1
 701 | ie,ice,1
 702 | ie,ii,1
 703 | ie,iya,1
 704 | if,ivim,1
 705 | ifeful,ivesful,1
 706 | iful,isful,1
 707 | ifuna,inagu,1
 708 | igqirha,amagqirha,1
 709 | ika,iok,1
 710 | ile,irren,1
 711 | imbizo,izimbizo,1
 712 | imbongi,iimbongi,1
 713 | imbongi,izimbongi,1
 714 | in,i,1
 715 | induna,izinduna,1
 716 | inkhosi,amakhosi,1
 717 | inkosi,amakhosi,1
 718 | inyanga,izinyanga,1
 719 | is,ii,1
 720 | is,ires,1
 721 | is,ithes,1
 722 | is,itot,1
 723 | is,ix,1
 724 | is,izes,1
 725 | isangoma,izangoma,1
 726 | isibongo,izibongo,1
 727 | isicoco,izicoco,1
 728 | it,iot,1
 729 | it,is,1
 730 | ith,iyoth,1
 731 | itongo,amatongo,1
 732 | ium,ien,1
 733 | ius,i,1
 734 | ix,iges,1
 735 | iya,i,1
 736 | iya,ii,1
 737 | iyot,iik,1
 738 | ja,je,1
 739 | jann,jinn,1
 740 | jann,jinnah,1
 741 | jann,jinnan,1
 742 | jilbab,jalabib,1
 743 | jinn,jawan,1
 744 | jo,je,1
 745 | jsa,jjes,1
 746 | juz,ajza,1
 747 | k,kie,1
 748 | k,kil,1
 749 | k,kke,1
 750 | k,ku,1
 751 | k,kun,1
 752 | ka,kes,1
 753 | ka,kor,1
 754 | ka,koth,1
 755 | kafir,kuffar,1
 756 | kah,kos,1
 757 | kah,kot,1
 758 | ke,kae,1
 759 | ke,kai,1
 760 | kel,kalim,1
 761 | ket,kot,1
 762 | kgosana,dikgosana,1
 763 | kgosi,dikgosi,1
 764 | kgotla,dikgotla,1
 765 | ki,kar,1
 766 | kibanja,bibanja,1
 767 | kko,kot,1
 768 | klal,kalil,1
 769 | ko,ka,1
 770 | kos,kai,1
 771 | ksaw,kasaw,1
 772 | kungwi,makungwi,1
 773 | kwerekwere,amakwerekwere,1
 774 | kyful,kiesful,1
 775 | l,lau,1
 776 | l,leanna,1
 777 | l,lekh,1
 778 | l,len,1
 779 | l,llen,1
 780 | l,lob,1
 781 | l,loch,1
 782 | l,ly,1
 783 | la,les,1
 784 | la,lin,1
 785 | la,lur,1
 786 | lace,li,1
 787 | laf,lefim,1
 788 | lah,lin,1
 789 | ldesac,lsdesac,1
 790 | le,llun,1
 791 | leach,lich,1
 792 | leh,lim,1
 793 | les,laisim,1
 794 | lets,ltsy,1
 795 | lex,licia,1
 796 | lf,lvs,1
 797 | lfful,lvesful,1
 798 | lfull,lsfull,1
 799 | lhomme,lshommes,1
 800 | likuta,makuta,1
 801 | lilangeni,emalangeni,1
 802 | lin,la,1
 803 | lipsis,leipses,1
 804 | lis,leisim,1
 805 | lis,lu,1
 806 | lit,laisim,1
 807 | lit,leysim,1
 808 | lj,lljj,1
 809 | lo,la,1
 810 | lolwapa,malwapa,1
 811 | lon,li,1
 812 | los,le,1
 813 | los,lea,1
 814 | los,li,1
 815 | loti,maloti,1
 816 | lr,lir,1
 817 | lus,lae,1
 818 | lus,lera,1
 819 | lus,loi,1
 820 | luy,lluim,1
 821 | ly,lia,1
 822 | lyful,liesful,1
 823 | m,mae,1
 824 | m,mas,1
 825 | m,me,1
 826 | m,messrs,1
 827 | m,mm,1
 828 | m,mme,1
 829 | m,mni,1
 830 | m,mu,1
 831 | m,mun,1
 832 | m,mut,1
 833 | ma,mek,1
 834 | ma,my,1
 835 | madam,mesdames,1
 836 | madame,mesdames,1
 837 | mademoiselle,mesdemoiselles,1
 838 | manfriend,menfriends,1
 839 | mann,menn,1
 840 | mdah,mada,1
 841 | me,mae,1
 842 | me,mi,1
 843 | med,mdim,1
 844 | mer,morim,1
 845 | mer,mrim,1
 846 | mes,mae,1
 847 | mes,mosim,1
 848 | mex,mices,1
 849 | mex,miges,1
 850 | mey,mies,1
 851 | mful,msfuls,1
 852 | mfull,msfull,1
 853 | mganga,waganga,1
 854 | mintaqah,manatiq,1
 855 | misr,amsar,1
 856 | miss,mlles,1
 857 | mmon,myn,1
 858 | mokoro,mekoro,1
 859 | moloi,baloi,1
 860 | monseigneur,messeigneurs,1
 861 | monsieur,messieurs,1
 862 | monsr,messrs,1
 863 | mophato,mephato,1
 864 | morafe,merafe,1
 865 | moran,ilmoran,1
 866 | morena,marena,1
 867 | mos,mi,1
 868 | mosarwa,basarwa,1
 869 | mosotho,basotho,1
 870 | motshelo,metshelo,1
 871 | motswana,batswana,1
 872 | mr,messrs,1
 873 | mrs,mesdames,1
 874 | mrs,mmes,1
 875 | msoa,mosi,1
 876 | mun,men,1
 877 | muramba,miramba,1
 878 | mus,ma,1
 879 | mus,mera,1
 880 | mushaf,masahif,1
 881 | muwashshah,tawashih,1
 882 | muzungu,wazungu,1
 883 | mwalimu,walimu,1
 884 | myometritis,endometritides,1
 885 | mzee,wazee,1
 886 | mzungu,wazungu,1
 887 | n,na,1
 888 | n,nah,1
 889 | n,nan,1
 890 | n,nat,1
 891 | n,neh,1
 892 | n,ngun,1
 893 | n,nie,1
 894 | n,nion,1
 895 | n,nn,1
 896 | n,nna,1
 897 | n,nov,1
 898 | na,n,1
 899 | na,nes,1
 900 | na,nor,1
 901 | na,not,1
 902 | na,noth,1
 903 | na,nur,1
 904 | nak,nci,1
 905 | naki,neke,1
 906 | naphil,nephilim,1
 907 | nard,nnards,1
 908 | nasheed,anasheed,1
 909 | nces,nses,1
 910 | ndarme,nsdarmes,1
 911 | ndi,noud,1
 912 | ne,nia,1
 913 | ne,noi,1
 914 | ne,ntes,1
 915 | ne,ntini,1
 916 | nee,n,1
 917 | nek,niky,1
 918 | nets,ntsiv,1
 919 | ngaka,dingaka,1
 920 | ni,n,1
 921 | nie,nce,1
 922 | nie,ns,1
 923 | nis,neis,1
 924 | nkishi,minkishi,1
 925 | nkisi,minkisi,1
 926 | nmot,nsmots,1
 927 | nnie,nce,1
 928 | nnie,ns,1
 929 | nnill,nillion,1
 930 | no,na,1
 931 | no,ne,1
 932 | no,nya,1
 933 | nofabitch,nsabitches,1
 934 | nofabitch,nsofbitches,1
 935 | nok,nki,1
 936 | novabitch,nsovbitches,1
 937 | ns,ndes,1
 938 | ntar,ndarka,1
 939 | nti,npol,1
 940 | nus,nae,1
 941 | nus,neres,1
 942 | nus,nes,1
 943 | nuvabitch,nsuvbitches,1
 944 | ny,nce,1
 945 | ny,ns,1
 946 | nzil,nazil,1
 947 | o,oi,1
 948 | o,on,1
 949 | o,oos,1
 950 | o,ot,1
 951 | o,oth,1
 952 | o,oz,1
 953 | ocimibundu,ovimbundu,1
 954 | of,ovim,1
 955 | oh,os,1
 956 | oh,ot,1
 957 | okama,ocama,1
 958 | om,ons,1
 959 | omanhene,amanhene,1
 960 | oos,oi,1
 961 | os,odes,1
 962 | os,ora,1
 963 | os,ores,1
 964 | os,otes,1
 965 | othorax,ohoraces,1
 966 | ous,oi,1
 967 | ox,octes,1
 968 | oz,oces,1
 969 | p,pes,1
 970 | p,pi,1
 971 | p,ple,1
 972 | p,ps,1
 973 | p,pth,1
 974 | pa,por,1
 975 | pa,pur,1
 976 | pas,pones,1
 977 | pe,pi,1
 978 | pelet,plot,1
 979 | pelet,ploth,1
 980 | peole,polae,1
 981 | pes,pites,1
 982 | pis,pes,1
 983 | pnik,pari,1
 984 | pnik,pota,1
 985 | pon,pa,1
 986 | poritz,pritzim,1
 987 | posanto,pisanti,1
 988 | psak,piskei,1
 989 | pus,pera,1
 990 | qel,qalim,1
 991 | r,rar,1
 992 | r,rau,1
 993 | r,reen,1
 994 | r,ria,1
 995 | r,rja,1
 996 | r,rjev,1
 997 | r,rji,1
 998 | r,rn,1
 999 | r,rok,1
1000 | r,rov,1
1001 | r,rtsy,1
1002 | r,ru,1
1003 | r,run,1
1004 | r,rz,1
1005 | ra,ros,1
1006 | rah,rim,1
1007 | rak,rci,1
1008 | rashah,rshioth,1
1009 | rasul,rusul,1
1010 | ratal,irtal,1
1011 | rby,rsby,1
1012 | rdi,radim,1
1013 | re,rai,1
1014 | red,rdim,1
1015 | reh,rat,1
1016 | req,raqim,1
1017 | resh,rashim,1
1018 | resh,rshayim,1
1019 | ret,rot,1
1020 | rets,rtsy,1
1021 | ri,li,1
1022 | rin,re,1
1023 | ris,reis,1
1024 | rotl,artal,1
1025 | rs,r,1
1026 | ru,ri,1
1027 | rum,rae,1
1028 | rus,ra,1
1029 | rus,rina,1
1030 | rus,rsim,1
1031 | ry,rata,1
1032 | ry,res,1
1033 | ry,ria,1
1034 | s,sar,1
1035 | s,seanna,1
1036 | s,sir,1
1037 | s,sm,1
1038 | s,soi,1
1039 | s,ssa,1
1040 | s,sse,1
1041 | s,ste,1
1042 | s,sy,1
1043 | sach,schaot,1
1044 | safek,sfekot,1
1045 | sapanca,sepanche,1
1046 | saraph,seraphim,1
1047 | se,sia,1
1048 | se,sides,1
1049 | seach,sigh,1
1050 | sebsi,sbassa,1
1051 | seder,sidarim,1
1052 | seder,siddarim,1
1053 | sek,skim,1
1054 | sel,slach,1
1055 | sente,lisente,1
1056 | seq,sqim,1
1057 | ser,srim,1
1058 | ses,sides,1
1059 | sgid,sagid,1
1060 | sharif,ashraf,1
1061 | shisname,stheirnames,1
1062 | shuk,suit,1
1063 | sinego,ssinegoes,1
1064 | singhalese,cingalese,1
1065 | sjid,sajid,1
1066 | son,sa,1
1067 | srani,sara,1
1068 | st,ss,1
1069 | sus,sii,1
1070 | sya,si,1
1071 | t,tae,1
1072 | t,tai,1
1073 | t,tean,1
1074 | t,th,1
1075 | t,tir,1
1076 | t,toth,1
1077 | t,tovi,1
1078 | t,tt,1
1079 | t,ttys,1
1080 | ta,t,1
1081 | ta,toth,1
1082 | tachresis,techreses,1
1083 | tah,tim,1
1084 | tah,tot,1
1085 | tah,toth,1
1086 | tambala,matambala,1
1087 | tariqa,turuq,1
1088 | tax,tices,1
1089 | te,ta,1
1090 | te,tia,1
1091 | teg,tagim,1
1092 | ten,tines,1
1093 | ter,tres,1
1094 | tey,ties,1
1095 | tis,tai,1
1096 | titrix,trices,1
1097 | tmartial,tsmartial,1
1098 | to,ta,1
1099 | toof,teef,1
1100 | toothmark,teethmarks,1
1101 | triptotes,nouns,1
1102 | tum,ti,1
1103 | tus,tii,1
1104 | tus,tora,1
1105 | ty,te,1
1106 | ty,tees,1
1107 | tys,ties,1
1108 | u,uens,1
1109 | u,ukuna,1
1110 | uah,uos,1
1111 | uah,uot,1
1112 | uah,uoth,1
1113 | uan,un,1
1114 | ues,uites,1
1115 | uhil,ujil,1
1116 | uitur,uuntur,1
1117 | umkhwetha,abakhwetha,1
1118 | umkhwetha,abakwetha,1
1119 | umlungu,abelungu,1
1120 | umuzungu,abazungu,1
1121 | umuzungu,bazungu,1
1122 | umzulu,amazulu,1
1123 | ura,uwar,1
1124 | urah,uwar,1
1125 | uwa,ua,1
1126 | v,va,1
1127 | v,vz,1
1128 | va,ve,1
1129 | va,voth,1
1130 | vah,vos,1
1131 | vah,vot,1
1132 | veh,vos,1
1133 | veh,vot,1
1134 | vnia,ven,1
1135 | vok,vki,1
1136 | w,wau,1
1137 | w,wes,1
1138 | w,wis,1
1139 | wakf,awkaf,1
1140 | walian,welsh,1
1141 | waqf,awqaf,1
1142 | wiec,wce,1
1143 | womin,wimin,1
1144 | wommin,wimmin,1
1145 | womyn,wymyn,1
1146 | woose,weese,1
1147 | wr,wyr,1
1148 | wwa,ween,1
1149 | x,xob,1
1150 | x,xxes,1
1151 | xis,xeis,1
1152 | y,ych,1
1153 | y,yem,1
1154 | y,yim,1
1155 | y,yin,1
1156 | y,yos,1
1157 | y,yren,1
1158 | ya,yos,1
1159 | yah,yos,1
1160 | yah,yoth,1
1161 | ye,yce,1
1162 | yen,yonem,1
1163 | yen,yonim,1
1164 | yification,yfications,1
1165 | yk,ycy,1
1166 | yka,yok,1
1167 | yoret,yurot,1
1168 | yote,ya,1
1169 | ytan,yatin,1
1170 | ywoman,yswomen,1
1171 | yya,yat,1
1172 | z,zen,1
1173 | z,zer,1
1174 | z,zok,1
1175 | z,zzim,1
1176 | zayit,zeytim,1
1177 | zek,zki,1
1178 | zek,zky,1
1179 | zimrah,zemirot,1
1180 | zimrah,zemiroth,1
1181 | zindiq,zandaqa,1
1182 | zwa,zawat,1
1183 | 


--------------------------------------------------------------------------------
/extractor/src/main.rs:
--------------------------------------------------------------------------------
  1 | use csv::Writer;
  2 | use english_core::*;
  3 | use serde::Deserialize;
  4 | use std::collections::{HashMap, HashSet};
  5 | use std::env;
  6 | use std::error::Error;
  7 | use std::fs::File;
  8 | use std::io::{BufRead, BufReader, Write};
  9 | mod file_generation;
 10 | use file_generation::*;
 11 | mod helpers;
 12 | use csv::{ReaderBuilder, WriterBuilder};
 13 | pub use helpers::*;
 14 | 
 15 | fn main() -> Result<(), Box<dyn Error>> {
 16 |     let args: Vec<String> = env::args().collect();
 17 | 
 18 |     if args.len() != 2 {
 19 |         eprintln!("Usage: cargo run --release rawwiki.jsonl");
 20 |         std::process::exit(1);
 21 |     }
 22 | 
 23 |     let input_path = &args[1];
 24 | 
 25 |     let filtered_json_path = "english_filtered.jsonl";
 26 | 
 27 |     //filter_english_entries(input_path, filtered_json_path);
 28 | 
 29 |     //let input_path = "../../english.jsonl";
 30 |     check_noun_plurals(filtered_json_path, "noun_plural_check.csv")?;
 31 |     check_verb_conjugations(filtered_json_path, "verbs_check.csv")?;
 32 |     check_adjective_forms(filtered_json_path, "adj_check.csv")?;
 33 | 
 34 |     extract_verb_conjugations_new(filtered_json_path, "verb_conjugations.csv")?;
 35 |     extract_irregular_nouns(filtered_json_path, "nouns_with_plurals.csv")?;
 36 | 
 37 |     extract_irregular_adjectives(filtered_json_path, "adjectives.csv")?;
 38 | 
 39 |     generate_nouns_phf("nouns_with_plurals.csv", "noun_phf.rs");
 40 |     generate_adjectives_phf("adjectives.csv", "adj_phf.rs");
 41 |     generate_verbs_phf("verb_conjugations.csv", "verb_phf.rs");
 42 | 
 43 |     //  analyze_and_write_suffix_rules("nouns_with_plurals.csv", "analyzed_endings.csv");
 44 |     Ok(())
 45 | }
 46 | 
 47 | /// Extracts irregular noun plurals and writes them to a CSV.
 48 | fn extract_irregular_nouns(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
 49 |     let mut forms_map: HashMap<String, HashSet<String>> = HashMap::new();
 50 | 
 51 |     let (reader, mut writer) = base_setup(input_path, output_path);
 52 |     writer.write_record(&["word", "plural"])?;
 53 | 
 54 |     for line in reader.lines() {
 55 |         let line = line?;
 56 |         let entry: Entry = match serde_json::from_str(&line) {
 57 |             Ok(e) => e,
 58 |             Err(e) => {
 59 |                 println!("{:#?}", e);
 60 |                 continue;
 61 |             }
 62 |         };
 63 | 
 64 |         if !entry_is_proper(&entry, "noun") {
 65 |             continue;
 66 |         }
 67 | 
 68 |         let infinitive = entry.word.to_lowercase();
 69 | 
 70 |         if !forms_map.contains_key(&infinitive) {
 71 |             forms_map.insert(infinitive.clone(), HashSet::new());
 72 |         }
 73 | 
 74 |         let mut plural_found = false;
 75 |         if let Some(forms) = entry.forms {
 76 |             for form in &forms {
 77 |                 let tags = &form.tags;
 78 | 
 79 |                 let entry_form = form.form.to_lowercase();
 80 |                 if entry_form == "dubious" {
 81 |                     continue;
 82 |                 }
 83 |                 if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) {
 84 |                     continue;
 85 |                 }
 86 | 
 87 |                 if tags.contains(&"plural".into()) {
 88 |                     forms_map
 89 |                         .get_mut(&infinitive)
 90 |                         .unwrap()
 91 |                         .insert(entry_form.clone());
 92 |                 }
 93 |             }
 94 |         }
 95 |     }
 96 | 
 97 |     for (inf, setik) in forms_map.iter_mut() {
 98 |         let predicted_plural = EnglishCore::pluralize_noun(&inf);
 99 |         if setik.is_empty() {
100 |             continue;
101 |         }
102 |         let alr_cont = setik.remove(&predicted_plural);
103 |         let mut index = match alr_cont {
104 |             true => 2,
105 |             false => 1,
106 |         };
107 |         let mut sorted_vec: Vec<String> = setik.clone().into_iter().collect();
108 |         sorted_vec.sort(); // uses Ord for sorting
109 |         for thing in sorted_vec.iter() {
110 |             let word_key = if index == 1 {
111 |                 inf.clone()
112 |             } else {
113 |                 format!("{inf}{index}")
114 |             };
115 |             let keyd_struct = [word_key.clone(), thing.clone()];
116 | 
117 |             if index < 10 {
118 |                 writer.write_record(&keyd_struct)?;
119 |             }
120 |             index += 1;
121 |         }
122 |     }
123 | 
124 |     writer.flush()?;
125 |     println!("Done! Output written to {}", output_path);
126 |     Ok(())
127 | }
128 | 
129 | fn extract_irregular_adjectives(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
130 |     let mut forms_map: HashMap<String, HashSet<AdjParts>> = HashMap::new();
131 |     let (reader, mut writer) = base_setup(input_path, output_path);
132 |     writer.write_record(&["positive", "comparative", "superlative"])?;
133 | 
134 |     for line in reader.lines() {
135 |         let line = line?;
136 |         let entry: Entry = match serde_json::from_str(&line) {
137 |             Ok(e) => e,
138 |             Err(e) => {
139 |                 println!("{:#?}", e);
140 |                 continue;
141 |             }
142 |         };
143 |         if !entry_is_proper(&entry, "adj") {
144 |             continue;
145 |         }
146 | 
147 |         let infinitive = entry.word.to_lowercase();
148 |         if !forms_map.contains_key(&infinitive) {
149 |             forms_map.insert(infinitive.clone(), HashSet::new());
150 |         }
151 |         let mut adjik = AdjParts::default();
152 |         adjik.positive = infinitive.clone();
153 | 
154 |         if let Some(forms) = entry.forms {
155 |             for form in &forms {
156 |                 let tags = &form.tags;
157 |                 let entry_form = form.form.to_lowercase();
158 |                 if entry_form == "dubious" {
159 |                     continue;
160 |                 }
161 |                 if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) {
162 |                     continue;
163 |                 }
164 | 
165 |                 if tags.contains(&"comparative".into()) && adjik.comparative == "" {
166 |                     adjik.comparative = entry_form.clone();
167 |                 }
168 | 
169 |                 if tags.contains(&"superlative".into()) && adjik.superlative == "" {
170 |                     adjik.superlative = entry_form.clone();
171 |                 }
172 |             }
173 |         }
174 | 
175 |         let predicted_comparative = EnglishCore::comparative(&infinitive);
176 |         let predicted_superlative = EnglishCore::superlative(&infinitive);
177 |         if adjik.comparative == "" {
178 |             adjik.comparative = predicted_comparative.clone();
179 |         }
180 |         if adjik.superlative == "" {
181 |             adjik.superlative = predicted_superlative.clone();
182 |         }
183 | 
184 |         forms_map
185 |             .get_mut(&infinitive)
186 |             .unwrap()
187 |             .insert(adjik.clone());
188 |     }
189 |     for (inf, setik) in forms_map.iter_mut() {
190 |         let predicted_comparative = EnglishCore::comparative(&inf);
191 |         let predicted_superlative = EnglishCore::superlative(&inf);
192 | 
193 |         let mut predicted_adj = AdjParts::default();
194 |         predicted_adj.positive = inf.clone();
195 |         predicted_adj.comparative = predicted_comparative.clone();
196 |         predicted_adj.superlative = predicted_superlative.clone();
197 |         if setik.is_empty() {
198 |             continue;
199 |         }
200 | 
201 |         let mut index = match setik.remove(&predicted_adj) {
202 |             true => 2,
203 |             false => 1,
204 |         };
205 |         let mut sorted_vec: Vec<AdjParts> = setik.clone().into_iter().collect();
206 |         sorted_vec.sort(); // uses Ord for sorting
207 |         for thing in sorted_vec.iter() {
208 |             let word_key = if index == 1 {
209 |                 inf.clone()
210 |             } else {
211 |                 format!("{inf}{index}")
212 |             };
213 |             //positive,comparative,superlative
214 |             let keyd_struct = [
215 |                 word_key.clone(),
216 |                 thing.comparative.clone(),
217 |                 thing.superlative.clone(),
218 |             ];
219 |             index += 1;
220 |             writer.write_record(&keyd_struct)?;
221 |         }
222 |     }
223 |     writer.flush()?;
224 |     println!("Done! Output written to {}", output_path);
225 |     Ok(())
226 | }
227 | 
228 | /// Extracts verb conjugations and writes them to a CSV.
229 | fn extract_verb_conjugations_new(
230 |     input_path: &str,
231 |     output_path: &str,
232 | ) -> Result<(), Box<dyn Error>> {
233 |     let mut forms_map: HashMap<String, HashSet<VerbParts>> = HashMap::new();
234 |     let (reader, mut writer) = base_setup(input_path, output_path);
235 |     writer.write_record(&[
236 |         "infinitive",
237 |         "third_person_singular",
238 |         "past",
239 |         "present_participle",
240 |         "past_participle",
241 |     ])?;
242 | 
243 |     for line in reader.lines() {
244 |         let line = line?;
245 |         let entry: Entry = match serde_json::from_str(&line) {
246 |             Ok(e) => e,
247 |             Err(e) => {
248 |                 println!("{:#?}", e);
249 |                 continue;
250 |             }
251 |         };
252 |         if !entry_is_proper(&entry, "verb") {
253 |             continue;
254 |         }
255 | 
256 |         let mut has_third = false;
257 |         let infinitive = entry.word.to_lowercase();
258 |         if !forms_map.contains_key(&infinitive) {
259 |             forms_map.insert(infinitive.clone(), HashSet::new());
260 |         }
261 |         let mut verbik = VerbParts::default();
262 |         verbik.inf = infinitive.clone();
263 | 
264 |         if verbik.inf == "be" {
265 |             continue;
266 |         }
267 | 
268 |         if let Some(forms) = entry.forms {
269 |             for form in &forms {
270 |                 let tags = &form.tags;
271 |                 let entry_form = form.form.to_lowercase();
272 |                 if !word_is_proper(&entry_form) || contains_bad_tag(tags.clone()) {
273 |                     continue;
274 |                 }
275 | 
276 |                 if tags.contains(&"third-person".into())
277 |                     && tags.contains(&"singular".into())
278 |                     && tags.contains(&"present".into())
279 |                     && !has_third
280 |                 {
281 |                     has_third = true;
282 |                     verbik.third = entry_form.clone();
283 |                 }
284 | 
285 |                 if tags.contains(&"past".into())
286 |                     && !tags.contains(&"participle".into())
287 |                     && verbik.past == ""
288 |                 {
289 |                     verbik.past = entry_form.clone();
290 |                 }
291 | 
292 |                 if tags.contains(&"participle".into())
293 |                     && tags.contains(&"present".into())
294 |                     && verbik.present_part == ""
295 |                 {
296 |                     verbik.present_part = entry_form.clone();
297 |                 }
298 | 
299 |                 if tags.contains(&"participle".into())
300 |                     && tags.contains(&"past".into())
301 |                     && verbik.past_part == ""
302 |                 {
303 |                     verbik.past_part = entry_form.clone();
304 |                 }
305 |             }
306 |         }
307 | 
308 |         let predicted_past = EnglishCore::verb(
309 |             &infinitive,
310 |             &Person::Third,
311 |             &Number::Singular,
312 |             &Tense::Past,
313 |             &Form::Finite,
314 |         );
315 |         let predicted_participle = EnglishCore::verb(
316 |             &infinitive,
317 |             &Person::Third,
318 |             &Number::Singular,
319 |             &Tense::Present,
320 |             &Form::Participle,
321 |         );
322 | 
323 |         if verbik.past == "" {
324 |             verbik.past = predicted_past.clone();
325 |         }
326 |         if verbik.past_part == "" {
327 |             verbik.past_part = verbik.past.clone();
328 |         }
329 |         if verbik.present_part == "" {
330 |             verbik.present_part = predicted_participle.clone();
331 |         }
332 | 
333 |         if has_third {
334 |             forms_map
335 |                 .get_mut(&infinitive)
336 |                 .unwrap()
337 |                 .insert(verbik.clone());
338 |         }
339 |     }
340 |     for (inf, setik) in forms_map.iter_mut() {
341 |         let predicted_third = EnglishCore::verb(
342 |             &inf,
343 |             &Person::Third,
344 |             &Number::Singular,
345 |             &Tense::Present,
346 |             &Form::Finite,
347 |         );
348 |         let predicted_past = EnglishCore::verb(
349 |             &inf,
350 |             &Person::Third,
351 |             &Number::Singular,
352 |             &Tense::Past,
353 |             &Form::Finite,
354 |         );
355 |         let predicted_participle = EnglishCore::verb(
356 |             &inf,
357 |             &Person::Third,
358 |             &Number::Singular,
359 |             &Tense::Present,
360 |             &Form::Participle,
361 |         );
362 | 
363 |         let mut predicted_verb = VerbParts::default();
364 |         predicted_verb.inf = inf.clone();
365 |         predicted_verb.third = predicted_third.clone();
366 |         predicted_verb.past = predicted_past.clone();
367 |         predicted_verb.past_part = predicted_past.clone();
368 |         predicted_verb.present_part = predicted_participle.clone();
369 |         if setik.is_empty() {
370 |             continue;
371 |         }
372 | 
373 |         let mut index = match setik.remove(&predicted_verb) {
374 |             true => 2,
375 |             false => 1,
376 |         };
377 |         let mut sorted_vec: Vec<VerbParts> = setik.clone().into_iter().collect();
378 |         sorted_vec.sort(); // uses Ord for sorting
379 |         for thing in sorted_vec.iter() {
380 |             let word_key = if index == 1 {
381 |                 inf.clone()
382 |             } else {
383 |                 format!("{inf}{index}")
384 |             };
385 |             //infinitive,third_person_singular,past,present_participle,past_participle
386 |             let keyd_struct = [
387 |                 word_key.clone(),
388 |                 thing.third.clone(),
389 |                 thing.past.clone(),
390 |                 thing.present_part.clone(),
391 |                 thing.past_part.clone(),
392 |             ];
393 |             index += 1;
394 |             writer.write_record(&keyd_struct)?;
395 |         }
396 |     }
397 | 
398 |     writer.flush()?;
399 |     println!("Done! Output written to {}", output_path);
400 |     Ok(())
401 | }
402 | 
403 | pub fn check_noun_plurals(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
404 |     use english::*;
405 |     let (reader, mut writer) = base_setup(input_path, output_path);
406 |     writer.write_record(&["wiki_single", "wiktionary_plural"])?;
407 | 
408 |     let mut total_counter = 0;
409 |     let mut match_counter = 0;
410 | 
411 |     for line in reader.lines() {
412 |         let line = line?;
413 |         let entry: Entry = match serde_json::from_str(&line) {
414 |             Ok(e) => e,
415 |             Err(_) => continue,
416 |         };
417 | 
418 |         // Only proper English nouns
419 |         if !entry_is_proper(&entry, "noun") {
420 |             continue;
421 |         }
422 |         let lowercased_entry = entry.word.to_lowercase();
423 | 
424 |         // Gather all plural forms from Wiktionary
425 |         let mut wiktionary_plurals = Vec::new();
426 |         if let Some(forms) = entry.forms {
427 |             for form in forms {
428 |                 if form.tags.contains(&"plural".into())
429 |                 //    && !contains_bad_tag(form.tags.clone())
430 |                 //   && word_is_proper(&form.form)
431 |                 {
432 |                     wiktionary_plurals.push(form.form.to_lowercase());
433 |                 }
434 |             }
435 |         }
436 |         if wiktionary_plurals.is_empty() {
437 |             continue;
438 |         }
439 | 
440 |         // Try base word and numbered variants
441 |         let mut variants = vec![lowercased_entry.clone()];
442 |         for i in 2..=9 {
443 |             variants.push(format!("{}{}", lowercased_entry, i));
444 |         }
445 | 
446 |         for wiki_plural in &wiktionary_plurals {
447 |             let wiki_plural = wiki_plural.to_lowercase();
448 |             total_counter += 1;
449 |             let mut matched = false;
450 |             for variant in &variants {
451 |                 let generated_plural = English::noun(variant, &Number::Plural);
452 |                 matched = generated_plural == wiki_plural;
453 |                 if matched {
454 |                     match_counter += 1;
455 |                     break;
456 |                 }
457 |             }
458 |             if !matched {
459 |                 writer.write_record(&[lowercased_entry.clone(), wiki_plural.clone()])?;
460 |             }
461 |         }
462 |     }
463 | 
464 |     writer.flush()?;
465 |     println!("Done! Output written to {}", output_path);
466 |     println!("total match amount: {} / {}", match_counter, total_counter);
467 |     Ok(())
468 | }
469 | 
470 | pub fn check_verb_conjugations(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
471 |     use english::*;
472 |     let (reader, mut writer) = base_setup(input_path, output_path);
473 | 
474 |     writer.write_record(&["wiktionary_form", "person", "number", "tense", "form"])?;
475 | 
476 |     let mut total_counter = 0;
477 |     let mut match_counter = 0;
478 | 
479 |     for line in reader.lines() {
480 |         let line = line?;
481 |         let entry: Entry = match serde_json::from_str(&line) {
482 |             Ok(e) => e,
483 |             Err(_) => continue,
484 |         };
485 | 
486 |         // Only proper English verbs
487 |         if !entry_is_proper(&entry, "verb") {
488 |             continue;
489 |         }
490 | 
491 |         let lowercased_entry = entry.word.to_lowercase();
492 | 
493 |         // Collect (form, person, number, tense, form_type) from Wiktionary
494 |         let mut wiktionary_forms = Vec::new();
495 |         if let Some(forms) = entry.forms {
496 |             for form in forms {
497 |                 let tags = form
498 |                     .tags
499 |                     .iter()
500 |                     .map(|t| t.to_lowercase())
501 |                     .collect::<Vec<_>>();
502 |                 let form_str = form.form.to_lowercase();
503 | 
504 |                 // Skip bad data
505 |                 if form_str == "dubious"
506 |                     || contains_bad_tag(form.tags.clone())
507 |                     || !word_is_proper(&form.form)
508 |                 {
509 |                     continue;
510 |                 }
511 | 
512 |                 // Determine grammatical properties
513 |                 //Only check third person, first and second are always the infinitive
514 |                 let person = if tags.contains(&"first-person".into()) {
515 |                     continue;
516 |                 } else if tags.contains(&"second-person".into()) {
517 |                     continue;
518 |                 } else {
519 |                     Person::Third
520 |                 };
521 | 
522 |                 //only check singular, plural is always same as singular except for third singular present
523 |                 let number = if tags.contains(&"plural".into()) {
524 |                     continue;
525 |                 } else {
526 |                     Number::Singular
527 |                 };
528 | 
529 |                 let tense = if tags.contains(&"present".into()) {
530 |                     Tense::Present
531 |                 } else if tags.contains(&"past".into()) {
532 |                     Tense::Past
533 |                 } else {
534 |                     Tense::Present
535 |                 };
536 | 
537 |                 let form_type = if tags.contains(&"participle".into()) {
538 |                     Form::Participle
539 |                 } else if tags.contains(&"infinitive".into()) {
540 |                     continue;
541 |                 } else {
542 |                     Form::Finite
543 |                 };
544 | 
545 |                 wiktionary_forms.push((form_str, person, number, tense, form_type));
546 |             }
547 |         }
548 | 
549 |         if wiktionary_forms.is_empty() {
550 |             continue;
551 |         }
552 | 
553 |         // Try base and numbered variants
554 |         let mut variants = vec![lowercased_entry.clone()];
555 |         for i in 2..=9 {
556 |             variants.push(format!("{}{}", lowercased_entry, i));
557 |         }
558 | 
559 |         for (wiki_form, person, number, tense, form_type) in wiktionary_forms {
560 |             total_counter += 1;
561 |             let mut matched = false;
562 | 
563 |             for variant in &variants {
564 |                 let generated_form = English::verb(variant, &person, &number, &tense, &form_type);
565 |                 matched = generated_form == wiki_form;
566 |                 if matched {
567 |                     match_counter += 1;
568 | 
569 |                     break;
570 |                 }
571 |             }
572 | 
573 |             if !matched {
574 |                 writer.write_record(&[
575 |                     wiki_form.clone(),
576 |                     format!("{:?}", person),
577 |                     format!("{:?}", number),
578 |                     format!("{:?}", tense),
579 |                     format!("{:?}", form_type),
580 |                 ])?;
581 |             }
582 |         }
583 |     }
584 | 
585 |     writer.flush()?;
586 |     println!("Done! Output written to {}", output_path);
587 |     println!("total match amount: {} / {}", match_counter, total_counter);
588 |     Ok(())
589 | }
590 | 
591 | pub fn check_adjective_forms(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
592 |     use english::*;
593 |     let (reader, mut writer) = base_setup(input_path, output_path);
594 |     writer.write_record(&["wiktionary_form", "degree"])?;
595 | 
596 |     let mut total_counter = 0;
597 |     let mut match_counter = 0;
598 | 
599 |     for line in reader.lines() {
600 |         let line = line?;
601 |         let entry: Entry = match serde_json::from_str(&line) {
602 |             Ok(e) => e,
603 |             Err(_) => continue,
604 |         };
605 | 
606 |         // Only proper English adjectives
607 |         if !entry_is_proper(&entry, "adj") {
608 |             continue;
609 |         }
610 | 
611 |         let lowercased_entry = entry.word.to_lowercase();
612 | 
613 |         // Gather all adjective forms from Wiktionary
614 |         let mut wiki_comparative: Option<String> = None;
615 |         let mut wiki_superlative: Option<String> = None;
616 | 
617 |         if let Some(forms) = entry.forms {
618 |             for form in forms {
619 |                 let form_str = form.form.to_lowercase();
620 |                 let tags_lower: Vec<String> = form.tags.iter().map(|t| t.to_lowercase()).collect();
621 | 
622 |                 if tags_lower.contains(&"comparative".into()) {
623 |                     wiki_comparative = Some(form_str);
624 |                 } else if tags_lower.contains(&"superlative".into()) {
625 |                     wiki_superlative = Some(form_str);
626 |                 }
627 |             }
628 |         }
629 | 
630 |         // If Wiktionary has no comparative or superlative, skip
631 |         if wiki_comparative.is_none() && wiki_superlative.is_none() {
632 |             continue;
633 |         }
634 | 
635 |         // Try base and numbered variants
636 |         let mut variants = vec![lowercased_entry.clone()];
637 |         for i in 2..=9 {
638 |             variants.push(format!("{}{}", lowercased_entry, i));
639 |         }
640 | 
641 |         // Comparative
642 |         if let Some(wiki_comp) = &wiki_comparative {
643 |             let wiki_comp = wiki_comp.to_lowercase();
644 |             total_counter += 1;
645 |             let mut matched = false;
646 |             for variant in &variants {
647 |                 let generated_comp = English::adj(variant, &Degree::Comparative);
648 |                 if generated_comp == wiki_comp {
649 |                     match_counter += 1;
650 |                     matched = true;
651 |                     break;
652 |                 }
653 |             }
654 |             if !matched {
655 |                 writer.write_record(&[wiki_comp.clone(), "Comparative".into()])?;
656 |             }
657 |         }
658 | 
659 |         // Superlative
660 |         if let Some(wiki_sup) = &wiki_superlative {
661 |             let wiki_sup = wiki_sup.to_lowercase();
662 |             total_counter += 1;
663 |             let mut matched = false;
664 |             for variant in &variants {
665 |                 let generated_sup = English::adj(variant, &Degree::Superlative);
666 |                 if generated_sup == wiki_sup {
667 |                     match_counter += 1;
668 |                     matched = true;
669 |                     break;
670 |                 }
671 |             }
672 |             if !matched {
673 |                 writer.write_record(&[wiki_sup.clone(), "Superlative".into()])?;
674 |             }
675 |         }
676 |     }
677 | 
678 |     writer.flush()?;
679 |     println!("Done! Output written to {}", output_path);
680 |     println!("total match amount: {} / {}", match_counter, total_counter);
681 |     Ok(())
682 | }
683 | 
684 | pub fn filter_english_entries(input_path: &str, output_path: &str) -> Result<(), Box<dyn Error>> {
685 |     let input = File::open(input_path)?;
686 |     let reader = BufReader::new(input);
687 | 
688 |     let mut output = File::create(output_path)?;
689 | 
690 |     for line in reader.lines() {
691 |         let line = line?;
692 |         let entry: Entry = match serde_json::from_str(&line) {
693 |             Ok(e) => e,
694 |             Err(_) => continue, // skip bad lines
695 |         };
696 | 
697 |         // Keep only English words
698 |         if entry.lang_code != "en" {
699 |             continue;
700 |         }
701 | 
702 |         // Write valid entry back as JSON
703 |         writeln!(output, "{}", line)?;
704 |     }
705 | 
706 |     println!("Filtered dataset saved to {}", output_path);
707 |     Ok(())
708 | }
709 | 
/// Returns `word` without its trailing run of ASCII digits
/// (e.g. "walrus2" -> "walrus"); digits elsewhere in the word are kept.
pub fn strip_trailing_number(word: &str) -> &str {
    // ASCII digits are single bytes in UTF-8, so counting them from the end
    // always leaves the cut on a character boundary.
    let digit_count = word
        .bytes()
        .rev()
        .take_while(u8::is_ascii_digit)
        .count();
    &word[..word.len() - digit_count]
}
714 | 
715 | /// Process CSV and write suffix rules with frequencies to output CSV
716 | pub fn analyze_and_write_suffix_rules(
717 |     input_path: &str,
718 |     output_path: &str,
719 | ) -> Result<(), Box<dyn Error>> {
720 |     let file = File::open(input_path)?;
721 |     let mut rdr = ReaderBuilder::new().from_reader(BufReader::new(file));
722 | 
723 |     let mut freq: HashMap<(String, String), usize> = HashMap::new();
724 | 
725 |     for result in rdr.records() {
726 |         let record = result?;
727 |         let singular_raw = record.get(0).unwrap();
728 |         let plural = record.get(1).unwrap();
729 | 
730 |         let singular = strip_trailing_number(singular_raw);
731 | 
732 |         let pair = suffix_rule(singular, plural);
733 |         *freq.entry(pair).or_insert(0) += 1;
734 |     }
735 | 
736 |     // Sort by frequency descending, then singular suffix, then plural suffix
737 |     let mut freq_vec: Vec<_> = freq.into_iter().collect();
738 |     freq_vec.sort_by(|a, b| {
739 |         b.1.cmp(&a.1)
740 |             .then_with(|| a.0.0.cmp(&b.0.0))
741 |             .then_with(|| a.0.1.cmp(&b.0.1))
742 |     });
743 | 
744 |     let mut wtr = WriterBuilder::new().from_path(output_path)?;
745 |     wtr.write_record(&["singular_suffix", "plural_suffix", "count"])?;
746 | 
747 |     for ((sing_suf, plur_suf), count) in freq_vec {
748 |         wtr.write_record(&[sing_suf, plur_suf, count.to_string()])?;
749 |     }
750 | 
751 |     wtr.flush()?;
752 |     Ok(())
753 | }
754 | 


--------------------------------------------------------------------------------