├── .gitignore
├── .github
│   └── workflows
│       ├── main.yml
│       └── release.yml
├── tests
│   ├── parse_tests.rs
│   ├── list_reading_tests.rs
│   ├── ignore_tests.rs
│   ├── edit_distance_tests.rs
│   ├── pruning_tests.rs
│   ├── uniquely_decodable_tests.rs
│   ├── list_information_tests.rs
│   └── list_manipulation_tests.rs
├── Cargo.toml
├── dist-workspace.toml
├── LICENSE
├── Dockerfile
├── src
│   ├── cards.rs
│   ├── display_information
│   │   ├── uniquely_decodable.rs
│   │   └── mod.rs
│   ├── schlinkert_pruning.rs
│   ├── edit_distance.rs
│   ├── file_writer.rs
│   ├── parsers.rs
│   ├── dice.rs
│   ├── input_validations.rs
│   ├── file_readers.rs
│   ├── list_manipulations.rs
│   ├── lib.rs
│   └── main.rs
├── CHANGELOG.md
├── wordlists-to-tidy.markdown
├── deny.toml
└── Cargo.lock

/.gitignore:
--------------------------------------------------------------------------------
/target
l
scrap_code.markdown
scrap_code.mdown
scrap_code.txt
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
name: ci

on:
  push:
    branches:
      - "main"

jobs:
  build-and-push-docker-image:
    name: Build and push the Docker image
    runs-on: ubuntu-latest
    steps:
      - name: Login to GitHub Packages Docker Registry
        uses: docker/login-action@v1
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build
        uses: docker/build-push-action@v3
        with:
          platforms: linux/amd64,linux/arm64
          push: true
--------------------------------------------------------------------------------
/tests/parse_tests.rs:
--------------------------------------------------------------------------------
mod parse_tests {
    use tidy::parsers::eval_list_length;

    #[test]
    fn can_parse_print_rand() {
        assert_eq!(eval_list_length("7776").unwrap(), 7776);
        assert_eq!(eval_list_length("6**5").unwrap(), 7776);
        assert_eq!(eval_list_length("10000").unwrap(), 10000);
        assert_eq!(eval_list_length("10**2").unwrap(), 100);
    }

    #[test]
    fn panics_when_noninteger_is_inputted_to_print_rand() {
        assert!(eval_list_length("four").is_err());
    }

    #[test]
    fn panics_when_too_many_exponents_inputted_to_print_rand() {
        assert!(eval_list_length("2**4**3").is_err());
    }
}
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "tidy"
version = "0.3.17"
authors = ["sts10 "]
edition = "2024"
license = "MIT"
readme = "readme.markdown"
repository = "https://github.com/sts10/tidy"
description = "Combine and clean word lists"
categories = ["command-line-utilities"]

[dependencies]
clap = { version = "4.5.18", features = ["derive"] }
memchr = "2.7.4"
radix_fmt = "1.0.0"
rand = "0.9.0"
itertools = "0.14.0"
unicode-normalization = "0.1.24"
unicode-segmentation = "1.12.0"
# icu = "1.5.0"
icu = "2.0.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

# The profile that 'cargo dist' will build with
[profile.dist]
inherits = "release"
lto = "thin"
--------------------------------------------------------------------------------
/dist-workspace.toml:
--------------------------------------------------------------------------------
[workspace]
members = ["cargo:."]

# Config for 'dist'
[dist]
# The preferred dist version to use in CI (Cargo.toml SemVer syntax)
cargo-dist-version = "0.30.2"
# CI backends to support
ci = "github"
# The installers to generate for each app
installers = ["shell"]
# Target platforms to build apps for (Rust target-triple syntax)
targets = ["aarch64-apple-darwin", "aarch64-unknown-linux-gnu", "aarch64-pc-windows-msvc", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu", "x86_64-unknown-linux-musl", "x86_64-pc-windows-msvc"]
# Which actions to run on pull requests
pr-run-mode = "plan"
# Whether to install an updater program
install-updater = false
# Path that installers should place binaries in
install-path = "CARGO_HOME"
--------------------------------------------------------------------------------
/tests/list_reading_tests.rs:
--------------------------------------------------------------------------------
mod list_reading_tests {
    use tidy::file_readers::blend;
    use tidy::*;

    #[test]
    fn can_blend_multiple_lists() {
        let word_lists_by_file = vec![
            vec!["one".to_string(), "three".to_string(), "five".to_string()],
            vec![
                "two".to_string(),
                "four".to_string(),
                "six".to_string(),
                "eight".to_string(),
                "ten".to_string(),
            ],
        ];
        let blended_list = blend(&word_lists_by_file);

        assert_eq!(
            blended_list,
            ["one", "two", "three", "four", "five", "six", "eight", "ten"].to_vec()
        );
    }
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2020-2024 Sam Schlinkert

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/tests/ignore_tests.rs:
--------------------------------------------------------------------------------
mod ignore_tests {
    use tidy::*;

    fn make_list() -> Vec<String> {
        vec!["mA1,word1 mB1", "mA2,word2 mB2", "mA3,word3 mB3", "A,B,C"]
            .iter()
            .map(|x| x.to_string())
            .collect()
    }

    #[test]
    fn can_ignore_metadata_before_a_delimiter() {
        let this_tidy_request = TidyRequest {
            list: make_list(),
            ignore_before_delimiter: Some(','),
            maximum_length: Some(10),
            ..Default::default()
        };
        let new_list = tidy_list(this_tidy_request);
        assert!(new_list.contains(&"mA1,word1 mB1".to_string()));
    }
    #[test]
    fn can_ignore_metadata_after_a_delimiter() {
        let this_tidy_request = TidyRequest {
            list: make_list(),
            ignore_after_delimiter: Some(' '),
            maximum_length: Some(10),
            ..Default::default()
        };
        let new_list = tidy_list(this_tidy_request);
        println!("{:?}", new_list);
        assert!(new_list.contains(&"mA1,word1 mB1".to_string()));
    }
}
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM rust:alpine as builder
# Workaround "cannot find crti.o"
RUN apk add --no-cache musl-dev
WORKDIR /app
# First, only copy over the Cargo manifests, and do a build with a dummy
# main.rs. This way, Docker can cache all of this, and only actually download
# and build those dependencies once. This is nice, because otherwise it's
# quite slow.
COPY Cargo.toml Cargo.lock ./
RUN mkdir src && echo "fn main() { }" > src/main.rs
RUN cargo build --release
RUN rm -r src/
# The above should get cached; but the below steps will be executed each
# time you change the source code and do 'docker build'.
COPY . .
RUN cargo build --release
RUN strip target/release/tidy

FROM scratch
COPY --from=builder /app/target/release/tidy /bin/tidy
ENTRYPOINT ["/bin/tidy"]

# Add some metadata about the image, for extra neatness.
LABEL org.opencontainers.image.title="tidy" \
    org.opencontainers.image.description="A command-line tool for combining and cleaning large word list files" \
    org.opencontainers.image.url="https://github.com/sts10/tidy" \
    org.opencontainers.image.authors="Sam Schlinkert"
--------------------------------------------------------------------------------
/tests/pruning_tests.rs:
--------------------------------------------------------------------------------
mod pruning_tests {
    use tidy::display_information::uniquely_decodable::is_uniquely_decodable;
    use tidy::*;

    #[test]
    fn wont_remove_words_from_a_list_that_is_already_uniquely_decodable() {
        let list: Vec<String> = vec!["101", "00", "0001", "1"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        let this_tidy_request = TidyRequest {
            list: list.clone(),
            should_schlinkert_prune: true,
            ..Default::default()
        };
        let new_list = tidy_list(this_tidy_request);
        assert_eq!(list, new_list);
    }

    #[test]
    fn can_run_schlinkert_prune_on_reversed_list_if_it_saves_more_words() {
        let list: Vec<String> = vec![
            "news",
            "paper",
            "newspaper",
            "donkey",
            "newsdonkey",
            "ghost",
            "newsghost",
            "radish",
            "newsradish",
        ]
        .iter()
        .map(|w| w.to_string())
        .collect();

        let this_tidy_request = TidyRequest {
            list,
            should_schlinkert_prune: true,
            ..Default::default()
        };
        let new_list = tidy_list(this_tidy_request);
        // If the Schlinkert prune were done forward, only
        // 5 words would be saved. But if we Schlinkert
        // prune the reversed list, we save 8 words.
        assert!(new_list.len() == 8);
        // And now let's confirm that the new list is indeed
        // uniquely decodable, at least as far as Tidy is able
        // to confirm.
        assert!(is_uniquely_decodable(&new_list));
    }
}
--------------------------------------------------------------------------------
/src/cards.rs:
--------------------------------------------------------------------------------
use radix_fmt::*; // https://stackoverflow.com/a/50278316

/// Convert a number to a "card code" (base 26)
pub fn print_as_cards(n: usize, list_length: usize) -> String {
    let n_as_base_26 = radix(n, 26);

    // Pad card codes with zeros
    let n_width = n_as_base_26.to_string().len();
    let pad_width = radix(list_length - 1, 26).to_string().len();

    let mut padded_n = String::new();
    for _i in n_width..pad_width {
        padded_n.push('0');
    }
    // Now that we have the appropriate number of zeros
    // in `padded_n`, it's time to add our number
    padded_n += &n_as_base_26.to_string();

    padded_n
        .chars()
        .map(|ch| char_to_card(ch) + "-")
        .collect::<String>()
        .trim_end_matches('-')
        .trim()
        .to_string()
}

/// Convert a 0-p inputted character to a 3-character "card code"
fn char_to_card(ch: char) -> String {
    match ch {
        '0' => "B02",
        '1' => "B03",
        '2' => "B04",
        '3' => "B05",
        '4' => "B06",
        '5' => "B07",
        '6' => "B08",
        '7' => "B09",
        '8' => "B10",
        '9' => "BJa",
        'a' => "BQu",
        'b' => "BKi",
        'c' => "BAc",
        'd' => "R02",
        'e' => "R03",
        'f' => "R04",
        'g' => "R05",
        'h' => "R06",
        'i' => "R07",
        'j' => "R08",
        'k' => "R09",
        'l' => "R10",
        'm' => "RJa",
        'n' => "RQu",
        'o' => "RKi",
        'p' => "RAc",
        _ => panic!("Unable to convert this number from a letter to a card code."),
    }
    .to_string()
}
--------------------------------------------------------------------------------
/src/display_information/uniquely_decodable.rs:
--------------------------------------------------------------------------------
//! This is a (rather clumsy) implementation of the Sardinas-Patterson algorithm
//! by Sam Schlinkert.
//! The goal is to check if a word list (`c`) is uniquely decodable.
//!
//! I followed
//!
//! very closely. Since then, other contributors have refactored it.
use std::collections::HashSet;

/// Return true if the list is uniquely decodable, false if not. I
/// don't _think_ we need to check reversed words in this case.
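///
/// A quick doc-test sketch of the expected behavior ("newspaper" below can
/// be spelled by "news" + "paper", so that list is not uniquely decodable):
/// ```
/// use tidy::display_information::uniquely_decodable::is_uniquely_decodable;
/// assert!(is_uniquely_decodable(&["cat", "dog", "fish"]));
/// assert!(!is_uniquely_decodable(&["news", "paper", "newspaper"]));
/// ```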
pub fn is_uniquely_decodable<T: AsRef<str>>(c: &[T]) -> bool {
    sardinas_patterson_theorem(c.iter().map(|f| f.as_ref()).collect())
}

/// Generate c for any number n
fn generate_cn<'a>(c: &HashSet<&'a str>, cn_minus_1: &HashSet<&'a str>) -> HashSet<&'a str> {
    let mut cn = HashSet::new();

    for w1 in c.iter() {
        for w2 in cn_minus_1.iter() {
            if w1.len() > w2.len() && w1.starts_with(w2) {
                // w2 is a prefix word of w1
                // so, we're going to add the dangling suffix to a new HashSet
                // called cn
                cn.insert(&w1[w2.len()..]);
            }
            if w2.len() > w1.len() && w2.starts_with(w1) {
                // w1 is a prefix word of w2
                // so, we're going to add the dangling suffix to a new HashSet
                // called cn
                cn.insert(&w2[w1.len()..]);
            }
        }
    }
    cn
}

fn generate_c_infinity_with_a_halt_break<'a>(c: &'a HashSet<&str>) -> HashSet<&'a str> {
    let mut cn = generate_cn(c, c);
    let mut cs = cn.clone();

    loop {
        cn = generate_cn(c, &cn);
        let prior = cs.len();
        cs.extend(&cn);
        if cs.len() == prior {
            // if the set size did not increase, cn is a subset
            // Cycle detected. Halting algorithm.
            break;
        }
    }
    cs
}

/// Returns true if c is uniquely decodable
fn sardinas_patterson_theorem(c: HashSet<&str>) -> bool {
    let c_infinity = generate_c_infinity_with_a_halt_break(&c);
    c.is_disjoint(&c_infinity)
}
--------------------------------------------------------------------------------
/src/schlinkert_pruning.rs:
--------------------------------------------------------------------------------
use std::collections::HashSet;

/// Return a Vector of words that "caused" the Sardinas-Patterson algorithm to
/// determine that this list was not uniquely decodable.
/// These "offending" words can then be removed from the original
/// list to, theoretically, make the list uniquely decodable.
pub fn get_sardinas_patterson_final_intersection<T: AsRef<str>>(c: &[T]) -> Vec<String> {
    // Convert c to a HashSet, I think
    let c = c.iter().map(|f| f.as_ref()).collect();

    let c_infinity = generate_c_infinity_with_a_halt_break(&c);
    // We want to collect a list of words that "caused" the Sardinas-Patterson algorithm
    // to determine that this list was not uniquely decodable.
    // If the given list is in fact uniquely decodable, this list of words will be empty.
    // If there are words in the list, we'll return those to src/lib to be
    // removed from the final list.
    let final_intersection = c.intersection(&c_infinity);
    Vec::from_iter(final_intersection)
        .iter()
        .map(|w| w.to_string())
        .collect()
}

/// Generate c for any number n
fn generate_cn<'a>(c: &HashSet<&'a str>, cn_minus_1: &HashSet<&'a str>) -> HashSet<&'a str> {
    let mut cn = HashSet::new();

    for w1 in c.iter() {
        for w2 in cn_minus_1.iter() {
            if w1.len() > w2.len() && w1.starts_with(w2) {
                // w2 is a prefix word of w1
                // so, we're going to add the dangling suffix to a new HashSet
                // called cn
                cn.insert(&w1[w2.len()..]);
            }
            if w2.len() > w1.len() && w2.starts_with(w1) {
                // w1 is a prefix word of w2
                // so, we're going to add the dangling suffix to a new HashSet
                // called cn
                cn.insert(&w2[w1.len()..]);
            }
        }
    }
    cn
}

fn generate_c_infinity_with_a_halt_break<'a>(c: &'a HashSet<&str>) -> HashSet<&'a str> {
    let mut cn = generate_cn(c, c);
    let mut cs = cn.clone();

    loop {
        cn = generate_cn(c, &cn);
        let prior = cs.len();
        cs.extend(&cn);
        if cs.len() == prior {
            // if the set size did not increase, cn is a subset
            // Cycle detected. Halting algorithm.
            break;
        }
    }
    cs
}
--------------------------------------------------------------------------------
/src/edit_distance.rs:
--------------------------------------------------------------------------------
//! Compute the edit distance between two strings

use std::cmp::min;

/// `find_edit_distance(str_a, str_b)` returns the edit distance between the two
/// strings. This edit distance is defined as being 1 point per insertion,
/// substitution, or deletion which must be made to make the strings equal.
///
/// I adapted this function from one I found in the
/// [TheAlgorithms/Rust repo on Github](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/edit_distance.rs).
///
/// Instead of storing the `m * n` matrix explicitly, only one row (of length `n`) is stored.
/// It keeps overwriting itself based on its previous values with the help of two scalars,
/// gradually reaching the last row. Then, the score is `matrix[n]`.
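///
/// For example (a brief illustrative sketch, assuming `edit_distance` is
/// exported as a public module of the `tidy` crate, as the test suite suggests):
/// ```
/// use tidy::edit_distance::find_edit_distance;
/// // "kitten" -> "sitting": substitute 'k'->'s', substitute 'e'->'i', insert 'g'
/// assert_eq!(find_edit_distance("kitten", "sitting"), 3);
/// assert_eq!(find_edit_distance("cat", "cat"), 0);
/// ```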
pub fn find_edit_distance(str_a: &str, str_b: &str) -> u32 {
    let (str_a, str_b) = (str_a.as_bytes(), str_b.as_bytes());
    let (m, n) = (str_a.len(), str_b.len());
    let mut distances: Vec<u32> = vec![0; n + 1]; // the dynamic programming matrix (only 1 row stored)
    let mut s: u32; // distances[i - 1][j - 1] or distances[i - 1][j]
    let mut c: u32; // distances[i][j - 1] or distances[i][j]
    let mut char_a: u8; // str_a[i - 1] the i-th character in str_a; only needs to be computed once per row
    let mut char_b: u8; // str_b[j - 1] the j-th character in str_b

    // 0th row
    for (j, v) in distances.iter_mut().enumerate().take(n + 1).skip(1) {
        *v = j as u32;
    }
    // rows 1 to m
    for i in 1..=m {
        s = (i - 1) as u32;
        c = i as u32;
        char_a = str_a[i - 1];
        for j in 1..=n {
            // c is distances[i][j-1] and s is distances[i-1][j-1] at the beginning of each round of iteration
            char_b = str_b[j - 1];
            c = min(
                s + if char_a == char_b { 0 } else { 1 },
                min(c + 1, distances[j] + 1),
            );
            // c is updated to distances[i][j], and will thus become distances[i][j-1] for the next cell
            s = distances[j]; // here distances[j] means distances[i-1][j] because it has not been overwritten yet
            // s is updated to distances[i-1][j], and will thus become distances[i-1][j-1] for the next cell
            distances[j] = c; // now distances[j] is updated to distances[i][j], and will thus become distances[i-1][j] for the next ROW
        }
    }

    distances[n]
}
--------------------------------------------------------------------------------
/tests/uniquely_decodable_tests.rs:
--------------------------------------------------------------------------------
mod uniquely_decodable_tests {
    use tidy::display_information::uniquely_decodable::is_uniquely_decodable;

    #[test]
    fn can_determine_a_list_with_prefix_words_is_not_uniquely_decodable() {
        let list: Vec<String> = vec!["news", "newspaper", "paper", "elephant"]
            .iter()
            .map(|x| x.to_string())
            .collect();

        assert!(!is_uniquely_decodable(&list));

        let list2: Vec<String> = vec![
            "spill".to_string(),
            "sun".to_string(),
            "moved".to_string(),
            "spills".to_string(),
            "unmoved".to_string(),
        ];
        assert!(!is_uniquely_decodable(&list2));
    }

    #[test]
    fn can_determine_that_a_list_is_uniquely_decodable() {
        let list: Vec<String> = vec![
            "excursion",
            "friday",
            "gyration",
            "natural",
            "pentagon",
            "sheath",
            "silver",
            "starless",
            "underling",
            "unmarked",
            "untaxed",
            "zippy",
        ]
        .iter()
        .map(|w| w.to_string())
        .collect();
        assert!(is_uniquely_decodable(&list));
    }

    #[test]
    fn can_determine_binary_code_with_a_suffix_code_is_not_uniquely_decodable() {
        let list: Vec<String> = vec!["02", "12", "120", "20", "21"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        assert!(!is_uniquely_decodable(&list));
    }

    #[test]
    fn given_a_series_of_binary_codes_can_determine_which_are_uniquely_decodable() {
        let list: Vec<String> = vec!["0", "10", "110", "111"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        assert!(is_uniquely_decodable(&list));

        let list: Vec<String> = vec!["0", "10", "010", "101"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        assert!(!is_uniquely_decodable(&list));

        let list: Vec<String> = vec!["0", "01", "011", "0111"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        assert!(is_uniquely_decodable(&list));

        // '0, 1, 00, 11' is not a uniquely decodable code
        let list: Vec<String> = vec!["0", "1", "00", "11"]
            .iter()
            .map(|w| w.to_string())
            .collect();
        assert!(!is_uniquely_decodable(&list));
    }

    #[test]
    fn knows_that_a_fixed_length_code_is_uniquely_decodable() {
        let list: Vec<String> = vec![
            "buoy", "cote", "dads", "duel", "gale", "life", "lurk", "peer", "rain", "tong",
        ]
        .iter()
        .map(|w| w.to_string())
        .collect();
        assert!(is_uniquely_decodable(&list));
    }
}
--------------------------------------------------------------------------------
/tests/list_information_tests.rs:
--------------------------------------------------------------------------------
mod list_information_tests {
    use tidy::display_information::*;
    // use tidy::*;

    #[test]
    fn can_calculate_entropy_per_word_of_generated_list() {
        assert_eq!(calc_entropy_per_word(7_776), 12.92481250360578);
        assert_eq!(calc_entropy_per_word(16_103), 13.975041868009528);
        assert_eq!(calc_entropy_per_word(18_318), 14.160974374927935);
    }

    #[test]
    fn can_calculate_assumed_entropy_per_character_of_generated_list() {
        let list: Vec<String> = vec!["to", "canopy", "cold", "seasons", "fire", "Christmas"]
            .iter()
            .map(|x| x.to_string())
            .collect();
        assert_eq!(assumed_entropy_per_character(&list), 1.292481250360578);
    }

    #[test]
    fn can_calculate_mean_edit_distance() {
        let list: Vec<String> = vec![
            "bat", "cat", "rat", "hat", "mat", "tat", "fat", "oat", "pat", "sat", "vat",
        ]
        .iter()
        .map(|x| x.to_string())
        .collect();
        assert_eq!(find_mean_edit_distance(&list), 1.0);

        let list2: Vec<String> = vec!["abcd", "abce", "abxz"]
            .iter()
            .map(|x| x.to_string())
            .collect();
        assert_eq!(find_mean_edit_distance(&list2), 1.6666666666666667);

        let list3: Vec<String> = vec!["abcd", "abce", "abxz", "abpt"]
            .iter()
            .map(|x| x.to_string())
            .collect();
        assert_eq!(find_mean_edit_distance(&list3), (11.0 / 6.0) as f64);
    }

    #[test]
    fn can_find_first_different_character() {
        assert_eq!(
            find_first_different_character_zero_indexed("apple", "zebra"),
            0
        );
        assert_eq!(
            find_first_different_character_zero_indexed("berry", "bicker"),
            1
        );
        assert_eq!(
            find_first_different_character_zero_indexed("hello", "help"),
            3
        );
        assert_eq!(
            find_first_different_character_zero_indexed("radius", "radical"),
            4
        );
        assert_eq!(
            find_first_different_character_zero_indexed("zip", "zippy"),
            3
        );
        assert_eq!(
            find_first_different_character_zero_indexed("zippy", "zip"),
            3
        );
    }

    #[test]
    fn can_find_longest_shared_prefix_in_a_list() {
        let list: Vec<String> = vec![
            "to",
            "canopy",
            "cold",
            "academia",
            "academic",
            "seasons",
            "fire",
            "Christmas",
        ]
        .iter()
        .map(|x| x.to_string())
        .collect();
        assert_eq!(find_longest_shared_prefix(&list, None), 7);

        let list: Vec<String> = vec!["to", "canopy", "cancel", "seasons", "fire", "Christmas"]
            .iter()
            .map(|x| x.to_string())
            .collect();
        assert_eq!(find_longest_shared_prefix(&list, None), 3);
    }
    #[test]
    fn can_get_shortest_word_length() {
        let list: Vec<String> = vec!["canopy", "to", "cold", "seasons", "fire", "Christmas"]
"Christmas"] 98 | .iter() 99 | .map(|x| x.to_string()) 100 | .collect(); 101 | assert_eq!(get_shortest_word_length(&list), 2); 102 | } 103 | #[test] 104 | fn can_get_mean_word_length() { 105 | let list: Vec = vec!["canopy", "to", "cold", "seasons", "fire", "Christmas"] 106 | .iter() 107 | .map(|x| x.to_string()) 108 | .collect(); 109 | assert_eq!(mean_word_length(&list), 5.3333335); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.3.17 2 | * Uses new version of `cargo-dist` (v0.28.2) to cut a fresh release, including binaries. 3 | 4 | # v0.3.16 5 | * Fixes error when parsing system language/locale! Use "-" rather than underscores ("\_")! 6 | 7 | # v0.3.15 8 | **WARNING:** Do NOT use this release. Has bug when parsing detected system language. 9 | * Upgrades `icu` crate dependency to latest version, v2.0. Completes #61. 10 | 11 | # v0.3.14 12 | * Upgrade to Rust edition 2024. 13 | * Reformats entire codebase. 14 | 15 | # v0.3.13 16 | 17 | * Upgrades rand and itertools dependencies ([#59](https://github.com/sts10/tidy/pull/59)) 18 | * Uses latest version of cargo-dist to create new release 19 | 20 | # v0.3.12 21 | 22 | * Upgrades some dependencies, including the version of cargo-dist used. 23 | 24 | # v0.3.10 25 | 26 | * Adds a new option to sort words by length (`--sort-by-length`). (Thanks to @latk for help with this work.) See [this blog post](https://sts10.github.io/2024/07/06/double-sorting.html) for more about this new feature. 27 | 28 | # v0.3.9 29 | 30 | * Update all dependencies that have new versions 31 | * Uses version 0.14.1 of cargo-dist to create release binaries and a shell installation script. 32 | 33 | # v0.3.8 34 | 35 | * Uses version 0.8.0 of cargo-dist to create release binaries and a shell installation script. 36 | # v0.3.8 37 | 38 | * Uses version 0.8.0 of cargo-dist to create release binaries and a shell installation script. 39 | 40 | # v0.3.7 41 | 42 | First release using [cargo-dist](https://opensource.axo.dev/cargo-dist/). Should create binaries for Mac and Windows users. Cool! 43 | 44 | # v0.3.0 45 | 46 | The big new feature in this release is that users can optionally print attributes and word samples in JSON format. 47 | 48 | ## Changes 49 | * d06d1ea - Uses an enum for result of Kraft-McMillan Inequality 50 | * abe465d - only calculates longest word length once, in order to be more efficient 51 | * a979645 - brings help text up to date with JSON feature 52 | * fdf4071 - print word samples within JSON output 53 | * dad0cd6 - gives credit back to Kraft! 54 | * f77ec28 - more concise creation of `ListAttributes` object. Also think I made the shared prefix calculation a bit faster 55 | * 8549df7 - make shared prefix optional, since it takes a while 56 | * 95d72b6 - improves the descriptiveness of a function name 57 | * 4fed268 - fixes spelling of 'unique' in new display attributes code 58 | * b07f7dc - puts `ListAttributes` into a new enum, adds feature of printing list attributes in JSON 59 | 60 | # v0.2.91 61 | 62 | Mostly housekeeping in this release. 

* 0a6a78b - moves Shannon line boolean attribute behind 5 As rather than 4, since it's a pretty dubious attribute at this point
* 67ab0ca - adds link to NSA's password generator and its word list
* d3f3549 - fixes mistake in explanation of unique decodability in readme
* dc4828e - adds some metadata to Cargo.toml for thoroughness
* 80181b0 - adds upgrade and uninstall information to the readme
* 84bf97a - updates word sample language in readme

# v0.2.90

The big change in this release is that Tidy now performs Schlinkert pruning both on the list as given _and_ on the list where every word is reversed.

Performing the Schlinkert prune on the reversed words is equivalent to using prefix words in the Sardinas-Patterson algorithm, rather than suffix words. Tidy now tries both, preferring whichever process saves more words from the original list. This is the case on the BIP39 English word list. See #43 for more information.

## Commits with major changes
* 1de5d1c - adds a test to make sure Tidy runs Schlinkert pruning the reversed list
* be38459 - when reversing words before doing the Schlinkert prune, use graphemes rather than characters to better attempt to handle accented characters and emoji
* 8ac7782 - executes Schlinkert prune in both directions, then prefer whichever saves the most words
* d681136 - Adds deny.toml to ease compatibility checks
* 24063ce - doesn't print a space after 6th word of each sample

Also various function and variable renaming for clarity and, as usual, other updates to the README.
--------------------------------------------------------------------------------
/src/file_writer.rs:
--------------------------------------------------------------------------------
use crate::cards::print_as_cards;
use crate::dice::print_as_dice;
use crate::display_information::display_list_information;
use std::fs::File;
use std::io::Write;
use std::path::PathBuf;

#[derive(Default, Debug, Clone)]
pub struct PrintRequest {
    pub tidied_list: Vec<String>,
    pub dry_run: bool,
    pub quiet: bool,
    pub output: Option<PathBuf>,
    pub dice_sides: Option<u8>,
    pub cards: bool,
    pub print_dice_sides_as_their_base: bool,
    pub attributes: u8,
    pub attributes_as_json: bool,
    pub samples: bool,
    pub ignore_before_delimiter: Option<char>,
    pub ignore_after_delimiter: Option<char>,
}

/// Print to terminal or file
pub fn print_list(print_req: PrintRequest) {
    if !print_req.quiet {
        if print_req.tidied_list.is_empty() {
            eprintln!(
                "WARNING: All words removed (tidied list is empty). Check inputted list and given options."
            );
        } else if !print_req.dry_run {
            eprintln!("Printing new list...");
        }
    }
    if !print_req.dry_run {
        match print_req.output {
            Some(output) => {
                // Print to file
                print_list_to_file(
                    &print_req.tidied_list,
                    output,
                    print_req.cards,
                    print_req.dice_sides,
                    print_req.print_dice_sides_as_their_base,
                );
            }
            // If no output file destination, print resulting list, word by word,
            // to println (which goes to stdout, allowing use of > on command line)
            None => {
                for (i, word) in print_req.tidied_list.iter().enumerate() {
                    if let Some(dice_sides) = print_req.dice_sides {
                        print!(
                            "{:}\t",
                            print_as_dice(
                                i,
                                dice_sides,
                                print_req.tidied_list.len(),
                                print_req.print_dice_sides_as_their_base
                            )
                        );
                    } else if print_req.cards {
                        print!("{:}\t", print_as_cards(i, print_req.tidied_list.len()));
                    }
                    println!("{}", word);
                }
            }
        }
    }
    if !print_req.quiet {
        if !print_req.dry_run && !print_req.tidied_list.is_empty() {
            eprintln!("\nDone making list.");
        } else if print_req.dry_run {
            eprintln!("Dry run complete");
        }
        if print_req.attributes > 0 || print_req.samples {
            display_list_information(
                &print_req.tidied_list,
                print_req.attributes,
                print_req.attributes_as_json,
                print_req.ignore_after_delimiter,
                print_req.ignore_before_delimiter,
                print_req.samples,
            );
        }
    }
}

fn print_list_to_file(
    tidied_list: &[String],
    output: PathBuf,
    cards: bool,
    dice_sides: Option<u8>,
    print_dice_sides_as_their_base: bool,
) {
    let mut f = File::create(output).expect("Unable to create file");
    for (i, word) in tidied_list.iter().enumerate() {
        // If user set a number of dice_sides, we'll add the appropriate
        // dice roll information, then a tab, then the word.
        if let Some(dice_sides) = dice_sides {
            write!(
                f,
                "{}\t",
                print_as_dice(
                    i,
                    dice_sides,
                    tidied_list.len(),
                    print_dice_sides_as_their_base
                ),
            )
            .expect("Unable to write dice roll to file");
        } else if cards {
            write!(f, "{}\t", print_as_cards(i, tidied_list.len()))
                .expect("Unable to write corresponding card to file");
        }

        writeln!(f, "{}", word).expect("Unable to write word to file");
    }
}
--------------------------------------------------------------------------------
/src/parsers.rs:
--------------------------------------------------------------------------------
/// Parse user's input for a handful of options, either directly as a `usize`
/// or, if they entered Python exponent notation (`base**exponent`), by
/// evaluating the exponent. Either way, return a `Result` containing a
/// `usize` or an error `String`.
///
/// This is useful when making lists fit to a specific amount of dice and
/// dice sides. (As an example, five rolls of a six-sided dice would be: 6**5.)
pub fn eval_list_length(input: &str) -> Result<usize, String> {
    match input.split("**").collect::<Vec<&str>>().as_slice() {
        [] => Err("Please specify a number.".to_string()),
        [num_string] => num_string.parse::<usize>().map_err(|_| {
            format!(
                "Unable to parse input {}. Enter a number or a base**exponent",
                input
            )
        }),
        [base_string, exponent_string] => {
            let base: usize = base_string
                .parse::<usize>()
                .map_err(|_| format!("Unable to parse input {}. Positive integers only.", input))?;
            let exponent: u32 = exponent_string
                .parse::<u32>()
                .map_err(|_| format!("Unable to parse input {}. Positive integers only.", input))?;
            Ok(base.pow(exponent))
        }
        _ => Err("You can only specify one exponent! Use format: base**exponent".to_string()),
    }
}

use crate::TidyRequest;
use crate::split_and_vectorize;
pub fn parse_whittle_options(
    mut this_tidy_request: TidyRequest,
    whittle_to_s: Option<String>,
) -> Result<(TidyRequest, Option<usize>, Option<usize>), String> {
    match whittle_to_s {
        Some(whittle_to_string) => {
            // Some whittle_to String has been provided, which we need to do a lot of work for
            // First, parse length_to_whittle_to
            let length_to_whittle_to =
                eval_list_length(split_and_vectorize(&whittle_to_string, ",")[0]).unwrap();
            // Determine initial starting point
            let starting_point = if split_and_vectorize(&whittle_to_string, ",").len() == 2 {
                // If user gave us one, use that.
                split_and_vectorize(&whittle_to_string, ",")[1]
                    .parse::<usize>()
                    .unwrap_or((length_to_whittle_to as f64 * 1.4) as usize)
            } else {
                // If not, start with length_to_whittle_to * 1.4 as a decent opening guess.
                // Effectively this assumes we'll cut about 40% of words in most
                // Tidy runs.
                (length_to_whittle_to as f64 * 1.4) as usize
            };
            // It's possible that our derived starting_point is higher than the length
            // of our inputted_word_list. If that's the case, reset starting_point
            // to that length.
            let starting_point = if starting_point > this_tidy_request.list.len() {
                this_tidy_request.list.len()
            } else {
                // If not, we're good. Let the given starting_point pass through.
                starting_point
            };

            // Another potential issue: User is asking for too many words, given the length
            // of the inputted_word_list (which would be a problem!)
            if length_to_whittle_to > this_tidy_request.list.len() {
                let error_msg = format!(
                    "ERROR: Cannot make a list of {} words from the inputted list(s), given the selected options. Please try again, either by changing options or inputting more words.",
                    length_to_whittle_to
                );
                return Err(error_msg);
            }

            // Give user a heads up that we're working on it.
            eprintln!(
                "Whittling list to {} words. This may take a moment...",
                length_to_whittle_to
            );

            // When whittling, confidently overwrite a few request parameters
            this_tidy_request.take_first = Some(starting_point);
            this_tidy_request.take_rand = None;
            this_tidy_request.print_rand = None;
            this_tidy_request.print_first = None;

            Ok((
                this_tidy_request,
                Some(length_to_whittle_to),
                Some(starting_point),
            ))
        }
        None => Ok((this_tidy_request, None, None)),
    }
}
--------------------------------------------------------------------------------
/src/dice.rs:
--------------------------------------------------------------------------------
use radix_fmt::*; // https://stackoverflow.com/a/50278316
/// Print dice rolls before each corresponding word. Note
/// that the `n` parameter should be zero-indexed. A tab (`\t`)
/// is printed between the dice roll and the word.
///
/// The `base` parameter represents the number of sides of the
/// dice, which can be set from 2 to 36.
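/// When `use_letters` is false, bases 10 through 36 print each digit
/// through the 1-indexed letter-to-number mapping in `char_to_digit` below.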
///
/// Here's an example of an outputted word list with base 6:
/// ```text
/// 11111 aback
/// 11112 abandons
/// 11113 abated
/// 11114 abbey
/// 11115 abbot
/// 11116 abbreviated
/// 11121 abdomen
/// 11122 abducted
/// 11123 aberrant
/// 11124 abide
/// 11125 ability
/// 11126 abject
/// 11131 abnormally
/// // etc.
/// ```
///
/// If this base is between 2 and 9,
/// this function assumes the user will be using actual dice, which are indexed at 1.
/// Thus, `if 2 <= base && base <= 9`, we add `1` to each digit of the dice
/// roll before printing it.
///
/// I wish I could replicate this radix function easily without the dependency,
/// but that doesn't seem [very easy](https://stackoverflow.com/a/50278316).
pub fn print_as_dice(n: usize, base: u8, list_length: usize, use_letters: bool) -> String {
    // Set width for zero-padding

    // First, get the literal width of the largest number we'll be printing.
    // This is, by definition, the length of the list.
    // We want the length of the number in the base we want to print all
    // the numbers in, so use the radix function.
    let n_as_base = radix(n, base);

    // Pad dice roll numbers with zeros
    let n_width = n_as_base.to_string().len();
    let pad_width = radix(list_length - 1, base).to_string().len();

    let mut padded_n = String::new();
    for _i in n_width..pad_width {
        padded_n.push('0');
    }
    // Now that we have the appropriate number of zeros
    // in `padded_n`, it's time to add our number
    padded_n += &n_as_base.to_string();

    // Print the dice rolls in slightly different ways,
    // depending on the value of the base.
    if use_letters {
        // We'll use zero-indexed values if sides_as_letters is
        // selected
        match base {
            // Values of 0 and 1 should have been caught earlier,
            // so we'll panic! if we have them here
            0 | 1 => panic!("Too few dice sides entered"),
            2..=36 => padded_n
                .chars()
                .map(|ch| ch.to_string().to_uppercase())
                .collect::<String>()
                .trim()
                .to_string(),
            _ => panic!("Amount of dice sides received is too high"),
        }
    } else {
        // We'll use 1-indexed values if sides_as_letters is NOT
        // selected
        match base {
            0 | 1 => panic!("Too few dice sides entered"),
            2..=9 => padded_n
                .chars()
                .map(|ch| (ch.to_string().parse::<u8>().unwrap() + 1).to_string())
                .collect::<String>(),
            10..=36 => padded_n
                .chars()
                .map(|ch| char_to_digit(ch) + "-")
                .collect::<String>()
                .trim_end_matches('-')
                .trim()
                .to_string(),
            _ => panic!("Amount of dice sides received is too high"),
        }
    }
}

/// Convert a 0-z inputted character to a 1-indexed, padded string ("01" to "36")
fn char_to_digit(ch: char) -> String {
    match ch {
        '0' => "01",
        '1' => "02",
        '2' => "03",
        '3' => "04",
        '4' => "05",
        '5' => "06",
        '6' => "07",
        '7' => "08",
        '8' => "09",
        '9' => "10",
        'a' => "11",
        'b' => "12",
        'c' => "13",
        'd' => "14",
        'e' => "15",
        'f' => "16",
        'g' => "17",
        'h' => "18",
        'i' => "19",
        'j' => "20",
        'k' => "21",
        'l' => "22",
        'm' => "23",
        'n' => "24",
        'o' => "25",
        'p' => "26",
        'q' => "27",
        'r' => "28",
        's' => "29",
        't' => "30",
        'u' => "31",
        'v' => "32",
        'w' => "33",
        'x' => "34",
        'y' => "35",
        'z' => "36",
        _ => panic!("Unable to convert this dice number from a letter to a number."),
    }
    .to_string()
}
--------------------------------------------------------------------------------
/src/input_validations.rs:
--------------------------------------------------------------------------------
pub fn validate_dice_sides(dice_sides: Option<u8>) -> Result<(), &'static str> {
    if let Some(dice_sides) = dice_sides {
        if !(2 <= dice_sides && dice_sides <= 36) {
            return Err("Error: Specified number of dice sides must be between 2 and 36.");
        }
    }
    Ok(())
}

pub fn validate_list_truncation_options(
    whittle_to: &Option<String>,
    cut_to: Option<usize>,
    take_first: Option<usize>,
    take_rand: Option<usize>,
) -> Result<(), &'static str> {
    // Check for invalid whittle_to requests
    if whittle_to.is_some() && cut_to.is_some() {
        Err(
            "Error: Can not specify BOTH a 'cut to' and 'whittle to' option. Please only use one of these two.",
        )
    } else if whittle_to.is_some() && (take_first.is_some() || take_rand.is_some()) {
        Err(
            "Error: Can not specify BOTH a 'whittle to' amount and a 'take first' or 'take rand' amount. Please only specify a whittle-to amount or a take amount.",
        )
    } else {
        Ok(())
    }
}

use crate::TidyRequest;
pub fn validate_and_parse_ignore_options(
    this_tidy_request: &TidyRequest,
    dice_sides: Option<u8>,
    print_dice_sides_as_their_base: bool,
) -> Result<(Option<char>, Option<char>), &'static str> {
    // Warn about the (many!) current limitations of the 'ignore' options
    match (
        this_tidy_request.ignore_after_delimiter,
        this_tidy_request.ignore_before_delimiter,
    ) {
        // If given both an after_delimiter and a before_delimiter, error out nicely.
        (Some(_after_delimiter), Some(_before_delimiter)) => {
            let err_message = "Can't ignore metadata on both sides.";
            Err(err_message)
        }
        // No ignore delimiters given, so just return None to both
        // variables.
        (None, None) => Ok((None, None)),
        // An after_delimiter given, but not a before_delimiter
        (Some(after_delimiter), None) => {
            if this_tidy_request.to_lowercase
                || this_tidy_request.should_straighten_quotes
                || this_tidy_request.should_remove_prefix_words
                || this_tidy_request.should_remove_suffix_words
                || this_tidy_request.should_schlinkert_prune
                || this_tidy_request.should_delete_nonalphanumeric
                || this_tidy_request.should_delete_integers
                || this_tidy_request
                    .should_delete_before_first_delimiter
                    .is_some()
                || this_tidy_request
                    .should_delete_after_first_delimiter
                    .is_some()
                || this_tidy_request.minimum_edit_distance.is_some()
                || this_tidy_request.maximum_shared_prefix_length.is_some()
                || this_tidy_request.homophones_list.is_some()
                || dice_sides.is_some()
                || print_dice_sides_as_their_base
            {
                let err_message = "--ignore-after option does not work with one of the other options you selected. Please change options. Exiting";
                Err(err_message)
            } else {
                Ok((Some(after_delimiter), None))
            }
        }
        // No after_delimiter given, but a before_delimiter has been given
        (None, Some(before_delimiter)) => {
            if this_tidy_request.to_lowercase
                || this_tidy_request.should_straighten_quotes
                || this_tidy_request.should_remove_prefix_words
                || this_tidy_request.should_remove_suffix_words
                || this_tidy_request.should_schlinkert_prune
                || this_tidy_request.should_delete_nonalphanumeric
                || this_tidy_request.should_delete_integers
                || this_tidy_request
                    .should_delete_before_first_delimiter
                    .is_some()
                || this_tidy_request
                    .should_delete_after_first_delimiter
                    .is_some()
                || this_tidy_request.minimum_edit_distance.is_some()
                || this_tidy_request.maximum_shared_prefix_length.is_some()
                || this_tidy_request.homophones_list.is_some()
                || dice_sides.is_some()
                || print_dice_sides_as_their_base
            {
                let err_message = "--ignore-before option does not work with one of the other options you selected. Please change options. Exiting";
                Err(err_message)
            } else {
                Ok((None, Some(before_delimiter)))
            }
        }
    }
}
--------------------------------------------------------------------------------
/src/file_readers.rs:
--------------------------------------------------------------------------------
use crate::split_and_vectorize;
use std::fs::File;
use std::io::BufRead;
use std::io::BufReader;
use std::path::PathBuf;

/// Takes a slice of `PathBuf`s representing the word list(s)
/// that the user has inputted to the program. Then iterates
/// through each file and adds each line to a `Vec<String>`. (Blank
/// lines and duplicate lines will be handled elsewhere.)
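///
/// A usage sketch (the file paths here are hypothetical):
/// ```no_run
/// use std::path::PathBuf;
/// use tidy::file_readers::make_vec_from_filenames;
///
/// // Read two lists, skipping the first line of each file.
/// let words = make_vec_from_filenames(
///     &[PathBuf::from("list_a.txt"), PathBuf::from("list_b.txt")],
///     Some(1),
///     None,
/// );
/// ```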
pub fn make_vec_from_filenames(
    filenames: &[PathBuf],
    skip_rows_start: Option<usize>,
    skip_rows_end: Option<usize>,
) -> Vec<String> {
    let mut word_lists_by_file: Vec<Vec<String>> = [].to_vec();
    for filename in filenames {
        let f = match File::open(filename) {
            Ok(file) => file,
            Err(e) => panic!("Error opening file {:?}: {}", filename, e),
        };
        let file = BufReader::new(&f);
        let mut raw_lines = vec![];
        for line in file.lines() {
            let l = match line {
                Ok(l) => l,
                Err(e) => {
                    eprintln!(
                        "Error reading a line from file {:?}: {}\nWill continue reading file.",
                        filename, e
                    );
                    continue;
                }
            };
            raw_lines.push(l);
        }
        let size_of_raw_lines = raw_lines.len();
        let mut word_list_from_this_file = [].to_vec();
        for (line_number, line) in raw_lines.into_iter().enumerate() {
            match (skip_rows_start, skip_rows_end) {
                (Some(skip_rows_start), Some(skip_rows_end)) => {
                    if line_number >= skip_rows_start
                        && line_number < size_of_raw_lines - skip_rows_end
                    {
                        word_list_from_this_file.push(line);
                    }
                }
                (Some(skip_rows_start), None) => {
                    if line_number >= skip_rows_start {
                        word_list_from_this_file.push(line);
                    }
                }
                (None, Some(skip_rows_end)) => {
                    if line_number < size_of_raw_lines - skip_rows_end {
                        word_list_from_this_file.push(line);
                    }
                }
                (None, None) => word_list_from_this_file.push(line),
            }
        }
        word_lists_by_file.push(word_list_from_this_file);
    }
    // Finally, "blend" words into one Vec
    blend(&word_lists_by_file)
}

/// "Blend" words together one at a time, like dealing cards in reverse
pub fn blend(word_lists_by_file: &[Vec<String>]) -> Vec<String> {
    let mut size_of_longest_vector = 0;
    for word_list in word_lists_by_file {
        if size_of_longest_vector < word_list.len() {
            size_of_longest_vector = word_list.len();
        }
    }
    // "Blend" words together one at a time, like dealing cards in reverse
    let mut blended = [].to_vec();
    for i in 0..size_of_longest_vector {
        for list in word_lists_by_file {
            if list.len() > i {
                // Dunno how to not call clone here...
                blended.push(list[i].clone());
            }
        }
    }
    blended
}

/// Like `make_vec_from_filenames`, this function takes a slice of `PathBuf`s of
/// files. But in this case these files represent lists of homophones that the
/// user wants to make sure aren't both on the resulting list.
///
/// These homophone files are expected to be formatted such that each line of the file
/// is `homophone1,homophone2`.
///
/// This function produces a Vector of tuples of strings, representing the
/// homophone pairs.
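///
/// A usage sketch (the filename is hypothetical):
/// ```no_run
/// use std::path::PathBuf;
/// use tidy::file_readers::read_homophones_list_from_filenames;
///
/// // A file with a line like "sun,son" yields the pair
/// // ("sun".to_string(), "son".to_string()).
/// let pairs = read_homophones_list_from_filenames(&[PathBuf::from("homophones.txt")]);
/// ```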
pub fn read_homophones_list_from_filenames(filenames: &[PathBuf]) -> Vec<(String, String)> {
    let mut homophones_list: Vec<(String, String)> = vec![];
    for filename in filenames {
        let f = match File::open(filename) {
            Ok(file) => file,
            Err(e) => panic!("Error opening file {:?}: {}", filename, e),
        };
        let file = BufReader::new(&f);
        for line in file.lines() {
            let l = match line {
                Ok(l) => l,
                Err(e) => {
                    eprintln!(
                        "Error reading a line from file {:?}: {}\nWill continue reading file.",
                        filename, e
                    );
                    continue;
                }
            };
            let pair: (String, String) = (
                split_and_vectorize(&l, ",")[0].trim().to_string(),
                split_and_vectorize(&l, ",")[1].trim().to_string(),
            );
            homophones_list.push(pair);
        }
    }
    homophones_list
}
--------------------------------------------------------------------------------
/wordlists-to-tidy.markdown:
--------------------------------------------------------------------------------
# Where can I find some word lists?

## Diceware lists (generally 7,776 words long)
- The [Electronic Frontier Foundation](https://www.eff.org/) has published [a few word lists for creating diceware passphrases](https://www.eff.org/deeplinks/2016/07/new-wordlists-random-passphrases).
- I'm pretty sure the password manager Bitwarden uses [the EFF long list](https://www.eff.org/files/2016/07/18/eff_large_wordlist.txt).
- [KeePassXC](https://keepassxc.org/) uses [the EFF long list with some minor modifications](https://github.com/keepassxreboot/keepassxc/blob/develop/share/wordlists/eff_large.wordlist).
- Note: These lists often have a tab between the dice numbers and each word. Tidy can delete the dice numbers easily with something like `tidy -D t -o clean_eff.txt eff_large_wordlist.txt` or using the `-i` flag.
- [ulif's "diceware"](https://github.com/ulif/diceware) seems to have collected [a few word lists](https://github.com/ulif/diceware/tree/master/diceware/wordlists) in its Github repo, along with [a separate page that explains each of the lists](https://github.com/ulif/diceware/blob/master/docs/wordlists.rst).
- [dmuth's "diceware" program](https://github.com/dmuth/diceware) has a [collection of lists](https://github.com/dmuth/diceware/tree/master/wordlist) (h/t [atoponce](https://www.reddit.com/r/Passwords/comments/sqrymt/comment/hwnfb94/))
- The [original "Reinhold" diceware list](https://theworld.com/%7Ereinhold/diceware.wordlist.asc), created by [Arnold Reinhold](https://theworld.com/~reinhold/), though it has some issues.
- Arnold Reinhold hosts [diceware lists in a variety of languages](https://theworld.com/~reinhold/diceware.html#Diceware%20in%20Other%20Languages|outline).

## Short word lists
- [Bitcoin BIP-0039](https://github.com/bitcoin/bips/tree/master/bip-0039) (2,048 words) (h/t [atoponce](https://www.reddit.com/r/Passwords/comments/sqrymt/comment/hwnfb94/))
- [Monero's word list](https://github.com/monero-project/monero/blob/master/src/mnemonics/english.h) (1,626 words) (h/t [atoponce](https://www.reddit.com/r/Passwords/comments/sqrymt/comment/hwnfb94/))
- [Mnemonicode](https://github.com/schollz/mnemonicode/blob/master/word_list.go) is another word list optimized for pronunciation. I believe [croc](https://github.com/schollz/croc), a file transferring tool, uses it.
- [Magic Wormhole](https://github.com/magic-wormhole/magic-wormhole/), a tool for transferring files, uses [a version of the PGP Word List](https://github.com/magic-wormhole/magic-wormhole/blob/master/src/wormhole/_wordlist.py), which specifically tries to use pairs of words that are phonetically distinct.
- [Wagashi](https://codeberg.org/azukitofu/wagashi) is a set of new, short word lists in English. They also offer emoji lists.
- [Session's English word list](https://github.com/oxen-io/session-desktop/blob/unstable/mnemonic_languages/english.json)
- [simple1024](https://github.com/pera/simple1024) is a word list of 1,024 common English words, an alternative to the EFF's short word lists.

## Pretty long word lists
- If you're using Linux or MacOS, you've likely got some long lists on your computer. Check `/usr/share/dict/words` or `/usr/share/dict/american-english`.
- [NSA's RandPassGenerator](https://github.com/nsacyber/RandPassGenerator) uses [a massive 117,828-word list](https://github.com/nsacyber/RandPassGenerator/blob/master/RandPassGenerator/data/wordlist.txt).
- [Niceware list](https://github.com/diracdeltas/niceware/blob/master/lib/wordlist.js) (~65,000 words). [I used Tidy to help create v4.0.0 of this list](https://github.com/diracdeltas/niceware/pull/52)!
- [Norvig Natural Language Corpus Data](https://norvig.com/ngrams/) has [a list of 333,000 commonly used words](https://norvig.com/ngrams/count_1w.txt) from the Google Web Trillion Word Corpus, as well as an assortment of other word lists.
- [British National Corpus (BNC) database and word frequency lists](https://www.kilgarriff.co.uk/bnc-readme.html)
- [Lists used by a program called webpassgen](https://github.com/atoponce/webpassgen/tree/master/lists)
- [SCOWL (Spell Checker Oriented Word Lists) and Friends](http://wordlist.aspell.net/) is "a database of information on English words useful for creating high-quality word lists suitable for use in spell checkers of most dialects of English."
- [ENABLE2K](https://web.archive.org/web/20090122025747/http://personal.riverusers.com/~thegrendel/software.html) seems to be an older version of the SCOWL project?

## Collections of word lists
- A collection of a few [Public Domain Word Lists](https://github.com/MichaelWehar/Public-Domain-Word-Lists)
- [**A great list of word lists** by Aaron Toponce](https://gist.github.com/atoponce/95c4f36f2bc12ec13242a3ccc55023af).
- [A list of word lists](http://www.webplaces.com/passwords/passphrase-word-lists.htm).
- [Danish wordlists](https://github.com/n0kovo/danish-wordlists) is a "collection of [Danish] wordlists for cracking danish passwords"
- [r/wordlists subreddit](https://www.reddit.com/r/wordlists/), which seems to have links to a few non-English word lists.
- You can also scan [GitHub's #wordlists topic](https://github.com/topics/wordlists)
- An XKCD-inspired passphrase generator with [a collection of non-English word lists](https://github.com/redacted/XKCD-password-generator/tree/master/xkcdpass/static).

## Various
- The EFF also has some [fandom-inspired lists](https://www.eff.org/deeplinks/2018/08/dragon-con-diceware). They use a space between dice numbers and words, so Tidy can clean up with the `-D s` option. I prefer [Aaron Toponce's proposed _new_ fandom word lists](https://github.com/sts10/new-fandom-wordlists).
- I'm pretty sure this is [1Password](https://1password.com/)'s [word list](https://1password.com/txt/agwordlist.txt) as of 2021.
- 1Password published a few slightly different word lists ([one](https://github.com/1Password/spg/blob/master/testdata/agwordlist.txt), [two](https://github.com/agilebits/crackme/blob/master/doc/AgileWords.txt)) in 2018.
- [SecureDrop](https://github.com/freedomofpress/securedrop/) has a few lists, including one of [adjectives](https://github.com/freedomofpress/securedrop/blob/develop/securedrop/dictionaries/adjectives.txt) and one of [nouns](https://github.com/freedomofpress/securedrop/blob/develop/securedrop/dictionaries/nouns.txt).
- [Jitsi](https://meet.jit.si/) has [lists of nouns, verbs, adjectives, and adverbs](https://github.com/jitsi/js-utils/blob/1c57316514a602f3888f4aafb047e8288066186e/random/roomNameGenerator.js) for generating random room names.
- [A German word list that looks promising](https://github.com/martinhoefling/goxkcdpwgen/blob/master/wordlists/de-7776-v1-diceware.txt)

## Shameless plug

- I used Tidy to create [the Orchard Street Wordlists](https://github.com/sts10/orchard-street-wordlists) ([as well as a few other word lists](https://github.com/sts10/generated-wordlists)).
--------------------------------------------------------------------------------
/deny.toml:
--------------------------------------------------------------------------------
# This template contains all of the possible sections and their default values

# Note that all fields that take a lint level have these possible values:
# * deny - An error will be produced and the check will fail
# * warn - A warning will be produced, but the check will not fail
# * allow - No warning or error will be produced, though in some cases a note
# will be

# The values provided in this template are the default values that will be used
# when any section or field is not specified in your own configuration

# Root options

# The graph table configures how the dependency graph is constructed and thus
# which crates the checks are performed against
[graph]
# If 1 or more target triples (and optionally, target_features) are specified,
# only the specified targets will be checked when running `cargo deny check`.
# This means, if a particular package is only ever used as a target specific
# dependency, such as, for example, the `nix` crate only being used via the
# `target_family = "unix"` configuration, that only having windows targets in
# this list would mean the nix crate, as well as any of its exclusive
# dependencies not shared by any other crates, would be ignored, as the target
# list here is effectively saying which targets you are building for.
targets = [
    # The triple can be any string, but only the target triples built in to
    # rustc (as of 1.40) can be checked against actual config expressions
    #"x86_64-unknown-linux-musl",
    # You can also specify which target_features you promise are enabled for a
    # particular target. target_features are currently not validated against
    # the actual valid features supported by the target architecture.
32 | #{ triple = "wasm32-unknown-unknown", features = ["atomics"] }, 33 | ] 34 | # When creating the dependency graph used as the source of truth when checks are 35 | # executed, this field can be used to prune crates from the graph, removing them 36 | # from the view of cargo-deny. This is an extremely heavy hammer, as if a crate 37 | # is pruned from the graph, all of its dependencies will also be pruned unless 38 | # they are connected to another crate in the graph that hasn't been pruned, 39 | # so it should be used with care. The identifiers are [Package ID Specifications] 40 | # (https://doc.rust-lang.org/cargo/reference/pkgid-spec.html) 41 | #exclude = [] 42 | # If true, metadata will be collected with `--all-features`. Note that this can't 43 | # be toggled off if true, if you want to conditionally enable `--all-features` it 44 | # is recommended to pass `--all-features` on the cmd line instead 45 | all-features = false 46 | # If true, metadata will be collected with `--no-default-features`. The same 47 | # caveat with `all-features` applies 48 | no-default-features = false 49 | # If set, these feature will be enabled when collecting metadata. If `--features` 50 | # is specified on the cmd line they will take precedence over this option. 51 | #features = [] 52 | 53 | # The output table provides options for how/if diagnostics are outputted 54 | [output] 55 | # When outputting inclusion graphs in diagnostics that include features, this 56 | # option can be used to specify the depth at which feature edges will be added. 57 | # This option is included since the graphs can be quite large and the addition 58 | # of features from the crate(s) to all of the graph roots can be far too verbose. 59 | # This option can be overridden via `--feature-depth` on the cmd line 60 | feature-depth = 1 61 | 62 | # This section is considered when running `cargo deny check advisories` 63 | # More documentation for the advisories section can be found here: 64 | # https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html 65 | [advisories] 66 | # The path where the advisory databases are cloned/fetched into 67 | #db-path = "$CARGO_HOME/advisory-dbs" 68 | # The url(s) of the advisory databases to use 69 | #db-urls = ["https://github.com/rustsec/advisory-db"] 70 | # A list of advisory IDs to ignore. Note that ignored advisories will still 71 | # output a note when they are encountered. 72 | ignore = [ 73 | #"RUSTSEC-0000-0000", 74 | #{ id = "RUSTSEC-0000-0000", reason = "you can specify a reason the advisory is ignored" }, 75 | #"a-crate-that-is-yanked@0.1.1", # you can also ignore yanked crate versions if you wish 76 | #{ crate = "a-crate-that-is-yanked@0.1.1", reason = "you can specify why you are ignoring the yanked crate" }, 77 | ] 78 | # If this is true, then cargo deny will use the git executable to fetch advisory database. 79 | # If this is false, then it uses a built-in git library. 80 | # Setting this to true can be helpful if you have special authentication requirements that cargo-deny does not support. 81 | # See Git Authentication for more information about setting up git authentication. 
82 | #git-fetch-with-cli = true 83 | 84 | # This section is considered when running `cargo deny check licenses` 85 | # More documentation for the licenses section can be found here: 86 | # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html 87 | [licenses] 88 | # List of explicitly allowed licenses 89 | # See https://spdx.org/licenses/ for list of possible licenses 90 | # [possible values: any SPDX 3.11 short identifier (+ optional exception)]. 91 | allow = [ 92 | "Apache-2.0", 93 | "MIT", 94 | "Unicode-DFS-2016", 95 | "Unicode-3.0", 96 | "Unlicense" 97 | ] 98 | # The confidence threshold for detecting a license from license text. 99 | # The higher the value, the more closely the license text must be to the 100 | # canonical license text of a valid SPDX license file. 101 | # [possible values: any between 0.0 and 1.0]. 102 | confidence-threshold = 0.8 103 | # Allow 1 or more licenses on a per-crate basis, so that particular licenses 104 | # aren't accepted for every possible crate as with the normal allow list 105 | exceptions = [ 106 | # Each entry is the crate and version constraint, and its specific allow 107 | # list 108 | #{ allow = ["Zlib"], crate = "adler32" }, 109 | ] 110 | 111 | # Some crates don't have (easily) machine readable licensing information, 112 | # adding a clarification entry for it allows you to manually specify the 113 | # licensing information 114 | #[[licenses.clarify]] 115 | # The package spec the clarification applies to 116 | #crate = "ring" 117 | # The SPDX expression for the license requirements of the crate 118 | #expression = "MIT AND ISC AND OpenSSL" 119 | # One or more files in the crate's source used as the "source of truth" for 120 | # the license expression. If the contents match, the clarification will be used 121 | # when running the license check, otherwise the clarification will be ignored 122 | # and the crate will be checked normally, which may produce warnings or errors 123 | # depending on the rest of your configuration 124 | #license-files = [ 125 | # Each entry is a crate relative path, and the (opaque) hash of its contents 126 | #{ path = "LICENSE", hash = 0xbd0eed23 } 127 | #] 128 | 129 | [licenses.private] 130 | # If true, ignores workspace crates that aren't published, or are only 131 | # published to private registries. 132 | # To see how to mark a crate as unpublished (to the official registry), 133 | # visit https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field. 134 | ignore = false 135 | # One or more private registries that you might publish crates to, if a crate 136 | # is only published to private registries, and ignore is true, the crate will 137 | # not have its license(s) checked 138 | registries = [ 139 | #"https://sekretz.com/registry 140 | ] 141 | 142 | # This section is considered when running `cargo deny check bans`. 
143 | # More documentation about the 'bans' section can be found here: 144 | # https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html 145 | [bans] 146 | # Lint level for when multiple versions of the same crate are detected 147 | multiple-versions = "warn" 148 | # Lint level for when a crate version requirement is `*` 149 | wildcards = "allow" 150 | # The graph highlighting used when creating dotgraphs for crates 151 | # with multiple versions 152 | # * lowest-version - The path to the lowest versioned duplicate is highlighted 153 | # * simplest-path - The path to the version with the fewest edges is highlighted 154 | # * all - Both lowest-version and simplest-path are used 155 | highlight = "all" 156 | # The default lint level for `default` features for crates that are members of 157 | # the workspace that is being checked. This can be overridden by allowing/denying 158 | # `default` on a crate-by-crate basis if desired. 159 | workspace-default-features = "allow" 160 | # The default lint level for `default` features for external crates that are not 161 | # members of the workspace. This can be overridden by allowing/denying `default` 162 | # on a crate-by-crate basis if desired. 163 | external-default-features = "allow" 164 | # List of crates that are allowed. Use with care! 165 | allow = [ 166 | #"ansi_term@0.11.0", 167 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is allowed" }, 168 | ] 169 | # List of crates to deny 170 | deny = [ 171 | #"ansi_term@0.11.0", 172 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason it is banned" }, 173 | # Wrapper crates can optionally be specified to allow the crate when it 174 | # is a direct dependency of the otherwise banned crate 175 | #{ crate = "ansi_term@0.11.0", wrappers = ["this-crate-directly-depends-on-ansi_term"] }, 176 | ] 177 | 178 | # List of features to allow/deny 179 | # Each entry the name of a crate and a version range. If version is 180 | # not specified, all versions will be matched. 181 | #[[bans.features]] 182 | #crate = "reqwest" 183 | # Features to not allow 184 | #deny = ["json"] 185 | # Features to allow 186 | #allow = [ 187 | # "rustls", 188 | # "__rustls", 189 | # "__tls", 190 | # "hyper-rustls", 191 | # "rustls", 192 | # "rustls-pemfile", 193 | # "rustls-tls-webpki-roots", 194 | # "tokio-rustls", 195 | # "webpki-roots", 196 | #] 197 | # If true, the allowed features must exactly match the enabled feature set. If 198 | # this is set there is no point setting `deny` 199 | #exact = true 200 | 201 | # Certain crates/versions that will be skipped when doing duplicate detection. 202 | skip = [ 203 | #"ansi_term@0.11.0", 204 | #{ crate = "ansi_term@0.11.0", reason = "you can specify a reason why it can't be updated/removed" }, 205 | ] 206 | # Similarly to `skip` allows you to skip certain crates during duplicate 207 | # detection. Unlike skip, it also includes the entire tree of transitive 208 | # dependencies starting at the specified crate, up to a certain depth, which is 209 | # by default infinite. 210 | skip-tree = [ 211 | #"ansi_term@0.11.0", # will be skipped along with _all_ of its direct and transitive dependencies 212 | #{ crate = "ansi_term@0.11.0", depth = 20 }, 213 | ] 214 | 215 | # This section is considered when running `cargo deny check sources`. 
216 | # More documentation about the 'sources' section can be found here:
217 | # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
218 | [sources]
219 | # Lint level for what to happen when a crate from a crate registry that is not
220 | # in the allow list is encountered
221 | unknown-registry = "warn"
222 | # Lint level for what to happen when a crate from a git repository that is not
223 | # in the allow list is encountered
224 | unknown-git = "warn"
225 | # List of URLs for allowed crate registries. Defaults to the crates.io index
226 | # if not specified. If it is specified but empty, no registries are allowed.
227 | allow-registry = ["https://github.com/rust-lang/crates.io-index"]
228 | # List of URLs for allowed Git repositories
229 | allow-git = []
230 | 
231 | [sources.allow-org]
232 | # 1 or more github.com organizations to allow git sources for
233 | github = [""]
234 | # 1 or more gitlab.com organizations to allow git sources for
235 | gitlab = [""]
236 | # 1 or more bitbucket.org organizations to allow git sources for
237 | bitbucket = [""]
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by dist: https://axodotdev.github.io/cargo-dist
2 | #
3 | # Copyright 2022-2024, axodotdev
4 | # SPDX-License-Identifier: MIT or Apache-2.0
5 | #
6 | # CI that:
7 | #
8 | # * checks for a Git Tag that looks like a release
9 | # * builds artifacts with dist (archives, installers, hashes)
10 | # * uploads those artifacts to a temporary workflow zip
11 | # * on success, uploads the artifacts to a GitHub Release
12 | #
13 | # Note that the GitHub Release will be created with a generated
14 | # title/body based on your changelogs.
15 | 
16 | name: Release
17 | permissions:
18 |   "contents": "write"
19 | 
20 | # This task will run whenever you push a git tag that looks like a version
21 | # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc.
22 | # Various formats will be parsed into a VERSION and an optional PACKAGE_NAME, where
23 | # PACKAGE_NAME must be the name of a Cargo package in your workspace, and VERSION
24 | # must be a Cargo-style SemVer Version (must have at least major.minor.patch).
25 | #
26 | # If PACKAGE_NAME is specified, then the announcement will be for that
27 | # package (erroring out if it doesn't have the given version or isn't dist-able).
28 | #
29 | # If PACKAGE_NAME isn't specified, then the announcement will be for all
30 | # (dist-able) packages in the workspace with that version (this mode is
31 | # intended for workspaces with only one dist-able package, or with all dist-able
32 | # packages versioned/released in lockstep).
33 | #
34 | # If you push multiple tags at once, separate instances of this workflow will
35 | # spin up, creating an independent announcement for each one. However, GitHub
36 | # will hard limit this to 3 tags per commit, as it will assume more tags is a
37 | # mistake.
38 | #
39 | # If there's a prerelease-style suffix to the version, then the release(s)
40 | # will be marked as a prerelease.
41 | on:
42 |   pull_request:
43 |   push:
44 |     tags:
45 |       - '**[0-9]+.[0-9]+.[0-9]+*'
46 | 
47 | jobs:
48 |   # Run 'dist plan' (or host) to determine what tasks we need to do
49 |   plan:
50 |     runs-on: "ubuntu-22.04"
51 |     outputs:
52 |       val: ${{ steps.plan.outputs.manifest }}
53 |       tag: ${{ !github.event.pull_request && github.ref_name || '' }}
54 |       tag-flag: ${{ !github.event.pull_request && format('--tag={0}', github.ref_name) || '' }}
55 |       publishing: ${{ !github.event.pull_request }}
56 |     env:
57 |       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
58 |     steps:
59 |       - uses: actions/checkout@v4
60 |         with:
61 |           persist-credentials: false
62 |           submodules: recursive
63 |       - name: Install dist
64 |         # we specify bash to get pipefail; it guards against the `curl` command
65 |         # failing. otherwise `sh` won't catch that `curl` returned non-0
66 |         shell: bash
67 |         run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.30.2/cargo-dist-installer.sh | sh"
68 |       - name: Cache dist
69 |         uses: actions/upload-artifact@v4
70 |         with:
71 |           name: cargo-dist-cache
72 |           path: ~/.cargo/bin/dist
73 |       # sure would be cool if github gave us proper conditionals...
74 |       # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible
75 |       # functionality based on whether this is a pull_request, and whether it's from a fork.
76 |       # (PRs run on the *source* but secrets are usually on the *target* -- that's *good*
77 |       # but also really annoying to build CI around when it needs secrets to work right.)
78 |       - id: plan
79 |         run: |
80 |           dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json
81 |           echo "dist ran successfully"
82 |           cat plan-dist-manifest.json
83 |           echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT"
84 |       - name: "Upload dist-manifest.json"
85 |         uses: actions/upload-artifact@v4
86 |         with:
87 |           name: artifacts-plan-dist-manifest
88 |           path: plan-dist-manifest.json
89 | 
90 |   # Build and package all the platform-specific things
91 |   build-local-artifacts:
92 |     name: build-local-artifacts (${{ join(matrix.targets, ', ') }})
93 |     # Let the initial task tell us to not run (currently very blunt)
94 |     needs:
95 |       - plan
96 |     if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }}
97 |     strategy:
98 |       fail-fast: false
99 |       # Target platforms/runners are computed by dist in create-release.
100 |       # Each member of the matrix has the following arguments:
101 |       #
102 |       # - runner: the github runner
103 |       # - dist-args: cli flags to pass to dist
104 |       # - install-dist: expression to run to install dist on the runner
105 |       #
106 |       # Typically there will be:
107 |       # - 1 "global" task that builds universal installers
108 |       # - N "local" tasks that build each platform's binaries and platform-specific installers
109 |       matrix: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix }}
110 |     runs-on: ${{ matrix.runner }}
111 |     container: ${{ matrix.container && matrix.container.image || null }}
112 |     env:
113 |       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
114 |       BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json
115 |     steps:
116 |       - name: enable windows longpaths
117 |         run: |
118 |           git config --global core.longpaths true
119 |       - uses: actions/checkout@v4
120 |         with:
121 |           persist-credentials: false
122 |           submodules: recursive
123 |       - name: Install Rust non-interactively if not already installed
124 |         if: ${{ matrix.container }}
125 |         run: |
126 |           if ! command -v cargo > /dev/null 2>&1; then
127 |             curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
128 |             echo "$HOME/.cargo/bin" >> $GITHUB_PATH
129 |           fi
130 |       - name: Install dist
131 |         run: ${{ matrix.install_dist.run }}
132 |       # Get the dist-manifest
133 |       - name: Fetch local artifacts
134 |         uses: actions/download-artifact@v4
135 |         with:
136 |           pattern: artifacts-*
137 |           path: target/distrib/
138 |           merge-multiple: true
139 |       - name: Install dependencies
140 |         run: |
141 |           ${{ matrix.packages_install }}
142 |       - name: Build artifacts
143 |         run: |
144 |           # Actually do builds and make zips and whatnot
145 |           dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json
146 |           echo "dist ran successfully"
147 |       - id: cargo-dist
148 |         name: Post-build
149 |         # We force bash here just because github makes it really hard to get values up
150 |         # to "real" actions without writing to env-vars, and writing to env-vars has
151 |         # inconsistent syntax between shell and powershell.
152 |         shell: bash
153 |         run: |
154 |           # Parse out what we just built and upload it to scratch storage
155 |           echo "paths<<EOF" >> "$GITHUB_OUTPUT"
156 |           dist print-upload-files-from-manifest --manifest dist-manifest.json >> "$GITHUB_OUTPUT"
157 |           echo "EOF" >> "$GITHUB_OUTPUT"
158 | 
159 |           cp dist-manifest.json "$BUILD_MANIFEST_NAME"
160 |       - name: "Upload artifacts"
161 |         uses: actions/upload-artifact@v4
162 |         with:
163 |           name: artifacts-build-local-${{ join(matrix.targets, '_') }}
164 |           path: |
165 |             ${{ steps.cargo-dist.outputs.paths }}
166 |             ${{ env.BUILD_MANIFEST_NAME }}
167 | 
168 |   # Build and package all the platform-agnostic(ish) things
169 |   build-global-artifacts:
170 |     needs:
171 |       - plan
172 |       - build-local-artifacts
173 |     runs-on: "ubuntu-22.04"
174 |     env:
175 |       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
176 |       BUILD_MANIFEST_NAME: target/distrib/global-dist-manifest.json
177 |     steps:
178 |       - uses: actions/checkout@v4
179 |         with:
180 |           persist-credentials: false
181 |           submodules: recursive
182 |       - name: Install cached dist
183 |         uses: actions/download-artifact@v4
184 |         with:
185 |           name: cargo-dist-cache
186 |           path: ~/.cargo/bin/
187 |       - run: chmod +x ~/.cargo/bin/dist
188 |       # Get all the local artifacts for the global tasks to use (for e.g. checksums)
189 |       - name: Fetch local artifacts
190 |         uses: actions/download-artifact@v4
191 |         with:
192 |           pattern: artifacts-*
193 |           path: target/distrib/
194 |           merge-multiple: true
195 |       - id: cargo-dist
196 |         shell: bash
197 |         run: |
198 |           dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json
199 |           echo "dist ran successfully"
200 | 
201 |           # Parse out what we just built and upload it to scratch storage
202 |           echo "paths<<EOF" >> "$GITHUB_OUTPUT"
203 |           jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT"
204 |           echo "EOF" >> "$GITHUB_OUTPUT"
205 | 
206 |           cp dist-manifest.json "$BUILD_MANIFEST_NAME"
207 |       - name: "Upload artifacts"
208 |         uses: actions/upload-artifact@v4
209 |         with:
210 |           name: artifacts-build-global
211 |           path: |
212 |             ${{ steps.cargo-dist.outputs.paths }}
213 |             ${{ env.BUILD_MANIFEST_NAME }}
214 |   # Determines if we should publish/announce
215 |   host:
216 |     needs:
217 |       - plan
218 |       - build-local-artifacts
219 |       - build-global-artifacts
220 |     # Only run if we're "publishing", and only if plan, local and global didn't fail (skipped is fine)
221 |     if: ${{ always() && needs.plan.result == 'success' && needs.plan.outputs.publishing == 'true' && (needs.build-global-artifacts.result == 'skipped' || needs.build-global-artifacts.result == 'success') && (needs.build-local-artifacts.result == 'skipped' || needs.build-local-artifacts.result == 'success') }}
222 |     env:
223 |       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
224 |     runs-on: "ubuntu-22.04"
225 |     outputs:
226 |       val: ${{ steps.host.outputs.manifest }}
227 |     steps:
228 |       - uses: actions/checkout@v4
229 |         with:
230 |           persist-credentials: false
231 |           submodules: recursive
232 |       - name: Install cached dist
233 |         uses: actions/download-artifact@v4
234 |         with:
235 |           name: cargo-dist-cache
236 |           path: ~/.cargo/bin/
237 |       - run: chmod +x ~/.cargo/bin/dist
238 |       # Fetch artifacts from scratch-storage
239 |       - name: Fetch artifacts
240 |         uses: actions/download-artifact@v4
241 |         with:
242 |           pattern: artifacts-*
243 |           path: target/distrib/
244 |           merge-multiple: true
245 |       - id: host
246 |         shell: bash
247 |         run: |
248 |           dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json
249 |           echo "artifacts uploaded and released successfully"
250 |           cat dist-manifest.json
251 |           echo "manifest=$(jq -c "." dist-manifest.json)" >> "$GITHUB_OUTPUT"
252 |       - name: "Upload dist-manifest.json"
253 |         uses: actions/upload-artifact@v4
254 |         with:
255 |           # Overwrite the previous copy
256 |           name: artifacts-dist-manifest
257 |           path: dist-manifest.json
258 |       # Create a GitHub Release while uploading all files to it
259 |       - name: "Download GitHub Artifacts"
260 |         uses: actions/download-artifact@v4
261 |         with:
262 |           pattern: artifacts-*
263 |           path: artifacts
264 |           merge-multiple: true
265 |       - name: Cleanup
266 |         run: |
267 |           # Remove the granular manifests
268 |           rm -f artifacts/*-dist-manifest.json
269 |       - name: Create GitHub Release
270 |         env:
271 |           PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}"
272 |           ANNOUNCEMENT_TITLE: "${{ fromJson(steps.host.outputs.manifest).announcement_title }}"
273 |           ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}"
274 |           RELEASE_COMMIT: "${{ github.sha }}"
275 |         run: |
276 |           # Write and read notes from a file to avoid quoting breaking things
277 |           echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt
278 | 
279 |           gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/*
280 | 
281 |   announce:
282 |     needs:
283 |       - plan
284 |       - host
285 |     # use "always() && ..." to allow us to wait for all publish jobs while
286 |     # still allowing individual publish jobs to skip themselves (for prereleases).
287 |     # "host" however must run to completion, no skipping allowed!
288 |     if: ${{ always() && needs.host.result == 'success' }}
289 |     runs-on: "ubuntu-22.04"
290 |     env:
291 |       GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
292 |     steps:
293 |       - uses: actions/checkout@v4
294 |         with:
295 |           persist-credentials: false
296 |           submodules: recursive
--------------------------------------------------------------------------------
/src/list_manipulations.rs:
--------------------------------------------------------------------------------
1 | use crate::count_characters;
2 | use crate::edit_distance::find_edit_distance;
3 | use crate::schlinkert_pruning::get_sardinas_patterson_final_intersection;
4 | use memchr::memchr;
5 | use unicode_normalization::UnicodeNormalization;
6 | 
7 | /// Normalize the Unicode of a string.
8 | /// See the [unicode-normalization](https://docs.rs/unicode-normalization) crate for details.
9 | pub fn normalize_unicode(word: &str, nf: &str) -> Result<String, String> {
10 |     match nf.to_lowercase().as_str() {
11 |         "nfc" => Ok(word.nfc().collect()),
12 |         "nfd" => Ok(word.nfd().collect()),
13 |         "nfkc" => Ok(word.nfkc().collect()),
14 |         "nfkd" => Ok(word.nfkd().collect()),
15 |         _ => Err("Unknown Unicode Normalization Form received in arguments.\nPlease use one of the following normalization forms: nfc, nfd, nfkc, or nfkd.".to_string()),
16 |     }
17 | }
18 | 
19 | // use core::cmp::Ordering;
20 | use icu::collator::{options::*, *};
21 | // use icu::locale::locale;
22 | use icu::locale::Locale;
23 | /// Sort a Vector of words alphabetically, taking into account the locale of the words
24 | /// `.sorted()` words -> ["Zambia", "abbey", "eager", "enlever", "ezra", "zoo", "énigme"]
25 | /// sort_carefully words -> ["abbey", "eager", "énigme", "enlever", "ezra", "Zambia", "zoo"]
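/// A minimal doctest sketch of that ordering (the locale string and words
/// here are illustrative):
/// ```
/// use tidy::list_manipulations::sort_carefully;
/// use icu::locale::Locale;
/// let loc: Locale = "en-US".parse().unwrap();
/// let words = vec!["Zambia".to_string(), "énigme".to_string(), "abbey".to_string()];
/// assert_eq!(sort_carefully(words, loc), vec!["abbey", "énigme", "Zambia"]);
/// ```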
26 | pub fn sort_carefully(list: Vec<String>, loc: Locale) -> Vec<String> {
27 |     // Examples: https://github.com/unicode-org/icu4x/tree/main/components/collator#examples
28 |     // Reference: https://docs.rs/icu/latest/icu/collator/index.html
29 |     // https://docs.rs/icu/latest/icu/locale/struct.Locale.html
30 |     let mut options = CollatorOptions::default();
31 |     options.strength = Some(Strength::Secondary); // Note this is not the locally defined passphrase Strength!
32 |     let collator = Collator::try_new(loc.into(), options).unwrap();
33 | 
34 |     let mut newly_sorted_list = list;
35 |     newly_sorted_list.sort_by(|a, b| collator.compare(a, b));
36 |     newly_sorted_list
37 | }
38 | 
39 | /// Sort by word length, with longest words first. For words of equal length, sorts
40 | /// words alphabetically, respecting inputted locale.
41 | pub fn sort_by_length(list: Vec<String>, loc: Locale) -> Vec<String> {
42 |     // Set up the collator again
43 |     let mut options = CollatorOptions::default();
44 |     options.strength = Some(Strength::Secondary);
45 |     // let collator = Collator::try_new(locale, options).unwrap();
46 |     let collator = Collator::try_new(loc.into(), options).unwrap();
47 | 
48 |     let mut list = list;
49 |     // Order by count_characters(w) descending, then within that,
50 |     // alphabetically
51 |     list.sort_by(|word1, word2| {
52 |         count_characters(word2)
53 |             .cmp(&count_characters(word1))
54 |             .then_with(|| collator.compare(word1, word2))
55 |     });
56 |     list
57 | }
58 | 
59 | /// Given a String (a word), delete all integers from the word.
60 | pub fn delete_integers(mut word: String) -> String {
61 |     word.retain(|c| !c.is_numeric());
62 |     word
63 | }
64 | 
65 | /// Given a String (a word), delete all characters that are not
66 | /// alphanumeric.
67 | /// ```
68 | /// use tidy::list_manipulations::delete_nonalphanumeric;
69 | /// assert_eq!(delete_nonalphanumeric("Hello!".to_string()), "Hello");
70 | /// assert_eq!(delete_nonalphanumeric("world824...".to_string()), "world824");
71 | /// ```
72 | pub fn delete_nonalphanumeric(mut word: String) -> String {
73 |     word.retain(|c| c.is_alphanumeric());
74 |     word
75 | }
76 | 
77 | /// Delete all characters through and including the first appearance
78 | /// of character `ch` in inputted `&str` `s`. Program uses this to
79 | /// remove characters through the first tab or first space, a common task
80 | /// when dealing with diceware passphrase word lists that have dice roll
81 | /// numbers before each word. The
82 | /// [EFF long list](https://www.eff.org/files/2016/07/18/eff_large_wordlist.txt)
83 | /// is one such example.
84 | 
85 | /// Uses [memchr library](https://docs.rs/memchr/latest/memchr/)
86 | /// to find this character a bit quicker than the standard function.
87 | 
88 | /// I outlined other approaches to this function in
89 | /// [a separate repo](https://github.com/sts10/splitter/blob/main/src/lib.rs).
90 | pub fn delete_before_first_char(s: &str, ch: char) -> &str {
91 |     match memchr(ch as u8, s.as_bytes()) {
92 |         None => s, // not found => return the whole string
93 |         Some(pos) => &s[pos + 1..],
94 |     }
95 | }
96 | 
97 | /// Delete all characters after and including the first appearance
98 | /// of character `ch` in inputted `&str` `s`.
99 | 
100 | /// Uses [memchr library](https://docs.rs/memchr/latest/memchr/)
101 | /// to find this character a bit quicker than the standard function.
102 | 
103 | /// I outlined other approaches to this function in
104 | /// [a separate repo](https://github.com/sts10/splitter/blob/main/src/lib.rs).
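/// A small doctest sketch (the example strings are illustrative):
/// ```
/// use tidy::list_manipulations::delete_after_first_char;
/// assert_eq!(delete_after_first_char("sun,6112", ','), "sun");
/// assert_eq!(delete_after_first_char("no-comma-here", ','), "no-comma-here");
/// ```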
105 | pub fn delete_after_first_char(s: &str, ch: char) -> &str {
106 |     match memchr(ch as u8, s.as_bytes()) {
107 |         None => s, // delimiting character not found in string s, so return the whole string
108 |         Some(pos) => &s[0..pos],
109 |     }
110 | }
111 | 
112 | use std::collections::HashMap;
113 | /// This function removes words from the given word list
114 | /// such that the resulting, outputted list has a guaranteed
115 | /// maximum prefix length.
116 | ///
117 | /// As an example, if `maximum_shared_prefix_length == 4`, this
118 | /// means that on the resulting list, knowing the first 4 characters
119 | /// of any word on the generated list is sufficient to know which
120 | /// word it is. In this case, we'd know that if a word starts with
121 | /// "radi", we know it must be the word "radius" (if "radical" had been
122 | /// on the list, this function would have removed it).
123 | ///
124 | /// This is useful if you intend the list to be used by software that
125 | /// uses auto-complete. In the case described above, a user will only have to type the
126 | /// first 4 characters of any word before a program could successfully
127 | /// auto-complete the entire word.
128 | pub fn guarantee_maximum_prefix_length(
129 |     list: &[String],
130 |     maximum_shared_prefix_length: usize,
131 | ) -> Vec<String> {
132 |     let mut prefix_hashmap: HashMap<String, String> = HashMap::new();
133 |     for this_word in list {
134 |         // If this word is too short just skip it.
135 |         if count_characters(this_word) < maximum_shared_prefix_length {
136 |             continue;
137 |         }
138 |         prefix_hashmap
139 |             .entry(get_prefix(this_word, maximum_shared_prefix_length))
140 |             .and_modify(|existing_word| {
141 |                 // Prefer shorter words, as a stand-in for simplicity (though that
142 |                 // is debatable...)
143 |                 if count_characters(this_word) < count_characters(existing_word) {
144 |                     *existing_word = this_word.to_string()
145 |                 }
146 |             })
147 |             .or_insert_with(|| this_word.to_string());
148 |     }
149 |     let new_vec: Vec<(&String, &String)> = prefix_hashmap.iter().collect();
150 |     let mut new_word_list = vec![];
151 |     for t in new_vec {
152 |         new_word_list.push(t.1.to_string());
153 |     }
154 |     new_word_list
155 | }
156 | 
157 | /// Executes the Schlinkert prune. Attempts to make a list uniquely decodable
158 | /// by removing the fewest number of code words possible. Adapted from the
159 | /// Sardinas-Patterson algorithm.
160 | /// Runs the word list both as given and with each word reversed, preferring
161 | /// whichever preserves more words from the given list.
162 | pub fn schlinkert_prune(list: &[String]) -> Vec<String> {
163 |     // Clumsily clone the list into a new variable.
164 |     let mut new_list = list.to_owned();
165 |     // First, simply find the "offenders" with the list as given.
166 |     let offenders_to_remove_forwards = get_sardinas_patterson_final_intersection(list);
167 |     // Now, reverse all words before running the Schlinkert prune.
168 |     // This will give a different list of offending words -- and potentially FEWER
169 |     // than running the prune forwards. (We call the reverse_all_words function
170 |     // twice because we have to un-reverse all the offending words at the end.)
171 |     let offenders_to_remove_backwards = reverse_all_words(
172 |         &get_sardinas_patterson_final_intersection(&reverse_all_words(list)),
173 |     );
174 |     // If running the prune on the reversed words yielded fewer offenders,
175 |     // we'll remove those offending words, since our goal is to remove
176 |     // as few words as possible.
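    // (On a tie, the forward-run offenders are preferred -- hence the `<=` below.)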
177 |     if offenders_to_remove_forwards.len() <= offenders_to_remove_backwards.len() {
178 |         new_list.retain(|x| !offenders_to_remove_forwards.contains(x));
179 |     } else {
180 |         new_list.retain(|x| !offenders_to_remove_backwards.contains(x));
181 |     }
182 |     new_list
183 | }
184 | 
185 | /// Reverse all words on a given list. For example,
186 | /// `["hotdog", "hamburger", "alligator"]` becomes
187 | /// `["godtoh", "regrubmah", "rotagilla"]`
188 | /// Uses graphemes to ensure it handles accented characters correctly.
189 | pub fn reverse_all_words(list: &[String]) -> Vec<String> {
190 |     let mut reversed_list = vec![];
191 |     for word in list {
192 |         reversed_list.push(word.graphemes(true).rev().collect::<String>());
193 |     }
194 |     reversed_list
195 | }
196 | 
197 | use unicode_segmentation::UnicodeSegmentation;
198 | /// Given a word and a `usize` of `length`, this function returns
199 | /// the first `length` characters of that word. This length is
200 | /// measured in grapheme clusters to better handle accented
201 | /// characters and emoji.
202 | /// ```
203 | /// use tidy::list_manipulations::get_prefix;
204 | /// assert_eq!(get_prefix("hello world", 4), "hell");
205 | /// assert_eq!(get_prefix("sécréter", 5), "sécré");
206 | /// assert_eq!(get_prefix("😀😃😄😁😆", 2), "😀😃");
207 | /// ```
208 | pub fn get_prefix(word: &str, length: usize) -> String {
209 |     word.graphemes(true).take(length).collect::<String>()
210 | }
211 | 
212 | /// Helper function to determine if a given char as `u16` is a
213 | /// Latin letter (A through Z or a through z, no diacritics).
214 | /// ```
215 | /// use tidy::list_manipulations::is_latin_alphabetic;
216 | /// assert_eq!(is_latin_alphabetic('h' as u16), true);
217 | /// assert_eq!(is_latin_alphabetic('A' as u16), true);
218 | /// assert_eq!(is_latin_alphabetic('1' as u16), false);
219 | /// assert_eq!(is_latin_alphabetic(',' as u16), false);
220 | /// assert_eq!(is_latin_alphabetic('é' as u16), false);
221 | /// assert_eq!(is_latin_alphabetic('ő' as u16), false);
222 | /// ```
223 | pub fn is_latin_alphabetic(chr: u16) -> bool {
224 |     (chr >= 65 && chr <= 90) || (chr >= 97 && chr <= 122)
225 | }
226 | 
227 | /// Replaces curly or smart quotes with straight quotes.
228 | pub fn straighten_quotes(input: &str) -> String {
229 |     let mut result = String::new();
230 |     for c in input.chars() {
231 |         match c {
232 |             '\u{201C}' => result.push('\"'), // left double quotation mark
233 |             '\u{201D}' => result.push('\"'), // right double quotation mark
234 |             '\u{2018}' => result.push('\''), // left single quotation mark
235 |             '\u{2019}' => result.push('\''), // right single quotation mark
236 |             _ => result.push(c),
237 |         }
238 |     }
239 |     result
240 | }
241 | 
242 | use itertools::Itertools;
243 | /// De-duplicates a Vector of `String`s while maintaining list order.
244 | pub fn dedup_without_sorting(list: &mut [String]) -> Vec<String> {
245 |     let dedup: Vec<String> = list.iter().unique().map(|s| s.to_string()).collect();
246 |     dedup
247 | }
248 | 
249 | /// Remove prefix words from the given Vector of `String`s.
250 | ///
251 | /// A brief example: If both "news" and "newspaper" are on the inputted list,
252 | /// we may, for security reasons, want to remove the prefix word,
253 | /// which is "news" in this case.
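/// A doctest sketch of the "news"/"newspaper" example above:
/// ```
/// use tidy::list_manipulations::remove_prefix_words;
/// let list = vec!["news".to_string(), "newspaper".to_string(), "sun".to_string()];
/// assert_eq!(remove_prefix_words(list), vec!["newspaper", "sun"]);
/// ```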
254 | pub fn remove_prefix_words(list: Vec<String>) -> Vec<String> {
255 |     let mut list_without_prefix_words = list.to_vec();
256 |     list_without_prefix_words.retain(|potential_prefix_word| {
257 |         for word in &list {
258 |             if word.starts_with(potential_prefix_word) && word != potential_prefix_word {
259 |                 // This is a prefix word, so we do NOT want to retain it. return false to the
260 |                 // retain
261 |                 return false;
262 |             } else {
263 |                 // This particular word is not a prefix word of this potential_prefix_word.
264 |                 // keep looping
265 |                 continue;
266 |             };
267 |         }
268 |         // If we've made it here, we can be sure that potential_prefix_word is NOT a
269 |         // prefix word. So we want to retain it for the list_without_prefix_words.
270 |         // To do this, we return true to the retain.
271 |         true
272 |     });
273 |     list_without_prefix_words
274 | }
275 | 
276 | /// Remove suffix words from the given Vector of `String`s.
277 | ///
278 | /// A brief example: If both "paper" and "newspaper" are on the inputted list,
279 | /// we may, for security reasons, want to remove the suffix word,
280 | /// which is "paper" in this case.
281 | pub fn remove_suffix_words(list: Vec<String>) -> Vec<String> {
282 |     let mut list_without_suffix_words = list.to_vec();
283 |     list_without_suffix_words.retain(|potential_suffix_word| {
284 |         for word in &list {
285 |             if word.ends_with(potential_suffix_word) && word != potential_suffix_word {
286 |                 // This is a suffix word, so we do NOT want to retain it. return false to the
287 |                 // retain
288 |                 return false;
289 |             } else {
290 |                 // This particular word is not a suffix word of this potential_suffix_word.
291 |                 // keep looping
292 |                 continue;
293 |             };
294 |         }
295 |         // If we've made it here, we can be sure that potential_suffix_word is NOT a
296 |         // suffix word. So we want to retain it for the list_without_suffix_words.
297 |         // To do this, we return true to the retain.
298 |         true
299 |     });
300 |     list_without_suffix_words
301 | }
302 | 
303 | /// Only retain words that are the given `minimum_edit_distance` away from all
304 | /// other words on the list.
305 | ///
306 | /// Calculates edit distance using a function in the edit_distance module.
307 | pub fn enfore_minimum_edit_distance(
308 |     list: Vec<String>,
309 |     minimum_edit_distance: usize,
310 | ) -> Vec<String> {
311 |     let minimum_edit_distance: u32 = minimum_edit_distance.try_into().unwrap();
312 |     let mut list_to_read = list.to_vec();
313 |     // Sort short words first to prefer them
314 |     list_to_read.sort_by_key(|a| count_characters(a));
315 | 
316 |     let mut new_list = list.to_vec();
317 |     new_list.retain(|potential_too_close_word| {
318 |         for word in &list_to_read {
319 |             // Skip if we're looking at the same word
320 |             if word == potential_too_close_word {
321 |                 continue;
322 |             }
323 |             if find_edit_distance(word, potential_too_close_word) < minimum_edit_distance {
324 |                 // This potential_too_close_word is too close to another word on the list,
325 |                 // so we do NOT want to retain it.
326 |                 // return false to the retain
327 |                 return false;
328 |             } else {
329 |                 // This particular word is not too close to this potential_too_close_word.
330 |                 // keep looping
331 |                 continue;
332 |             };
333 |         }
334 |         // If we've made it here, we can be sure that potential_too_close_word is NOT too
335 |         // close to another word. So we want to retain it for the new_list.
336 |         // To do this, we return true to the retain.
337 |         true
338 |     });
339 |     new_list
340 | }
341 | 
342 | /// Takes the inputted word list and a Vector of tuples of Strings,
343 | /// each representing a pair of homophones, e.g. `("there", "their")`.
344 | /// The function outputs a new list in which, if both homophones
345 | /// are detected, the second homophone is removed.
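/// A doctest sketch, using the ("sun", "son") pair mentioned in the CLI help:
/// ```
/// use tidy::list_manipulations::remove_homophones;
/// let list = vec!["sun".to_string(), "son".to_string(), "moon".to_string()];
/// let homophones = vec![("sun".to_string(), "son".to_string())];
/// assert_eq!(remove_homophones(list, homophones), vec!["sun", "moon"]);
/// ```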
346 | pub fn remove_homophones(list: Vec<String>, homophones: Vec<(String, String)>) -> Vec<String> {
347 |     let mut words_to_remove = vec![];
348 |     for pair_of_homophones in homophones {
349 |         if list.contains(&pair_of_homophones.0)
350 |             && list.contains(&pair_of_homophones.1)
351 |             && !(words_to_remove.contains(&pair_of_homophones.0)
352 |                 || words_to_remove.contains(&pair_of_homophones.1))
353 |         {
354 |             words_to_remove.push(pair_of_homophones.1);
355 |         }
356 |     }
357 |     let mut new_list = list.to_vec();
358 |     new_list.retain(|w| !words_to_remove.contains(w));
359 |     new_list
360 | }
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | // use core::cmp::Ordering;
2 | use icu::locale::Locale;
3 | // use icu::locale::locale;
4 | use rand::prelude::SliceRandom;
5 | use rand::rng;
6 | pub mod cards;
7 | pub mod dice;
8 | pub mod display_information;
9 | pub mod edit_distance;
10 | pub mod file_readers;
11 | pub mod file_writer;
12 | pub mod list_manipulations;
13 | pub mod parsers;
14 | pub mod schlinkert_pruning;
15 | use crate::list_manipulations::*;
16 | 
17 | #[derive(Default, Debug, Clone)]
18 | pub struct TidyRequest {
19 |     pub list: Vec<String>,
20 |     pub take_first: Option<usize>,
21 |     pub take_rand: Option<usize>,
22 |     pub sort_alphabetically: bool,
23 |     pub sort_by_length: bool,
24 |     pub ignore_after_delimiter: Option<char>,
25 |     pub ignore_before_delimiter: Option<char>,
26 |     pub normalization_form: Option<String>,
27 |     pub locale: String, // defaults to en-US
28 |     pub to_lowercase: bool,
29 |     pub should_straighten_quotes: bool,
30 |     pub should_remove_prefix_words: bool,
31 |     pub should_remove_suffix_words: bool,
32 |     pub should_schlinkert_prune: bool,
33 |     pub should_remove_nonalphanumeric: bool,
34 |     pub should_delete_nonalphanumeric: bool,
35 |     pub should_remove_nonalphabetic: bool,
36 |     pub should_remove_non_latin_alphabetic: bool,
37 |     pub should_remove_nonascii: bool,
38 |     pub should_remove_integers: bool,
39 |     pub should_delete_integers: bool,
40 |     pub should_delete_after_first_delimiter: Option<char>,
41 |     pub should_delete_before_first_delimiter: Option<char>,
42 |     pub reject_list: Option<Vec<String>>,
43 |     pub approved_list: Option<Vec<String>>,
44 |     pub homophones_list: Option<Vec<(String, String)>>,
45 |     pub minimum_length: Option<usize>,
46 |     pub maximum_length: Option<usize>,
47 |     pub maximum_shared_prefix_length: Option<usize>,
48 |     pub minimum_edit_distance: Option<usize>,
49 |     pub print_rand: Option<usize>,
50 |     pub print_first: Option<usize>,
51 | }
52 | 
53 | #[derive(PartialEq)]
54 | enum MetadataPosition {
55 |     Start,
56 |     End,
57 | }
58 | 
59 | /// Simple helper function that splits a `str` by a given substring `str`,
60 | /// then returns a Vector of `str`s.
61 | /// ```
62 | /// use tidy::split_and_vectorize;
63 | /// assert_eq!(split_and_vectorize("a:b:c",":"), vec!["a","b","c"]);
64 | /// ```
65 | /// I find this a handy general helper function.
66 | pub fn split_and_vectorize<'a>(string_to_split: &'a str, splitter: &str) -> Vec<&'a str> {
67 |     string_to_split.split(splitter).collect()
68 | }
69 | 
70 | /// This is the large, key function of the program. It takes
71 | /// a `TidyRequest` object -- which includes the word list --
72 | /// and performs whatever functions the user has requested to
73 | /// perform on the list.
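/// A minimal doctest sketch (the list values are illustrative; every other
/// option is left at its `Default` value):
/// ```
/// use tidy::{tidy_list, TidyRequest};
/// let req = TidyRequest {
///     list: vec!["  Apple ".to_string(), "apple".to_string(), "banana".to_string()],
///     to_lowercase: true,
///     sort_alphabetically: true,
///     locale: "en-US".to_string(),
///     ..TidyRequest::default()
/// };
/// assert_eq!(tidy_list(req), vec!["apple", "banana"]);
/// ```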
74 | pub fn tidy_list(req: TidyRequest) -> Vec<String> {
75 |     // First, we need to do the two truncations
76 |     let mut list_to_tidy = req.list.clone();
77 |     list_to_tidy = match req.take_first {
78 |         Some(amount_to_take) => {
79 |             list_to_tidy.truncate(amount_to_take);
80 |             list_to_tidy
81 |         }
82 |         None => list_to_tidy,
83 |     };
84 |     list_to_tidy = match req.take_rand {
85 |         Some(amount_to_take) => {
86 |             let mut rng = rng();
87 |             list_to_tidy.shuffle(&mut rng);
88 |             list_to_tidy.truncate(amount_to_take);
89 |             list_to_tidy
90 |         }
91 |         None => list_to_tidy,
92 |     };
93 |     let mut tidied_list = vec![];
94 |     // Now we go word-by-word
95 |     for word in &list_to_tidy {
96 |         // METADATA-IGNORING WORD REMOVALS
97 |         // If user chose to ignore metadata, split the line into the word and the metadata
98 |         // based on given delimiter. Note that metadata may come before or after the word.
99 |         // We'll then do removal operations on the "word", ignoring metadata.
100 |         // Later, we'll re-add the metadata to the word.
101 | 
102 |         // We need delimiter to have a broad scope so that we can use it
103 |         // when we re-add the metadata at the end. Default to comma, but can be changed
104 |         // in match statement here.
105 |         let (mut new_word, delimiter, metadata, metadata_position) =
106 |             match (req.ignore_after_delimiter, req.ignore_before_delimiter) {
107 |                 (Some(delimiter), None) => {
108 |                     // Parse delimiter. Currently this converts 's' to ' '
109 |                     // and 't' to '\t'.
110 |                     let delimiter = parse_delimiter(delimiter).unwrap();
111 |                     let split_vec = split_and_vectorize(word, &delimiter.to_string());
112 |                     if split_vec.len() == 1 {
113 |                         eprintln!("No metadata found for word: {:?}", word);
114 |                         (word.to_string(), Some(delimiter), None, None)
115 |                     } else {
116 |                         (
117 |                             split_vec[0].to_string(),
118 |                             Some(delimiter),
119 |                             Some(split_vec[1]),
120 |                             Some(MetadataPosition::End),
121 |                         )
122 |                     }
123 |                 }
124 |                 (None, Some(delimiter)) => {
125 |                     let delimiter = parse_delimiter(delimiter).unwrap();
126 |                     let split_vec = split_and_vectorize(word, &delimiter.to_string());
127 |                     if split_vec.len() == 1 {
128 |                         eprintln!("No metadata found for word: {:?}", word);
129 |                         (word.to_string(), Some(delimiter), None, None)
130 |                     } else {
131 |                         (
132 |                             split_vec[1].to_string(),
133 |                             Some(delimiter),
134 |                             Some(split_vec[0]),
135 |                             Some(MetadataPosition::Start),
136 |                         )
137 |                     }
138 |                 }
139 |                 (Some(ref _delimiter1), Some(ref _delimiter2)) => {
140 |                     // This situation should be caught and handled better
141 |                     // in src/main.rs, so this is really just in case.
142 |                     panic!("Can't ignore metadata on both sides currently")
143 |                 }
144 |                 (None, None) => (word.to_string(), None, None, None),
145 |             };
146 | 
147 |         // Trim the new word, then normalize its Unicode if the user gave a
148 |         // normalization form to use
149 |         new_word = match &req.normalization_form {
150 |             Some(nf) => match normalize_unicode(new_word.trim(), nf) {
151 |                 Ok(word) => word,
152 |                 Err(e) => panic!("{}", e),
153 |             },
154 |             None => {
155 |                 // still need to trim
156 |                 new_word.trim().to_string()
157 |             }
158 |         };
159 | 
160 |         // WORD MODIFICATIONS
161 |         // For logic reasons, it's crucial that Tidy perform these word
162 |         // modifications BEFORE it runs word removals.
163 |         // If the user has chosen to Ignore Metadata, we're guaranteed
164 |         // that all of these will be None, so we don't have to worry
165 |         // about metadata loss due to de-duplication caused by word modification.
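        // First come the delimiter-based deletions (the -D and -d options),
        // then integer and non-alphanumeric deletions, lowercasing, and
        // quote-straightening.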
166 |         new_word = match req.should_delete_before_first_delimiter {
167 |             Some(delimiter) => {
168 |                 delete_before_first_char(&new_word, parse_delimiter(delimiter).unwrap()).to_string()
169 |             }
170 |             None => new_word,
171 |         };
172 |         new_word = match req.should_delete_after_first_delimiter {
173 |             Some(delimiter) => {
174 |                 delete_after_first_char(&new_word, parse_delimiter(delimiter).unwrap()).to_string()
175 |             }
176 |             None => new_word,
177 |         };
178 |         if req.should_delete_integers && new_word.chars().any(|c| c.is_numeric()) {
179 |             new_word = delete_integers(new_word.to_string());
180 |         }
181 |         if req.should_delete_nonalphanumeric && new_word.chars().any(|c| c.is_alphanumeric()) {
182 |             new_word = delete_nonalphanumeric(new_word.to_string());
183 |         }
184 |         if req.to_lowercase {
185 |             new_word = new_word.to_ascii_lowercase();
186 |         }
187 |         if req.should_straighten_quotes {
188 |             new_word = straighten_quotes(&new_word).to_string();
189 |         }
190 | 
191 |         new_word = new_word.trim().to_string();
192 | 
193 |         // WORD REMOVALS
194 |         // Now that the words have been modified, we can move on to
195 |         // word removals.
196 |         // IF user has chosen to ignore any metadata, these should be the
197 |         // first edits that we do.
198 |         if req.should_remove_nonascii {
199 |             // https://doc.rust-lang.org/std/primitive.str.html#method.is_ascii
200 |             if !new_word.is_ascii() {
201 |                 // If we're here, that means we already know that we
202 |                 // do NOT want to add this word to our outputted list.
203 |                 // So we can just skip to the next word in our loop.
204 |                 continue;
205 |             }
206 |         }
207 |         // For these other checks, we have to iterate through every individual
208 |         // character (char) of each word
209 |         if req.should_remove_nonalphanumeric && new_word.chars().any(|c| !c.is_alphanumeric()) {
210 |             continue;
211 |         }
212 |         if req.should_remove_nonalphabetic && new_word.chars().any(|c| !c.is_alphabetic()) {
213 |             continue;
214 |         }
215 |         if req.should_remove_non_latin_alphabetic
216 |             && new_word.chars().any(|chr| !is_latin_alphabetic(chr as u16))
217 |         {
218 |             continue;
219 |         }
220 |         if req.should_remove_integers && new_word.chars().any(|c| c.is_numeric()) {
221 |             continue;
222 |         }
223 |         if let Some(ref reject_list) = req.reject_list {
224 |             if reject_list.contains(&new_word) {
225 |                 continue;
226 |             }
227 |         }
228 | 
229 |         if let Some(ref approved_list) = req.approved_list {
230 |             if !approved_list.contains(&new_word) {
231 |                 continue;
232 |             }
233 |         };
234 | 
235 |         if let Some(minimum_length) = req.minimum_length {
236 |             if count_characters(&new_word) < minimum_length {
237 |                 continue;
238 |             }
239 |         };
240 | 
241 |         if let Some(maximum_length) = req.maximum_length {
242 |             if count_characters(&new_word) > maximum_length {
243 |                 continue;
244 |             }
245 |         };
246 | 
247 |         // trim whitespace
248 |         new_word = new_word.trim().to_string();
249 | 
250 |         // If there was metadata, re-add it to the word now.
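        // Only re-attach metadata if the word survived the removals above.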
251 |         if !new_word.is_empty() {
252 |             if let Some(metadata) = metadata {
253 |                 if metadata_position == Some(MetadataPosition::End) {
254 |                     new_word = new_word + &delimiter.unwrap().to_string() + metadata;
255 |                 } else if metadata_position == Some(MetadataPosition::Start) {
256 |                     new_word = metadata.to_owned() + &delimiter.unwrap().to_string() + &new_word;
257 |                 }
258 |             };
259 |         }
260 | 
261 |         // trim whitespace again
262 |         new_word = new_word.trim().to_string();
263 |         // The trim calls could have made new_word empty,
264 |         // so we need to check again
265 |         if !new_word.is_empty() {
266 |             tidied_list.push(new_word);
267 |         }
268 |     }
269 |     // Now truncate the list, if requested.
270 |     // Some operations are just a bit too complex for
271 |     // me to figure out how to do on a per-word basis
272 |     // at this time. Maybe something to revisit in the
273 |     // future
274 |     tidied_list = match req.homophones_list {
275 |         Some(homophones_list) => remove_homophones(tidied_list, homophones_list),
276 |         None => tidied_list,
277 |     };
278 |     // I think this is a good order for these next few operations,
279 |     // but I'm not super confident
280 |     tidied_list = match req.maximum_shared_prefix_length {
281 |         Some(maximum_shared_prefix_length) => {
282 |             guarantee_maximum_prefix_length(&tidied_list, maximum_shared_prefix_length)
283 |         }
284 |         None => tidied_list,
285 |     };
286 |     tidied_list = match req.minimum_edit_distance {
287 |         Some(minimum_edit_distance) => {
288 |             enfore_minimum_edit_distance(tidied_list, minimum_edit_distance)
289 |         }
290 |         None => tidied_list,
291 |     };
292 |     tidied_list = if req.should_remove_suffix_words {
293 |         remove_suffix_words(dedup_without_sorting(&mut tidied_list))
294 |     } else {
295 |         tidied_list
296 |     };
297 |     tidied_list = if req.should_remove_prefix_words {
298 |         remove_prefix_words(dedup_without_sorting(&mut tidied_list))
299 |     } else {
300 |         tidied_list
301 |     };
302 |     tidied_list = if req.should_schlinkert_prune {
303 |         schlinkert_prune(&dedup_without_sorting(&mut tidied_list))
304 |     } else {
305 |         tidied_list
306 |     };
307 | 
308 |     // Remove duplicate words
309 |     tidied_list = dedup_without_sorting(&mut tidied_list);
310 | 
311 |     // User can choose to print a limited number of words from the nearly finished (but still
312 |     // unsorted) list.
313 |     // Can do so from the beginning of the nearly finished list...
314 |     tidied_list = match req.print_first {
315 |         Some(amount_to_cut) => {
316 |             tidied_list.truncate(amount_to_cut);
317 |             tidied_list
318 |         }
319 |         None => tidied_list,
320 |     };
321 |     // And/or can do so randomly
322 |     tidied_list = match req.print_rand {
323 |         Some(amount_to_cut) => {
324 |             let mut rng = rng();
325 |             tidied_list.shuffle(&mut rng);
326 |             tidied_list.truncate(amount_to_cut);
327 |             tidied_list
328 |         }
329 |         None => tidied_list,
330 |     };
331 |     // Finally, sort the list alphabetically, if the user didn't override this default behavior
332 |     if req.sort_alphabetically {
333 |         // We used to just be content to run tidied_list.sort() here,
334 |         // but that doesn't support non-English languages and
335 |         // accented characters very well.
336 | 
337 |         // First, parse the given locale into a valid Locale
338 |         let loc = req.locale.to_string();
339 |         let loc: Locale = loc.parse().expect("Error: Given locale is not parse-able. Try using a form like 'en-US'; do not use underscores.");
340 | 
341 |         // Now use that Locale to sort the list more carefully
342 |         tidied_list = sort_carefully(tidied_list, loc);
343 |     }
344 |     if req.sort_by_length {
345 |         // First, parse the given locale into a valid Locale
346 |         let loc = req.locale.to_string();
347 |         let loc: Locale = loc.parse().expect("Error: Given locale is not parse-able. Try using a form like 'en-US'; do not use underscores.");
348 | 
349 |         eprintln!("Calling sort_by_length");
350 |         tidied_list = sort_by_length(tidied_list, loc);
351 |     }
352 |     // And remove duplicates one more time
353 |     tidied_list = dedup_without_sorting(&mut tidied_list);
354 |     tidied_list
355 | }
356 | 
357 | use unicode_segmentation::UnicodeSegmentation;
358 | /// When counting the characters of a word, we want to count each accented character as 1,
359 | /// regardless of the Unicode, to better approximate how humans would count the number
360 | /// of characters in a word.
361 | /// An alternate approach would be to convert each character to NFC before counting (`word.nfc().count()`),
362 | /// but I don't think this handles emoji as well as grapheme cluster counting does.
363 | pub fn count_characters(word: &str) -> usize {
364 |     word.graphemes(true).count()
365 | }
366 | 
367 | /// Little helper function that allows users to write out the whitespace
368 | /// delimiters "s" and "t", rather than having to enter the whitespace
369 | /// characters literally.
370 | pub fn parse_delimiter(delimiter: char) -> Option<char> {
371 |     if delimiter == 's' {
372 |         Some(' ')
373 |     } else if delimiter == 't' {
374 |         Some('\t')
375 |     } else {
376 |         Some(delimiter)
377 |     }
378 | }
379 | 
380 | /// Used for the to_whittle option
381 | pub fn get_new_starting_point_guess(
382 |     previous_starting_point: usize,
383 |     this_list_length: usize,
384 |     length_to_whittle_to: usize,
385 | ) -> usize {
386 |     let mut starting_point = previous_starting_point;
387 |     if this_list_length > length_to_whittle_to {
388 |         // We're too high!
389 |         let difference = this_list_length - length_to_whittle_to;
390 |         let multiplier = starting_point as f64 / length_to_whittle_to as f64;
391 |         let mut change = (difference as f64 * multiplier).floor() as usize;
392 |         // Edge case we need to catch to avoid an infinite loop
393 |         if change == 0 {
394 |             change = 1;
395 |         }
396 |         starting_point -= change;
397 |     } else {
398 |         // We're too low!
399 |         let difference = length_to_whittle_to - this_list_length;
400 |         let multiplier = starting_point as f64 / length_to_whittle_to as f64;
401 |         let mut change = (difference as f64 * multiplier).floor() as usize;
402 |         // Edge case we need to catch to avoid an infinite loop
403 |         if change == 0 {
404 |             change = 1;
405 |         }
406 |         starting_point += change;
407 |     }
408 |     starting_point
409 | }
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | use clap::Parser;
2 | use std::env;
3 | use std::path::Path;
4 | use std::path::PathBuf;
5 | use tidy::*;
6 | pub mod display_information;
7 | pub mod input_validations;
8 | use crate::file_readers::*;
9 | use crate::file_writer::*;
10 | use crate::input_validations::*;
11 | use crate::parsers::*;
12 | 
13 | /// Combine and clean word lists
14 | #[derive(Parser, Debug)]
15 | #[clap(version, about, name = "tidy")]
16 | struct Args {
17 |     /// Path(s) for optional list of approved words. Can accept multiple
18 |     /// files.
19 |     #[clap(short = 'a', long = "approve")]
20 |     approved_list: Option<Vec<PathBuf>>,
21 | 
22 |     /// Print attributes about new list to terminal. Can be used more than once
23 |     /// to print more attributes. Some attributes may take a nontrivial amount
24 |     /// of time to calculate.
25 |     #[clap(short = 'A', long = "attributes", action = clap::ArgAction::Count)]
26 |     attributes: u8,
27 | 
28 |     /// Print attributes and word samples in JSON format
29 |     #[clap(short = 'j', long = "json")]
30 |     attributes_as_json: bool,
31 | 
32 |     /// Print playing card abbreviation next to each word.
33 |     /// Strongly recommend only using on lists with lengths that are powers
34 |     /// of 26 (26^1, 26^2, 26^3, etc.)
35 |     #[clap(long = "cards")]
36 |     cards: bool,
37 | 
38 |     /// Debug mode
39 |     #[clap(long = "debug")]
40 |     debug: bool,
41 | 
42 |     /// Delete all characters after the first instance of the specified delimiter until the end of line
43 |     /// (including the delimiter). Delimiter must be a single character (e.g., ','). Use 't' for tab and
44 |     /// 's' for space. May not be used together with -g or -G options.
45 |     #[clap(short = 'd', long = "delete-after")]
46 |     delete_after_delimiter: Option<char>,
47 | 
48 |     /// Delete all characters before and including the first instance of the specified delimiter. Delimiter
49 |     /// must be a single character (e.g., ','). Use 't' for tab and 's' for space. May not be used
50 |     /// together with -g or -G options.
51 |     #[clap(short = 'D', long = "delete-before")]
52 |     delete_before_delimiter: Option<char>,
53 | 
54 |     /// Delete all integers from all words on new list
55 |     #[clap(short = 'i', long = "delete-integers")]
56 |     delete_integers: bool,
57 | 
58 |     /// Delete all non-alphanumeric characters from all words on new list. Characters with diacritics
59 |     /// will remain
60 |     #[clap(short = 'n', long = "delete-nonalphanumeric")]
61 |     delete_nonalphanumeric: bool,
62 | 
63 |     /// Print dice roll before word in output. Set number of sides
64 |     /// of dice. Must be between 2 and 36. Use 6 for normal dice.
65 |     #[clap(long = "dice")]
66 |     dice_sides: Option<u8>,
67 | 
68 |     /// Dry run. Don't write new list to file or terminal.
69 |     #[clap(long = "dry-run")]
70 |     dry_run: bool,
71 | 
72 |     /// Force overwrite of output file if it exists.
73 |     #[clap(short = 'f', long = "force")]
74 |     force_overwrite: bool,
75 | 
76 |     /// Path(s) to file(s) containing homophone pairs. There must be one pair
77 |     /// of homophones per line, separated by a comma (sun,son). If BOTH words
78 |     /// are found on a list, the SECOND word is removed. File(s) can be a CSV
79 |     /// (with no column headers) or TXT file(s).
80 |     #[clap(long = "homophones")]
81 |     homophones_list: Option<Vec<PathBuf>>,
82 | 
83 |     /// Ignore characters after the first instance of the specified delimiter until the end of line, treating
84 |     /// anything before the delimiter as a word. Delimiter must be a single character (e.g., ','). Use 't'
85 |     /// for tab and 's' for space. Helpful for ignoring metadata like word frequencies.
86 |     /// Works with attribute analysis and most word removal options, but not with word modifications
87 |     /// (like to lowercase). May not be used together with -d, -D or -G options.
88 |     #[clap(short = 'g', long = "ignore-after")]
89 |     ignore_after_delimiter: Option<char>,
90 | 
91 |     /// Ignore characters before and including the first instance of the specified delimiter, treating
92 |     /// anything after the delimiter as a word. Delimiter must be a single character (e.g., ','). Use 't'
93 |     /// for tab and 's' for space. Helpful for ignoring metadata like word frequencies.
Helpful for ignoring metadata like word frequencies. 94 | /// Works with attribute analysis and most word removal options, but not with word modifications 95 | /// (like to lowercase). May not be used together with -d, -D or -g options. 96 | #[clap(short = 'G', long = "ignore-before")] 97 | ignore_before_delimiter: Option<char>, 98 | 99 | /// Specify a locale for words on the list. Aids with sorting. Examples: en-US, es-ES. Defaults 100 | /// to system LANG. If LANG environment variable is not set, uses en-US. 101 | #[clap(long = "locale")] 102 | locale: Option<String>, 103 | 104 | /// Lowercase all words on new list 105 | #[clap(short = 'l', long = "lowercase")] 106 | to_lowercase: bool, 107 | 108 | /// Set maximum word length 109 | #[clap(short = 'M', long = "maximum-word-length")] 110 | maximum_length: Option<usize>, 111 | 112 | /// Set number of leading characters to get to a unique prefix, 113 | /// which can aid auto-complete functionality. 114 | /// Setting this value to, say, 4 means that knowing the first 115 | /// 4 characters of any word on the generated list is enough 116 | /// to know which word it is. 117 | #[clap(short = 'x', long = "shared-prefix-length")] 118 | maximum_shared_prefix_length: Option<usize>, 119 | 120 | /// Set minimum edit distance between words, which 121 | /// can reduce the cost of typos when entering words 122 | #[clap(short = 'e', long = "minimum-edit-distance")] 123 | minimum_edit_distance: Option<usize>, 124 | 125 | /// Set minimum word length 126 | #[clap(short = 'm', long = "minimum-word-length")] 127 | minimum_length: Option<usize>, 128 | 129 | /// Sort by word length, with longest words first. First sorts words 130 | /// alphabetically, respecting inputted locale. 131 | #[clap(long = "sort-by-length", conflicts_with = "no_alpha_sort")] 132 | sort_by_length: bool, 133 | 134 | /// Do NOT sort outputted list alphabetically. Preserves original list order. 135 | /// Note that duplicate lines and blank lines will still be removed. 136 | #[clap(short = 'O', long = "no-sort", conflicts_with = "sort_by_length")] 137 | no_alpha_sort: bool, 138 | 139 | /// Normalize Unicode of all characters of all words. Accepts nfc, nfd, nfkc, or nfkd (case 140 | /// insensitive). 141 | #[clap(short = 'z', long = "normalization-form")] 142 | normalization_form: Option<String>, 143 | 144 | /// Path for outputted list file. If none given, generated word list 145 | /// will be printed to terminal. 146 | #[clap(short = 'o', long = "output")] 147 | output: Option<PathBuf>, 148 | 149 | /// When printing dice roll before word in output, print dice values 150 | /// according to the base selected through --dice option. Effectively 151 | /// this means that letters will be used to represent numbers higher 152 | /// than 9. Note that this option also 0-indexes the dice values. 153 | /// This setting defaults to `false`, which will 1-index dice values 154 | /// and use double-digit numbers when necessary (e.g. 18-03-08). 155 | #[clap(long = "sides-as-base")] 156 | print_dice_sides_as_their_base: bool, 157 | 158 | /// Just before printing generated list, cut list down 159 | /// to a set number of words. Can accept expressions in the 160 | /// form of base**exponent (helpful for generating diceware lists). 161 | /// Words are selected from the beginning of the processed list, before it is sorted alphabetically. 162 | #[clap(long = "print-first", value_parser=eval_list_length)] 163 | print_first: Option<usize>, 164 | 165 | /// Just before printing generated list, cut list down 166 | /// to a set number of words.
Can accept expressions in the 167 | /// form of base**exponent (helpful for generating diceware lists). 168 | /// Cuts are done randomly. 169 | #[clap(long = "print-rand", value_parser=eval_list_length)] 170 | print_rand: Option<usize>, 171 | 172 | /// Do not print any extra information 173 | #[clap(long = "quiet")] 174 | quiet: bool, 175 | 176 | /// Remove all words with integers in them from list 177 | #[clap(short = 'I', long = "remove-integers")] 178 | remove_integers: bool, 179 | 180 | /// Remove all words with non-alphanumeric characters from new list. Words with diacritics will 181 | /// remain 182 | #[clap(short = 'N', long = "remove-nonalphanumeric")] 183 | remove_nonalphanumeric: bool, 184 | 185 | /// Remove all words with non-alphabetic characters from new list. Words with diacritics and 186 | /// other non-Latin characters will remain. 187 | #[clap(long = "remove-nonalphabetic")] 188 | remove_nonalphabetic: bool, 189 | 190 | /// Remove all words with any characters not in the Latin alphabet (A through Z and a through 191 | /// z). All words with accented or diacritic characters will be removed, as well as 192 | /// any words with punctuation and internal whitespace. 193 | #[clap(short = 'L', long = "remove-non-latin-alphabetic")] 194 | remove_non_latin_alphabetic: bool, 195 | 196 | /// Remove all words that have any non-ASCII characters from new list 197 | #[clap(short = 'C', long = "remove-nonascii")] 198 | remove_nonascii: bool, 199 | 200 | /// Remove prefix words from new list 201 | #[clap(short = 'P', long = "remove-prefix")] 202 | remove_prefix_words: bool, 203 | 204 | /// Remove suffix words from new list 205 | #[clap(short = 'S', long = "remove-suffix")] 206 | remove_suffix_words: bool, 207 | 208 | /// Path(s) for optional list of words to reject. Can accept multiple 209 | /// files. 210 | #[clap(short = 'r', long = "reject")] 211 | reject_list: Option<Vec<PathBuf>>, 212 | 213 | /// Print a handful of pseudorandomly selected words from the created list 214 | /// to the terminal. Should NOT be used as secure passphrases. 215 | #[clap(short = 's', long = "samples")] 216 | samples: bool, 217 | 218 | /// Use Sardinas-Patterson algorithm to remove words to make list 219 | /// uniquely decodable. Experimental! 220 | #[clap(short = 'K', long = "schlinkert-prune")] 221 | schlinkert_prune: bool, 222 | 223 | /// Skip the first N lines of inputted files. Useful for dealing with headers, like those from 224 | /// PGP signatures 225 | #[clap(long = "skip-rows-start")] 226 | skip_rows_start: Option<usize>, 227 | 228 | /// Skip the last N lines of inputted files. Useful for dealing with footers, like those from 229 | /// PGP signatures. 230 | #[clap(long = "skip-rows-end")] 231 | skip_rows_end: Option<usize>, 232 | 233 | /// Replace “smart” quotation marks, both “double” and ‘single’, 234 | /// with their "straight" versions 235 | #[clap(short = 'q', long = "straighten")] 236 | straighten_quotes: bool, 237 | 238 | /// Only take first N words from inputted word list. If two or more word list files are 239 | /// inputted, it will combine all given lists by alternating words from the given word list 240 | /// files until it has N words 241 | #[clap(long = "take-first", value_parser=eval_list_length)] 242 | take_first: Option<usize>, 243 | 244 | /// Only take a random N number of words from inputted word list. 245 | /// If two or more word lists are inputted, it will 246 | /// combine arbitrarily and then take a random N words.
If you're looking to cut a list exactly 247 | /// to a specified size, consider the print-rand or whittle-to options. 248 | #[clap(long = "take-rand", value_parser=eval_list_length)] 249 | take_rand: Option<usize>, 250 | 251 | /// Whittle list exactly to a specified length, only taking the minimum number of words 252 | /// from the beginning of inputted list(s). 253 | /// If the outputted list is not exactly the specified length, it will try again by taking a 254 | /// different number of words from the input list(s). As a result, using this option may cause 255 | /// Tidy to take a moment to produce the finished list. 256 | /// Can accept expressions in the form of base**exponent (helpful for generating diceware lists). 257 | /// 258 | /// This option should generally only be used if all of the following conditions are met: 259 | /// (a) the inputted word list is sorted by desirability (e.g. ordered by word frequency); 260 | /// (b) the user is either removing prefix words, removing suffix words, or doing a Schlinkert prune; 261 | /// (c) the user needs the resulting list to be a specified length. 262 | /// 263 | /// Optionally can also take a "starting point" after a comma. For example, 264 | /// --whittle-to 7776,15000 would start by taking the first 15,000 words 265 | /// from the inputted list(s) as a first attempt at making a list of 7,776 words, iterating 266 | /// if necessary. 267 | #[clap(short = 'W', long = "whittle-to")] 268 | whittle_to: Option<String>, 269 | 270 | /// Word list input files. Can be more than one, in which case 271 | /// they'll be combined and de-duplicated. Requires at least 272 | /// one file. 273 | #[clap(name = "Inputted Word Lists", required = true)] 274 | inputted_word_lists: Vec<PathBuf>, 275 | } 276 | 277 | fn main() -> Result<(), String> { 278 | let opt = Args::parse(); 279 | if opt.debug { 280 | eprintln!("Received args: {:?}", opt); 281 | } 282 | 283 | // Some initial validations 284 | // Check given number of dice sides 285 | match validate_dice_sides(opt.dice_sides) { 286 | Ok(()) => (), 287 | Err(e) => { 288 | return Err(e.to_string()); 289 | } 290 | } 291 | 292 | // Check if any of inputted_word_lists are directories 293 | for file in &opt.inputted_word_lists { 294 | if file.is_dir() { 295 | return Err(format!("Given file {:?} is a directory", file)); 296 | } 297 | } 298 | 299 | if opt.cards && opt.dice_sides.is_some() { 300 | return Err( 301 | "Error: Cannot use dice and cards. Must be either cards or dice or neither." 302 | .to_string(), 303 | ); 304 | } 305 | 306 | match validate_list_truncation_options( 307 | &opt.whittle_to, 308 | opt.print_rand, 309 | opt.take_first, 310 | opt.take_rand, 311 | ) { 312 | Ok(()) => (), 313 | Err(e) => { 314 | return Err(e.to_string()); 315 | } 316 | } 317 | 318 | // Check if output file exists 319 | if let Some(ref output_file_name) = opt.output { 320 | if !opt.force_overwrite && Path::new(output_file_name).exists() { 321 | return Err( 322 | "Specified output file already exists. Use --force flag to force an overwrite." 323 | .to_string(), 324 | ); 325 | } 326 | } 327 | 328 | // Determine if this is a niche case in which whittle_to would be a smarter choice 329 | // than (either) print_first or print_rand.
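// For example, a hypothetical invocation like
//     tidy -P --print-first 7776 wordlist.txt
// (where wordlist.txt is a stand-in input file) removes prefix words and then
// cuts to 7,776 words -- exactly the kind of run where --whittle-to tends to
// produce a better final list.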
330 | if (opt.print_first.is_some() || opt.print_rand.is_some()) 331 | && opt.whittle_to.is_none() 332 | && (opt.remove_prefix_words || opt.remove_suffix_words || opt.schlinkert_prune) 333 | && !opt.quiet 334 | { 335 | if opt.print_first.is_some() { 336 | eprintln!( 337 | "RECOMMENDATION: If your input list is sorted by desirability (e.g. word frequency), consider using --whittle-to rather than --print-first if you're removing prefix words, removing suffix words, and/or doing a Schlinkert prune.\n" 338 | ); 339 | } 340 | if opt.print_rand.is_some() { 341 | eprintln!( 342 | "RECOMMENDATION: If your input list is sorted by desirability (e.g. word frequency), consider using --whittle-to rather than --print-rand if you're removing prefix words, removing suffix words, and/or doing a Schlinkert prune.\n" 343 | ); 344 | } 345 | } 346 | 347 | // OK let's do this. Make a Tidy request. 348 | // While it's not declared as mutable here, we will reassign 349 | // it later, unfortunately. 350 | let this_tidy_request = TidyRequest { 351 | list: make_vec_from_filenames( 352 | &opt.inputted_word_lists, 353 | opt.skip_rows_start, 354 | opt.skip_rows_end, 355 | ), 356 | take_first: opt.take_first, 357 | take_rand: opt.take_rand, 358 | sort_alphabetically: !opt.no_alpha_sort, 359 | sort_by_length: opt.sort_by_length, 360 | ignore_after_delimiter: opt.ignore_after_delimiter, 361 | ignore_before_delimiter: opt.ignore_before_delimiter, 362 | to_lowercase: opt.to_lowercase, 363 | normalization_form: opt.normalization_form, 364 | locale: match opt.locale { 365 | Some(lang) => lang, 366 | None => match get_system_lang() { 367 | Some(lang) => lang.replace("_", "-"), 368 | None => "en-US".to_string(), 369 | }, 370 | }, 371 | should_straighten_quotes: opt.straighten_quotes, 372 | should_remove_prefix_words: opt.remove_prefix_words, 373 | should_remove_suffix_words: opt.remove_suffix_words, 374 | should_schlinkert_prune: opt.schlinkert_prune, 375 | should_remove_integers: opt.remove_integers, 376 | should_delete_integers: opt.delete_integers, 377 | should_remove_nonalphanumeric: opt.remove_nonalphanumeric, 378 | should_delete_nonalphanumeric: opt.delete_nonalphanumeric, 379 | should_remove_nonalphabetic: opt.remove_nonalphabetic, 380 | should_remove_non_latin_alphabetic: opt.remove_non_latin_alphabetic, 381 | should_remove_nonascii: opt.remove_nonascii, 382 | should_delete_after_first_delimiter: opt.delete_after_delimiter, 383 | should_delete_before_first_delimiter: opt.delete_before_delimiter, 384 | 385 | // If given more than one file of reject words, combine them 386 | // right here.
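// (For instance, passing `-r rejects_a.txt -r rejects_b.txt` -- hypothetical
// file names -- yields a single combined reject list.)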
387 | reject_list: opt 388 | .reject_list 389 | .map(|list_of_files| make_vec_from_filenames(&list_of_files, None, None)), 390 | // Likewise with approved word lists 391 | approved_list: opt 392 | .approved_list 393 | .map(|list_of_files| make_vec_from_filenames(&list_of_files, None, None)), 394 | // And homophones 395 | homophones_list: opt 396 | .homophones_list 397 | .map(|list_of_files| read_homophones_list_from_filenames(&list_of_files)), 398 | minimum_length: opt.minimum_length, 399 | maximum_length: opt.maximum_length, 400 | maximum_shared_prefix_length: opt.maximum_shared_prefix_length, 401 | minimum_edit_distance: opt.minimum_edit_distance, 402 | print_rand: opt.print_rand, 403 | print_first: opt.print_first, 404 | }; 405 | 406 | let (ignore_before_delimiter, ignore_after_delimiter) = match validate_and_parse_ignore_options( 407 | &this_tidy_request, 408 | opt.dice_sides, 409 | opt.print_dice_sides_as_their_base, 410 | ) { 411 | Ok((ignore_before_delimiter, ignore_after_delimiter)) => { 412 | (ignore_before_delimiter, ignore_after_delimiter) 413 | } 414 | Err(e) => { 415 | return Err(e.to_string()); 416 | } 417 | }; 418 | 419 | // Parse provided "whittle string" for a length_to_whittle_to and an 420 | // optional starting point. 421 | let (mut this_tidy_request, length_to_whittle_to, starting_point) = 422 | match parse_whittle_options(this_tidy_request, opt.whittle_to) { 423 | Ok((this_tidy_request, length_to_whittle_to, starting_point)) => { 424 | (this_tidy_request, length_to_whittle_to, starting_point) 425 | } 426 | Err(e) => { 427 | return Err(e); 428 | } 429 | }; 430 | 431 | // Finally get to actually tidy the inputted_word_list 432 | // If we have a length_to_whittle_to and a starting_point, we know we're 433 | // whittling, which is (still) a bit too complicated for my tastes. But we 434 | // need a while loop here. 435 | let mut this_list_length = 0; 436 | let tidied_list = match (length_to_whittle_to, starting_point) { 437 | (Some(our_length_to_whittle_to), Some(mut our_starting_point)) => { 438 | let mut this_tidied_list = vec![]; 439 | while this_list_length != our_length_to_whittle_to { 440 | // Edit this_tidy_request to have our new starting point 441 | this_tidy_request.take_first = Some(our_starting_point); 442 | 443 | // This clone might be too expensive. Maybe tidy_list can take a 444 | // reference? 445 | this_tidied_list = tidy_list(this_tidy_request.clone()); 446 | 447 | this_list_length = this_tidied_list.len(); 448 | our_starting_point = get_new_starting_point_guess( 449 | our_starting_point, 450 | this_list_length, 451 | our_length_to_whittle_to, 452 | ); 453 | if opt.debug { 454 | eprintln!( 455 | "Whittled list to {}. Will try again, taking {} words.", 456 | this_list_length, our_starting_point 457 | ); 458 | } 459 | } 460 | // Out of the loop, which means the list is the user-specified 461 | // length. Return this version of the list. 462 | this_tidied_list 463 | } 464 | (_, _) => { 465 | // In all other cases, the `whittle_to` option was not specified, so 466 | // proceed as normal, sending all parameters in this_tidy_request 467 | // as they are, just once.
468 | tidy_list(this_tidy_request) 469 | } 470 | }; 471 | 472 | // Next, we figure out what to print where 473 | let this_print_request = PrintRequest { 474 | tidied_list, 475 | dry_run: opt.dry_run, 476 | quiet: opt.quiet, 477 | output: opt.output, 478 | cards: opt.cards, 479 | dice_sides: opt.dice_sides, 480 | print_dice_sides_as_their_base: opt.print_dice_sides_as_their_base, 481 | attributes: opt.attributes, 482 | attributes_as_json: opt.attributes_as_json, 483 | samples: opt.samples, 484 | ignore_before_delimiter, 485 | ignore_after_delimiter, 486 | }; 487 | print_list(this_print_request); 488 | 489 | Ok(()) 490 | } 491 | 492 | /// Read the LANG environment variable, if possible 493 | fn get_system_lang() -> Option<String> { 494 | let name_of_environmental_variable = "LANG"; 495 | match env::var(name_of_environmental_variable) { 496 | Ok(l) => Some(l.split('.').collect::<Vec<&str>>()[0].to_string()), 497 | Err(_e) => None, 498 | } 499 | } 500 | -------------------------------------------------------------------------------- /src/display_information/mod.rs: -------------------------------------------------------------------------------- 1 | //! Display attributes and information about the generated word list 2 | 3 | pub mod uniquely_decodable; 4 | use crate::count_characters; 5 | use crate::display_information::uniquely_decodable::is_uniquely_decodable; 6 | use crate::parse_delimiter; 7 | use crate::split_and_vectorize; 8 | use serde::{Deserialize, Serialize}; 9 | use std::fmt; 10 | 11 | #[derive(Serialize, Deserialize)] 12 | pub struct ListAttributes { 13 | pub list_length: usize, 14 | pub mean_word_length: f32, 15 | pub entropy_per_word: f64, 16 | pub shortest_word_length: usize, 17 | pub shortest_word_example: String, 18 | pub longest_word_length: usize, 19 | pub longest_word_example: String, 20 | 21 | pub is_free_of_prefix_words: Option<bool>, 22 | pub is_free_of_suffix_words: Option<bool>, 23 | 24 | pub is_uniquely_decodable: Option<bool>, 25 | 26 | pub efficiency_per_character: f64, 27 | pub assumed_entropy_per_character: f64, 28 | pub is_above_brute_force_line: bool, 29 | pub is_above_shannon_line: bool, 30 | pub shortest_edit_distance: Option<usize>, 31 | pub mean_edit_distance: Option<f64>, 32 | pub longest_shared_prefix: Option<usize>, 33 | pub unique_character_prefix: Option<usize>, 34 | pub kraft_mcmillan: KraftMcmillanOutcome, 35 | pub samples: Option<Vec<String>>, 36 | } 37 | 38 | #[derive(Serialize, Deserialize, Debug)] 39 | pub enum KraftMcmillanOutcome { 40 | Satisfied, 41 | NotSatisfied, 42 | } 43 | impl fmt::Display for KraftMcmillanOutcome { 44 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 45 | match self { 46 | KraftMcmillanOutcome::Satisfied => write!(f, "satisfied"), 47 | KraftMcmillanOutcome::NotSatisfied => write!(f, "not satisfied"), 48 | } 49 | } 50 | } 51 | 52 | fn make_attributes(list: &[String], level: u8, samples: bool) -> ListAttributes { 53 | let samples = if samples { 54 | Some(generate_samples(list)) 55 | } else { 56 | None 57 | }; 58 | let shortest_word_example = list 59 | .iter() 60 | .min_by(|a, b| count_characters(a).cmp(&count_characters(b))) 61 | .unwrap() 62 | .to_string(); 63 | let longest_word_example = list 64 | .iter() 65 | .max_by(|a, b| count_characters(a).cmp(&count_characters(b))) 66 | .unwrap() 67 | .to_string(); 68 | 69 | let is_free_of_prefix_words = if level >= 2 { 70 | Some(!has_prefix_words(list)) 71 | } else { 72 | None 73 | }; 74 | 75 | let is_free_of_suffix_words = if level >= 2 { 76 | Some(!has_suffix_words(list)) 77 | } else { 78 | None 79 | }; 80 | 81 | let is_uniquely_decodable = if level >= 2 { 82 | Some(is_uniquely_decodable(list)) 83 | } else { 84 | None 85 | }; 86 | 87 | let shortest_edit_distance = if level >= 3 { 88 | Some(find_shortest_edit_distance(list)) 89 | } else { 90 | None 91 | }; 92 | let mean_edit_distance = if level >= 3 { 93 | Some(find_mean_edit_distance(list)) 94 | } else { 95 | None 96 | }; 97 | 98 | let longest_shared_prefix = if level >= 4 { 99 | Some(find_longest_shared_prefix( 100 | list, 101 | Some(count_characters(&longest_word_example)), 102 | )) 103 | } else { 104 | None 105 | }; 106 | let unique_character_prefix = if level >= 4 { 107 | // By definition, unique_character_prefix == longest_shared_prefix + 1 108 | // We have to use map in case longest_shared_prefix is None, which is 109 | // unlikely, but technically possible. 110 | longest_shared_prefix.map(|longest_shared_prefix| longest_shared_prefix + 1) 111 | } else { 112 | None 113 | }; 114 | ListAttributes { 115 | list_length: list.len(), 116 | mean_word_length: mean_word_length(list), 117 | entropy_per_word: calc_entropy_per_word(list.len()), 118 | shortest_word_length: count_characters(&shortest_word_example), 119 | shortest_word_example, 120 | longest_word_length: count_characters(&longest_word_example), 121 | longest_word_example, 122 | efficiency_per_character: efficiency_per_character(list), 123 | assumed_entropy_per_character: assumed_entropy_per_character(list), 124 | is_above_brute_force_line: is_above_brute_force_line(list), 125 | is_above_shannon_line: is_above_shannon_line(list), 126 | is_free_of_prefix_words, 127 | is_free_of_suffix_words, 128 | is_uniquely_decodable, 129 | shortest_edit_distance, 130 | mean_edit_distance, 131 | longest_shared_prefix, 132 | unique_character_prefix, 133 | kraft_mcmillan: satisfies_kraft_mcmillan(list), 134 | samples, 135 | } 136 | } 137 | 138 | /// If a user gets a passphrase consisting entirely of the shortest words, 139 | /// it's theoretically possible that we could OVERESTIMATE entropy 140 | /// per word. We can determine if we've done this by comparing our 141 | /// entropy estimate against a simple brute force attack of all lowercase 142 | /// English letters, under which we assume each character adds roughly 4.7 bits of entropy. 143 | /// Note that this slightly obscure method of calculation keeps the exponentiation 144 | /// in integer arithmetic, which helps accuracy. 145 | fn is_above_brute_force_line(list: &[String]) -> bool { 146 | let g: i32 = 26; // roughly: assumed alphabet length 147 | let shortest_word_length = get_shortest_word_length(list) as u32; 148 | let list_length = list.len() as i32; 149 | list_length as f64 <= g.pow(shortest_word_length).into() 150 | } 151 | 152 | /// In 1951, Claude Shannon estimated that English words only have 153 | /// about 2.6 bits of entropy per character, rather than (roughly) 4.7 bits per character. 154 | /// 155 | /// Thus, this is a more difficult line for a given list to pass above than 156 | /// the "brute force" line described above. 157 | fn is_above_shannon_line(list: &[String]) -> bool { 158 | let shortest_word_length = get_shortest_word_length(list) as u32; 159 | let g: f64 = 6.1; // 2**2.6 is 6.1 when we maintain correct number of significant digits. 160 | let list_length = list.len() as i32; 161 | list_length as f64 <= g.powf(shortest_word_length.into()) 162 | } 163 | 164 | /// This is a large and long function that prints all of the attributes of 165 | /// the generated (new) list.
166 | /// 167 | /// We just want to "display" this information, rather than print it to files 168 | /// or stdout, so we use `eprintln!` 169 | pub fn display_list_information( 170 | list: &[String], 171 | level: u8, 172 | attributes_as_json: bool, 173 | ignore_ending_metadata_delimiter: Option<char>, 174 | ignore_starting_metadata_delimiter: Option<char>, 175 | samples: bool, 176 | ) { 177 | let list = make_list_free_of_metadata( 178 | list, 179 | ignore_starting_metadata_delimiter, 180 | ignore_ending_metadata_delimiter, 181 | ); 182 | let list_attributes = make_attributes(&list, level, samples); 183 | if attributes_as_json { 184 | print_attributes_as_json(&list_attributes); 185 | } else { 186 | if level >= 1 { 187 | eprintln!("Attributes of new list"); 188 | eprintln!("----------------------"); 189 | eprintln!( 190 | "List length : {} words", 191 | list_attributes.list_length 192 | ); 193 | eprintln!( 194 | "Mean word length : {:.2} characters", 195 | list_attributes.mean_word_length 196 | ); 197 | eprintln!( 198 | "Length of shortest word : {} characters ({})", 199 | list_attributes.shortest_word_length, list_attributes.shortest_word_example 200 | ); 201 | eprintln!( 202 | "Length of longest word : {} characters ({})", 203 | list_attributes.longest_word_length, list_attributes.longest_word_example 204 | ); 205 | if let Some(is_free_of_prefix_words) = list_attributes.is_free_of_prefix_words { 206 | eprintln!("Free of prefix words? : {}", is_free_of_prefix_words); 207 | } 208 | if let Some(is_free_of_suffix_words) = list_attributes.is_free_of_suffix_words { 209 | eprintln!("Free of suffix words? : {}", is_free_of_suffix_words); 210 | } 211 | 212 | // At least for now, this one is EXPENSIVE 213 | if let Some(is_uniquely_decodable) = list_attributes.is_uniquely_decodable { 214 | eprintln!("Uniquely decodable? : {}", is_uniquely_decodable); 215 | } 216 | 217 | eprintln!( 218 | "Entropy per word : {:.3} bits", 219 | list_attributes.entropy_per_word 220 | ); 221 | eprintln!( 222 | "Efficiency per character : {:.3} bits", 223 | list_attributes.efficiency_per_character 224 | ); 225 | eprintln!( 226 | "Assumed entropy per char : {:.3} bits", 227 | list_attributes.assumed_entropy_per_character 228 | ); 229 | eprintln!( 230 | "Above brute force line? : {}", 231 | list_attributes.is_above_brute_force_line 232 | ); 233 | 234 | if level >= 4 { 235 | eprintln!( 236 | "Above Shannon line? : {}", 237 | list_attributes.is_above_shannon_line 238 | ); 239 | } 240 | 241 | if let Some(shortest_edit_distance) = list_attributes.shortest_edit_distance { 242 | eprintln!("Shortest edit distance : {}", shortest_edit_distance) 243 | } 244 | if let Some(mean_edit_distance) = list_attributes.mean_edit_distance { 245 | eprintln!("Mean edit distance : {:.3}", mean_edit_distance) 246 | } 247 | 248 | if let Some(longest_shared_prefix) = list_attributes.longest_shared_prefix { 249 | eprintln!("Longest shared prefix : {}", longest_shared_prefix) 250 | } 251 | // Number of characters required to definitely get to a unique 252 | // prefix 253 | if let Some(unique_character_prefix) = list_attributes.unique_character_prefix { 254 | eprintln!("Unique character prefix : {}", unique_character_prefix) 255 | } 256 | 257 | if level >= 4 { 258 | eprintln!( 259 | "Kraft-McMillan inequality : {}", 260 | list_attributes.kraft_mcmillan 261 | ); 262 | } 263 | } 264 | if let Some(samples) = list_attributes.samples { 265 | print_samples(samples) 266 | } 267 | } 268 | } 269 | 270 | fn print_attributes_as_json(list_attributes: &ListAttributes) { 271 | let json = serde_json::to_string(&list_attributes).unwrap(); 272 | eprintln!("{}", json); 273 | } 274 | 275 | fn make_list_free_of_metadata( 276 | list: &[String], 277 | ignore_starting_metadata_delimiter: Option<char>, 278 | ignore_ending_metadata_delimiter: Option<char>, 279 | ) -> Vec<String> { 280 | match ( 281 | ignore_starting_metadata_delimiter, 282 | ignore_ending_metadata_delimiter, 283 | ) { 284 | (Some(delimiter), None) => { 285 | let delimiter = parse_delimiter(delimiter).unwrap(); 286 | let mut just_the_words = vec![]; 287 | for word in list { 288 | let split_vec = split_and_vectorize(word, &delimiter.to_string()); 289 | just_the_words.push(split_vec[1].to_string()); 290 | } 291 | just_the_words 292 | } 293 | (None, Some(delimiter)) => { 294 | let delimiter = parse_delimiter(delimiter).unwrap(); 295 | let mut just_the_words = vec![]; 296 | for word in list { 297 | let split_vec = split_and_vectorize(word, &delimiter.to_string()); 298 | just_the_words.push(split_vec[0].to_string()); 299 | } 300 | just_the_words 301 | } 302 | (Some(ref _delimiter1), Some(ref _delimiter2)) => { 303 | panic!("Can't ignore metadata on both sides currently") 304 | } 305 | (None, None) => list.to_vec(), 306 | } 307 | } 308 | 309 | use rand::prelude::IndexedRandom; 310 | /// Pick 30 random words from the newly created word list (these are 311 | /// later displayed as 5 sample 6-word passphrases). 312 | pub fn generate_samples(list: &[String]) -> Vec<String> { 313 | let mut samples: Vec<String> = vec![]; 314 | for _n in 0..30 { 315 | match list.choose(&mut rand::rng()) { 316 | Some(word) => samples.push(word.to_string()), 317 | None => panic!("Couldn't pick a random word"), 318 | } 319 | } 320 | samples 321 | } 322 | 323 | /// Calculate the entropy per word of a word list, given its size. 324 | /// We want this entropy value measured in bits, hence the use 325 | /// of log2() 326 | /// 327 | /// Returns `f64` because this returned value (bits of entropy per 328 | /// word) will most likely not be a whole number (which is fine!) 329 | pub fn calc_entropy_per_word(list_length: usize) -> f64 { 330 | (list_length as f64).log2() 331 | } 332 | 333 | use crate::edit_distance::find_edit_distance; 334 | /// Calculate the shortest edit distance between any two words on the list. 335 | fn find_shortest_edit_distance(list: &[String]) -> usize { 336 | // This use of u32::MAX is smelly, but not sure I know how to do it better.
337 | let mut shortest_edit_distance = u32::MAX; 338 | // Check each unordered pair of words exactly once. (Note that only iterating word1 over half of the list would miss pairs that fall entirely in the other half.) 339 | for (i, word1) in list.iter().enumerate() { 340 | for word2 in list[0..i].iter() { 341 | if word1 != word2 { 342 | let this_edit_distance = find_edit_distance(word1, word2); 343 | if this_edit_distance < shortest_edit_distance { 344 | shortest_edit_distance = this_edit_distance; 345 | } 346 | // If we've found an edit distance of 1, we know that'll be the 347 | // shortest possible (since Tidy removes duplicates by default, so 348 | // a shortest_edit_distance of 0 is NOT possible) 349 | if shortest_edit_distance == 1 { 350 | return 1; 351 | } 352 | } 353 | } 354 | } 355 | shortest_edit_distance.try_into().unwrap() 356 | } 357 | 358 | /// Calculate the mean edit distance between all pairs of words on the list. 359 | pub fn find_mean_edit_distance(list: &[String]) -> f64 { 360 | let mut sum_of_all_edit_distances: f64 = 0.0; 361 | let mut number_of_edit_distances_measured: f64 = 0.0; 362 | for (i, word1) in list.iter().enumerate() { 363 | // The list[0..i] upper-bound in this inner loop is so that we don't do 364 | // twice as many calls as necessary. Otherwise we would be finding the 365 | // edit distance from word1 -> word2 and word2 -> word1. 366 | // This loop also helpfully prevents us from checking a word's edit 367 | // distance to itself (0). 368 | for word2 in list[0..i].iter() { 369 | let this_edit_distance = find_edit_distance(word1, word2); 370 | number_of_edit_distances_measured += 1.0; 371 | sum_of_all_edit_distances += this_edit_distance as f64; 372 | } 373 | } 374 | eprintln!( 375 | "Number of edit distances recorded: {}\nSum of all edit distances: {}", 376 | number_of_edit_distances_measured, sum_of_all_edit_distances 377 | ); 378 | sum_of_all_edit_distances / number_of_edit_distances_measured 379 | } 380 | 381 | /// Nested loops in this function get the `longest_shared_prefix` 382 | /// between any two words on the given list. Returns the length of this 383 | /// longest shared prefix, a notable cryptographic metric. 384 | /// Optionally takes longest_word_length to speed up the process. 385 | pub fn find_longest_shared_prefix(list: &[String], longest_word_length: Option<usize>) -> usize { 386 | let mut longest_shared_prefix = 0; 387 | 388 | // If longest_word_length is given, use that. If not, 389 | // calculate it here. 390 | let longest_word_length = match longest_word_length { 391 | Some(longest_word_length) => longest_word_length, 392 | None => count_characters( 393 | list.iter() 394 | .max_by(|a, b| count_characters(a).cmp(&count_characters(b))) 395 | .unwrap(), 396 | ), 397 | }; 398 | for word1 in list { 399 | for word2 in list { 400 | if word1 != word2 { 401 | // Here we convert from the zero-indexed first difference to 402 | // the (1-indexed) length of the longest shared prefix, so we don't 403 | // need a `- 1`. 404 | let this_shared_prefix_length = 405 | find_first_different_character_zero_indexed(word1, word2); 406 | if this_shared_prefix_length > longest_shared_prefix { 407 | longest_shared_prefix = this_shared_prefix_length; 408 | } 409 | // If we found a shared prefix that's only one fewer than the longest word on 410 | // the list, we know this is the longest shared prefix we'll ever find. 411 | // We can short-circuit return to save time.
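// (For example, if the longest word on the list is "zippy" and the list also
// contains "zipp", their shared prefix of 4 characters is the maximum
// possible, since duplicates have already been removed.)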
412 | if this_shared_prefix_length == longest_word_length - 1 { 413 | return this_shared_prefix_length; 414 | } 415 | } 416 | } 417 | } 418 | longest_shared_prefix 419 | } 420 | 421 | /// Given 2 words, finds the index of the first character that is 422 | /// **different** within them. 423 | /// ``` 424 | /// use tidy::display_information::find_first_different_character_zero_indexed; 425 | /// 426 | /// assert_eq!( 427 | /// find_first_different_character_zero_indexed("hello", "help"), 3 428 | /// // First **different** character is `l` vs. `p`. 429 | /// ); 430 | /// 431 | /// // Handles words of different length by falling back to the length of the shorter 432 | /// // of the two words: 433 | /// assert_eq!( 434 | /// find_first_different_character_zero_indexed("zip", "zippy"), 3 435 | /// ); 436 | /// assert_eq!( 437 | /// find_first_different_character_zero_indexed("zippy", "zip"), 3 438 | /// ); 439 | /// ``` 440 | pub fn find_first_different_character_zero_indexed(word1: &str, word2: &str) -> usize { 441 | for (i, c1) in word1.chars().enumerate() { 442 | match word2.chars().nth(i) { 443 | Some(c2) => { 444 | if c1 != c2 { 445 | return i; 446 | } else { 447 | continue; 448 | } 449 | } 450 | // word1 is longer than word2 451 | None => { 452 | return count_characters(word2); 453 | } 454 | } 455 | } 456 | // Fall back to shorter word length 457 | if count_characters(word1) < count_characters(word2) { 458 | count_characters(word1) 459 | } else { 460 | count_characters(word2) 461 | } 462 | } 463 | 464 | /// Checks if a list has any words that are prefixes of other 465 | /// words on the list. 466 | fn has_prefix_words(list: &[String]) -> bool { 467 | for word1 in list { 468 | for word2 in list { 469 | if word1 != word2 && word1.starts_with(word2) { 470 | return true; 471 | } 472 | } 473 | } 474 | false 475 | } 476 | 477 | /// Checks if a list has any words that are suffixes of other 478 | /// words on the list. 479 | fn has_suffix_words(list: &[String]) -> bool { 480 | for word1 in list { 481 | for word2 in list { 482 | if word1 != word2 && word1.ends_with(word2) { 483 | return true; 484 | } 485 | } 486 | } 487 | false 488 | } 489 | 490 | /// Assuming that users get a passphrase consisting solely of 491 | /// the shortest word on the list, we want to check against 492 | /// a brute-force attack in exactly that situation. To do so, 493 | /// we calculate a value I'm calling "assumed entropy per character". 494 | /// 495 | /// If this value is above `log2(26)` or 4.7 bits, there's a chance 496 | /// that we'd _over_-estimate the entropy of passphrases created 497 | /// using the word list. 498 | pub fn assumed_entropy_per_character(list: &[String]) -> f64 { 499 | let shortest_word_length = get_shortest_word_length(list) as f64; 500 | let assumed_entropy_per_word = calc_entropy_per_word(list.len()); 501 | 502 | assumed_entropy_per_word / shortest_word_length 503 | } 504 | 505 | /// Calculates the "efficiency" of the list. 506 | /// Basically this is the number of bits of entropy generated by 507 | /// the AVERAGE character. Thus it is different from 508 | /// `assumed_entropy_per_word`, which you can think of as 509 | /// the "worst case scenario" (user getting only words of the SHORTEST 510 | /// length in their passphrase).
511 | pub fn efficiency_per_character(list: &[String]) -> f64 { 512 | let mean_word_length = mean_word_length(list) as f64; 513 | let entropy_per_word = calc_entropy_per_word(list.len()); 514 | 515 | entropy_per_word / mean_word_length 516 | } 517 | 518 | /// This function returns an outcome based on whether the list satisfies 519 | /// the Kraft-McMillan inequality. 520 | /// See: <https://en.wikipedia.org/wiki/Kraft%E2%80%93McMillan_inequality> 521 | pub fn satisfies_kraft_mcmillan(list: &[String]) -> KraftMcmillanOutcome { 522 | let alphabet_size = count_unique_characters(list); 523 | let mut running_total: f64 = 0.0; 524 | for word in list { 525 | running_total += 526 | 1.0 / (alphabet_size.pow(count_characters(word).try_into().unwrap()) as f64); 527 | } 528 | if running_total <= 1.0 { 529 | KraftMcmillanOutcome::Satisfied 530 | } else { 531 | KraftMcmillanOutcome::NotSatisfied 532 | } 533 | } 534 | 535 | /// Helper function for calculating the Kraft-McMillan inequality 536 | fn count_unique_characters(list: &[String]) -> usize { 537 | let mut characters = vec![]; 538 | for word in list { 539 | for l in word.chars() { 540 | characters.push(l); 541 | } 542 | } 543 | characters.sort(); 544 | characters.dedup(); 545 | characters.len() 546 | } 547 | 548 | /// A simple helper function that gets the length of the shortest word on 549 | /// a list. 550 | pub fn get_shortest_word_length(list: &[String]) -> usize { 551 | count_characters( 552 | list.iter() 553 | .min_by(|a, b| count_characters(a).cmp(&count_characters(b))) 554 | .unwrap(), 555 | ) 556 | } 557 | 558 | /// Calculates the mean (or average) word length of a given word 559 | /// list 560 | pub fn mean_word_length(list: &[String]) -> f32 { 561 | list.iter() 562 | .map(|word| count_characters(word)) 563 | .sum::<usize>() as f32 564 | / list.len() as f32 565 | } 566 | 567 | fn print_samples(samples: Vec<String>) { 568 | eprintln!("\nWord samples"); 569 | eprintln!("------------"); 570 | for n in 0..30 { 571 | if n != 0 && n % 6 == 0 { 572 | // if we're at the end of the 6th word, 573 | // print a newline 574 | eprintln!(); 575 | } else if n != 0 { 576 | // else just print a space to go between each 577 | // word 578 | eprint!(" "); 579 | } 580 | eprint!("{}", samples[n]); 581 | } 582 | eprintln!(); 583 | } 584 | -------------------------------------------------------------------------------- /tests/list_manipulation_tests.rs: -------------------------------------------------------------------------------- 1 | mod list_manipulation_tests { 2 | use tidy::dice::print_as_dice; // not exactly sure why I need this here...
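// (print_as_dice is exercised by the dice-roll tests at the bottom of this module.)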
3 | use tidy::list_manipulations::reverse_all_words; 4 | use tidy::*; 5 | 6 | fn make_lists() -> (Vec<String>, Vec<String>, Vec<String>, Vec<String>) { 7 | ( 8 | vec![ 9 | " zookeeper", 10 | "cHarLie", 11 | "keeper", 12 | "app", 13 | "tea", 14 | "addiction", 15 | "zoo", 16 | "keeper", 17 | "stationary ", 18 | "tea", 19 | "station", 20 | "apple", 21 | "sécréter", 22 | "séc", 23 | "actor", 24 | ] 25 | .iter() 26 | .map(|x| x.to_string()) 27 | .collect(), 28 | vec![ 29 | " wizard ", 30 | "ardoR", 31 | "tea", 32 | "11225 active", 33 | " ", 34 | "11152 acclaim", 35 | "its456", 36 | "11156 word tabs", 37 | "19-6-8 clad", 38 | "be", 39 | "I", 40 | "vAcation", 41 | "take", 42 | "world999", 43 | "", 44 | "mistake", 45 | "tee", 46 | "post-modern", 47 | "13910 word with spaces in it", 48 | "comma,203478", 49 | "“smart”", 50 | "‘quotes’", 51 | " h as spaces ", 52 | ] 53 | .iter() 54 | .map(|x| x.to_string()) 55 | .collect(), 56 | vec![ 57 | "Normal", 58 | "the,2048", 59 | "اج 12", 60 | "11225 tab", 61 | "11152 space", 62 | "11156 word tabs", 63 | "word-with-hypens", 64 | "Uppercase", 65 | "hello109823", 66 | " ", 67 | "", 68 | "13910 word with spaces in it", 69 | "comma,203478", 70 | "京", 71 | "can't", 72 | "\"dumb quotes\"", 73 | ] 74 | .iter() 75 | .map(|x| x.to_string()) 76 | .collect(), 77 | vec![ 78 | "énigme", "enlever", "abbey", "zoo", "Zambia", "eager", "ezra", "año", "antena", 79 | "anaconda", "aptitude", 80 | ] 81 | .iter() 82 | .map(|word| word.to_string()) 83 | .collect(), 84 | ) 85 | } 86 | 87 | #[test] 88 | fn can_remove_duplicate_words() { 89 | let this_tidy_request = TidyRequest { 90 | list: make_lists().0, 91 | ..Default::default() 92 | }; 93 | let new_list = tidy_list(this_tidy_request); 94 | assert!(new_list.contains(&"tea".to_string())); 95 | assert!(new_list.contains(&"apple".to_string())); 96 | assert!(new_list.len() == make_lists().0.len() - 2); 97 | } 98 | 99 | #[test] 100 | fn can_sort_words_alphabetically() { 101 | let this_tidy_request = TidyRequest { 102 | list: make_lists().0, 103 | sort_alphabetically: true, 104 | locale: "en-US".to_string(), 105 | ..Default::default() 106 | }; 107 | let new_list = tidy_list(this_tidy_request); 108 | assert!(new_list[0] == "actor".to_string()); 109 | assert!(new_list.contains(&"station".to_string())); 110 | assert!(new_list[new_list.len() - 1] == "zookeeper".to_string()); 111 | } 112 | 113 | #[test] 114 | fn respect_option_to_not_sort_alphabetically() { 115 | let this_tidy_request = TidyRequest { 116 | list: make_lists().0, 117 | sort_alphabetically: false, 118 | ..Default::default() 119 | }; 120 | let new_list = tidy_list(this_tidy_request); 121 | assert!(new_list[0] == "zookeeper".to_string()); 122 | assert!(new_list.contains(&"apple".to_string())); 123 | assert_eq!(new_list[new_list.len() - 4], "apple".to_string()); 124 | } 125 | 126 | #[test] 127 | fn can_sort_by_length() { 128 | let this_tidy_request = TidyRequest { 129 | list: make_lists().0, 130 | sort_by_length: true, 131 | locale: "en-US".to_string(), 132 | ..Default::default() 133 | }; 134 | let new_list = tidy_list(this_tidy_request); 135 | assert!(new_list[0] == "stationary".to_string()); 136 | assert!(new_list[1] == "addiction".to_string()); 137 | } 138 | 139 | #[test] 140 | fn removes_blank_lines() { 141 | let this_tidy_request = TidyRequest { 142 | list: make_lists().1, 143 | ..Default::default() 144 | }; 145 | let new_list = tidy_list(this_tidy_request); 146 | assert!(new_list.len() == make_lists().1.len() - 2); 147 | } 148 | 149 | #[test] 150 | fn can_take_first_4_elements() { 151 | let
this_tidy_request = TidyRequest { 152 | list: make_lists().1, 153 | take_first: Some(4), 154 | ..Default::default() 155 | }; 156 | let new_list = tidy_list(this_tidy_request); 157 | println!("List length now {}: {:?}", new_list.len(), new_list); 158 | assert_eq!(new_list.len(), 4); 159 | assert_ne!(new_list.len(), 3); 160 | assert_ne!(new_list.len(), 15); 161 | } 162 | 163 | #[test] 164 | fn removes_starting_and_trailing_whitespace() { 165 | let this_tidy_request = TidyRequest { 166 | list: make_lists().1, 167 | ..Default::default() 168 | }; 169 | let new_list = tidy_list(this_tidy_request); 170 | assert!(new_list.contains(&"wizard".to_string())); 171 | } 172 | 173 | #[test] 174 | fn does_not_remove_inner_spaces() { 175 | let this_tidy_request = TidyRequest { 176 | list: make_lists().1, 177 | ..Default::default() 178 | }; 179 | let new_list = tidy_list(this_tidy_request); 180 | assert!(new_list.contains(&"h as spaces".to_string())); 181 | } 182 | 183 | #[test] 184 | fn can_straighten_quotes() { 185 | let this_tidy_request = TidyRequest { 186 | list: make_lists().1, 187 | should_straighten_quotes: true, 188 | ..Default::default() 189 | }; 190 | let new_list = tidy_list(this_tidy_request); 191 | assert!(new_list.contains(&"\"smart\"".to_string())); 192 | assert!(new_list.contains(&"'quotes'".to_string())); 193 | } 194 | #[test] 195 | fn can_delete_integers_from_words() { 196 | let this_tidy_request = TidyRequest { 197 | list: make_lists().1, 198 | should_delete_integers: true, 199 | ..Default::default() 200 | }; 201 | let new_list = tidy_list(this_tidy_request); 202 | assert!(new_list.contains(&"active".to_string())); 203 | } 204 | 205 | #[test] 206 | fn can_delete_nonalphanumeric_from_words() { 207 | let this_tidy_request = TidyRequest { 208 | list: make_lists().1, 209 | should_delete_nonalphanumeric: true, 210 | ..Default::default() 211 | }; 212 | let new_list = tidy_list(this_tidy_request); 213 | assert!(new_list.contains(&"1968clad".to_string())); 214 | assert!(new_list.contains(&"take".to_string())); 215 | } 216 | 217 | #[test] 218 | fn can_remove_nonalphanumeric_words_from_list() { 219 | let this_tidy_request = TidyRequest { 220 | list: make_lists().2, 221 | should_remove_nonalphanumeric: true, 222 | ..Default::default() 223 | }; 224 | let new_list = tidy_list(this_tidy_request); 225 | 226 | assert!(new_list.contains(&"Uppercase".to_string())); 227 | assert!(new_list.contains(&"京".to_string())); 228 | assert!(new_list.contains(&"hello109823".to_string())); 229 | assert!(!new_list.contains(&"word-with-hypens".to_string())); 230 | assert!(!new_list.contains(&"comma,203478".to_string())); 231 | assert!(!new_list.contains(&"اج 12".to_string())); 232 | } 233 | 234 | #[test] 235 | fn can_remove_nonalphabetic_words_from_list() { 236 | let this_tidy_request = TidyRequest { 237 | list: make_lists().2, 238 | should_remove_nonalphabetic: true, 239 | ..Default::default() 240 | }; 241 | let new_list = tidy_list(this_tidy_request); 242 | 243 | assert!(new_list.contains(&"Uppercase".to_string())); 244 | assert!(new_list.contains(&"京".to_string())); 245 | assert!(!new_list.contains(&"hello109823".to_string())); 246 | assert!(!new_list.contains(&"word-with-hypens".to_string())); 247 | assert!(!new_list.contains(&"comma,203478".to_string())); 248 | assert!(!new_list.contains(&"اج 12".to_string())); 249 | } 250 | #[test] 251 | fn can_remove_non_latin_alphabetic_words_from_list() { 252 | let this_tidy_request = TidyRequest { 253 | list: make_lists().2, 254 | should_remove_non_latin_alphabetic: true, 255 | 
..Default::default() 256 | }; 257 | let new_list = tidy_list(this_tidy_request); 258 | 259 | assert!(new_list.contains(&"Uppercase".to_string())); 260 | assert!(!new_list.contains(&"京".to_string())); 261 | assert!(!new_list.contains(&"hello109823".to_string())); 262 | assert!(!new_list.contains(&"word-with-hypens".to_string())); 263 | assert!(!new_list.contains(&"comma,203478".to_string())); 264 | assert!(!new_list.contains(&"اج 12".to_string())); 265 | } 266 | 267 | #[test] 268 | fn can_remove_non_ascii_words_from_list() { 269 | let this_tidy_request = TidyRequest { 270 | list: make_lists().2, 271 | should_remove_nonascii: true, 272 | ..Default::default() 273 | }; 274 | let new_list = tidy_list(this_tidy_request); 275 | 276 | assert!(new_list.contains(&"Uppercase".to_string())); 277 | assert!(new_list.contains(&"hello109823".to_string())); 278 | assert!(new_list.contains(&"word-with-hypens".to_string())); 279 | assert!(new_list.contains(&"comma,203478".to_string())); 280 | assert!(!new_list.contains(&"京".to_string())); 281 | assert!(!new_list.contains(&"اج 12".to_string())); 282 | } 283 | 284 | #[test] 285 | fn can_delete_before_first_tab() { 286 | let this_tidy_request = TidyRequest { 287 | list: make_lists().1, 288 | should_delete_before_first_delimiter: Some('\t'), 289 | ..Default::default() 290 | }; 291 | let new_list = tidy_list(this_tidy_request); 292 | assert!(new_list.contains(&"active".to_string())); 293 | assert!(new_list.contains(&"acclaim".to_string())); 294 | // Only remove through FIRST tab 295 | assert!(new_list.contains(&"word\ttabs".to_string())); 296 | } 297 | 298 | #[test] 299 | fn can_delete_before_first_space() { 300 | let this_tidy_request = TidyRequest { 301 | list: make_lists().1, 302 | should_delete_before_first_delimiter: Some(' '), 303 | ..Default::default() 304 | }; 305 | let new_list = tidy_list(this_tidy_request); 306 | assert!(new_list.contains(&"clad".to_string())); 307 | // Check that it only removes characters through first space, rather than just 308 | // between first space and second space, for example 309 | assert!(new_list.contains(&"word with spaces in it".to_string())); 310 | // Tidy trims leading whitespace first, so the "h" 311 | // will be cut here. 
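// (That is: " h as spaces " -> trimmed to "h as spaces" -> delete through
// first space -> "as spaces".)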
312 | assert!(new_list.contains(&"as spaces".to_string())); 313 | } 314 | #[test] 315 | fn can_delete_before_first_comma() { 316 | let this_tidy_request = TidyRequest { 317 | list: make_lists().1, 318 | should_delete_before_first_delimiter: Some(','), 319 | ..Default::default() 320 | }; 321 | let new_list = tidy_list(this_tidy_request); 322 | assert!(new_list.contains(&"203478".to_string())); 323 | assert!(new_list.contains(&"h as spaces".to_string())); 324 | } 325 | 326 | #[test] 327 | fn can_delete_after_first_tab() { 328 | let this_tidy_request = TidyRequest { 329 | list: make_lists().1, 330 | should_delete_after_first_delimiter: Some('\t'), 331 | ..Default::default() 332 | }; 333 | let new_list = tidy_list(this_tidy_request); 334 | assert!(new_list.contains(&"11225".to_string())); 335 | assert!(new_list.contains(&"11152".to_string())); 336 | // remove after FIRST tab 337 | assert!(new_list.contains(&"11156".to_string())); 338 | } 339 | #[test] 340 | fn can_delete_after_first_space() { 341 | let this_tidy_request = TidyRequest { 342 | list: make_lists().1, 343 | should_delete_after_first_delimiter: Some(' '), 344 | ..Default::default() 345 | }; 346 | let new_list = tidy_list(this_tidy_request); 347 | assert!(new_list.contains(&"19-6-8".to_string())); 348 | assert!(new_list.contains(&"13910".to_string())); 349 | assert!(new_list.contains(&"post-modern".to_string())); 350 | assert!(new_list.contains(&"comma,203478".to_string())); 351 | } 352 | #[test] 353 | fn can_delete_after_first_comma() { 354 | let this_tidy_request = TidyRequest { 355 | list: make_lists().1, 356 | should_delete_after_first_delimiter: Some(','), 357 | ..Default::default() 358 | }; 359 | let new_list = tidy_list(this_tidy_request); 360 | assert!(new_list.contains(&"comma".to_string())); 361 | assert!(new_list.contains(&"h as spaces".to_string())); 362 | } 363 | 364 | #[test] 365 | fn can_lowercase_words() { 366 | let this_tidy_request = TidyRequest { 367 | list: make_lists().0, 368 | to_lowercase: true, 369 | ..Default::default() 370 | }; 371 | let new_list = tidy_list(this_tidy_request); 372 | assert!(new_list.contains(&"charlie".to_string())); 373 | let this_tidy_request = TidyRequest { 374 | list: make_lists().1, 375 | to_lowercase: true, 376 | ..Default::default() 377 | }; 378 | let new_list = tidy_list(this_tidy_request); 379 | assert!(new_list.contains(&"vacation".to_string())); 380 | assert!(new_list.contains(&"ardor".to_string())); 381 | } 382 | 383 | #[test] 384 | fn can_remove_prefix_words() { 385 | let this_tidy_request = TidyRequest { 386 | list: make_lists().0, 387 | should_remove_prefix_words: true, 388 | ..Default::default() 389 | }; 390 | let new_list = tidy_list(this_tidy_request); 391 | assert!(!new_list.contains(&"station".to_string())); 392 | assert!(new_list.contains(&"stationary".to_string())); 393 | assert!(!new_list.contains(&"zoo".to_string())); 394 | assert!(new_list.contains(&"zookeeper".to_string())); 395 | assert!(new_list.contains(&"apple".to_string())); 396 | } 397 | 398 | #[test] 399 | fn can_remove_a_prefix_word_that_has_accents() { 400 | let this_tidy_request = TidyRequest { 401 | list: make_lists().0, 402 | should_remove_prefix_words: true, 403 | ..Default::default() 404 | }; 405 | let new_list = tidy_list(this_tidy_request); 406 | assert!(!new_list.contains(&"séc".to_string())); 407 | assert!(new_list.contains(&"sécréter".to_string())); 408 | } 409 | 410 | #[test] 411 | fn can_remove_suffix_words() { 412 | let this_tidy_request = TidyRequest { 413 | list: make_lists().0, 414 | 
should_remove_suffix_words: true, 415 | ..Default::default() 416 | }; 417 | let new_list = tidy_list(this_tidy_request); 418 | assert!(!new_list.contains(&"keeper".to_string())); 419 | assert!(new_list.contains(&"apple".to_string())); 420 | } 421 | 422 | #[test] 423 | fn can_remove_words_with_nonalphanumeric_characters() { 424 | let this_tidy_request = TidyRequest { 425 | list: make_lists().1, 426 | should_remove_nonalphanumeric: true, 427 | ..Default::default() 428 | }; 429 | let new_list = tidy_list(this_tidy_request); 430 | assert!(!new_list.contains(&"19-6-8 clad".to_string())); 431 | assert!(new_list.contains(&"world999".to_string())); 432 | assert!(new_list.contains(&"take".to_string())); 433 | } 434 | 435 | #[test] 436 | fn can_remove_words_with_nonalphabetic_characters() { 437 | let this_tidy_request = TidyRequest { 438 | list: make_lists().1, 439 | should_remove_nonalphabetic: true, 440 | ..Default::default() 441 | }; 442 | let new_list = tidy_list(this_tidy_request); 443 | assert!(!new_list.contains(&"19-6-8 clad".to_string())); 444 | assert!(!new_list.contains(&"world999".to_string())); 445 | assert!(!new_list.contains(&"world".to_string())); 446 | assert!(!new_list.contains(&"post-modern".to_string())); 447 | assert!(!new_list.contains(&"postmodern".to_string())); 448 | assert!(new_list.contains(&"take".to_string())); 449 | assert!(new_list.contains(&"wizard".to_string())); 450 | assert!(new_list.contains(&"vAcation".to_string())); 451 | } 452 | 453 | #[test] 454 | fn can_remove_words_with_integers() { 455 | let this_tidy_request = TidyRequest { 456 | list: make_lists().1, 457 | should_remove_integers: true, 458 | ..Default::default() 459 | }; 460 | let new_list = tidy_list(this_tidy_request); 461 | assert!(!new_list.contains(&"19-6-8 clad".to_string())); 462 | assert!(!new_list.contains(&"world999".to_string())); 463 | assert!(new_list.contains(&"be".to_string())); 464 | assert!(new_list.contains(&"I".to_string())); 465 | } 466 | 467 | #[test] 468 | fn can_remove_words_shorter_than_a_specified_minimum_length() { 469 | let this_tidy_request = TidyRequest { 470 | list: make_lists().1, 471 | minimum_length: Some(3), 472 | ..Default::default() 473 | }; 474 | let new_list = tidy_list(this_tidy_request); 475 | assert!(!new_list.contains(&"I".to_string())); 476 | assert!(!new_list.contains(&"be".to_string())); 477 | assert!(new_list.contains(&"tea".to_string())); 478 | assert!(new_list.contains(&"mistake".to_string())); 479 | } 480 | 481 | #[test] 482 | fn can_remove_words_outside_a_specified_minimum_and_maximum_length() { 483 | let this_tidy_request = TidyRequest { 484 | list: make_lists().0, 485 | minimum_length: Some(4), 486 | maximum_length: Some(7), 487 | ..Default::default() 488 | }; 489 | let new_list = tidy_list(this_tidy_request); 490 | assert!(!new_list.contains(&"addiction".to_string())); 491 | assert!(!new_list.contains(&"zookeeper".to_string())); 492 | assert!(!new_list.contains(&"stationary".to_string())); 493 | assert!(!new_list.contains(&"its".to_string())); 494 | assert!(!new_list.contains(&"its456".to_string())); 495 | assert!(!new_list.contains(&"tea".to_string())); 496 | assert!(new_list.contains(&"station".to_string())); 497 | } 498 | 499 | #[test] 500 | fn can_remove_words_longer_than_a_specified_maximum_length_after_deleting_integers() { 501 | let this_tidy_request = TidyRequest { 502 | list: make_lists().1, 503 | should_delete_integers: true, 504 | maximum_length: Some(7), 505 | ..Default::default() 506 | }; 507 | let new_list =
tidy_list(this_tidy_request); 508 | assert!(new_list.contains(&"active".to_string())); 509 | assert!(new_list.contains(&"acclaim".to_string())); 510 | assert!(!new_list.contains(&"word with spaces in it".to_string())); 511 | } 512 | 513 | #[test] 514 | fn can_guarantee_a_maximum_length_of_shared_prefix_for_autocomplete() { 515 | let this_tidy_request = TidyRequest { 516 | list: make_lists().0, 517 | maximum_shared_prefix_length: Some(3), 518 | ..Default::default() 519 | }; 520 | let new_list = tidy_list(this_tidy_request); 521 | assert!(new_list.contains(&"zoo".to_string())); 522 | assert!(!new_list.contains(&"zookeeper".to_string())); 523 | assert!(new_list.contains(&"station".to_string())); 524 | assert!(!new_list.contains(&"stationary".to_string())); 525 | assert!(new_list.contains(&"app".to_string())); 526 | assert!(!new_list.contains(&"apple".to_string())); 527 | } 528 | 529 | #[test] 530 | fn can_remove_reject_words() { 531 | let words_to_reject: Vec<String> = vec!["mistake", "carnival"] 532 | .iter() 533 | .map(|x| x.to_string()) 534 | .collect(); 535 | 536 | let this_tidy_request = TidyRequest { 537 | list: make_lists().1, 538 | reject_list: Some(words_to_reject), 539 | to_lowercase: true, 540 | ..Default::default() 541 | }; 542 | let new_list = tidy_list(this_tidy_request); 543 | assert!(!new_list.contains(&"mistake".to_string())); 544 | assert!(!new_list.contains(&"carnival".to_string())); 545 | assert!(new_list.contains(&"wizard".to_string())); 546 | } 547 | 548 | #[test] 549 | fn can_remove_all_words_not_on_approved_list() { 550 | let approved_words: Vec<String> = vec!["take", "vAcation", "airplane"] 551 | .iter() 552 | .map(|x| x.to_string()) 553 | .collect(); 554 | 555 | let this_tidy_request = TidyRequest { 556 | list: make_lists().1, 557 | approved_list: Some(approved_words), 558 | ..Default::default() 559 | }; 560 | let new_list = tidy_list(this_tidy_request); 561 | assert!(new_list.contains(&"take".to_string())); 562 | assert!(new_list.contains(&"vAcation".to_string())); 563 | assert!(!new_list.contains(&"carnival".to_string())); 564 | assert!(!new_list.contains(&"wizard".to_string())); 565 | assert!(!new_list.contains(&"airplane".to_string())); 566 | } 567 | 568 | #[test] 569 | fn can_remove_specified_homophones() { 570 | let homophone1 = ("be".to_string(), "bee".to_string()); 571 | let homophone2 = ("right".to_string(), "write".to_string()); 572 | let homophone3 = ("tea".to_string(), "tee".to_string()); 573 | let this_tidy_request = TidyRequest { 574 | list: make_lists().1, 575 | homophones_list: Some(vec![homophone1, homophone2, homophone3]), 576 | to_lowercase: true, 577 | ..Default::default() 578 | }; 579 | let new_list = tidy_list(this_tidy_request); 580 | assert!(new_list.contains(&"tea".to_string())); 581 | assert!(!new_list.contains(&"tee".to_string())); 582 | assert!(new_list.contains(&"be".to_string())); 583 | assert!(!new_list.contains(&"bee".to_string())); 584 | assert!(new_list.contains(&"mistake".to_string())); 585 | } 586 | 587 | #[test] 588 | fn can_sort_accented_and_capitalized_letters_properly() { 589 | let this_tidy_request = TidyRequest { 590 | list: make_lists().3, 591 | sort_alphabetically: true, 592 | locale: "es-ES".to_string(), 593 | normalization_form: Some("nfkd".to_string()), 594 | ..Default::default() 595 | }; 596 | let new_list = tidy_list(this_tidy_request); 597 | 598 | let how_list_should_be_sorted: Vec<String> = vec![ 599 | "abbey", 600 | "anaconda", 601 | "antena", 602 | "año", 603 | "aptitude", 604 | "eager", 605 | &normalize_unicode("énigme",
"nfkd").unwrap(), 606 | "enlever", 607 | "ezra", 608 | "Zambia", 609 | "zoo", 610 | ] 611 | .iter() 612 | .map(|word| word.to_string()) 613 | .collect(); 614 | assert_eq!(new_list, how_list_should_be_sorted); 615 | } 616 | 617 | // this is really a WORD manipulation, so maybe should be in a 618 | // different test file 619 | use tidy::list_manipulations::normalize_unicode; 620 | #[test] 621 | fn can_normalize_unicode_in_a_given_word() { 622 | let word_with_combined_accents = "sécréter"; 623 | let word_with_two_char_accents = "sécréter"; 624 | assert_eq!( 625 | word_with_combined_accents, 626 | normalize_unicode(word_with_combined_accents, "nfc").unwrap() 627 | ); 628 | assert_eq!( 629 | word_with_combined_accents, 630 | normalize_unicode(word_with_combined_accents, "nfkc").unwrap() 631 | ); 632 | assert_eq!( 633 | word_with_two_char_accents, 634 | normalize_unicode(word_with_two_char_accents, "nfd").unwrap() 635 | ); 636 | assert_eq!( 637 | word_with_two_char_accents, 638 | normalize_unicode(word_with_two_char_accents, "nfkd").unwrap() 639 | ); 640 | } 641 | #[test] 642 | fn can_accurately_count_characters() { 643 | let normal_word = "normal"; 644 | assert_eq!(count_characters(normal_word), 6); 645 | 646 | // These two words below seem the same, don't they? 647 | let word_with_combined_accents = "sécréter"; 648 | let word_with_two_char_accents = "sécréter"; 649 | 650 | // Oh, you sweet summer child... 651 | assert_ne!( 652 | word_with_combined_accents.chars().count(), 653 | word_with_two_char_accents.chars().count() 654 | ); 655 | // Hence, my count_characters function, which normalizes 656 | // Unicopde via NFC before counting the length of given string slice 657 | // I chose NFC because it seems to be closest to how human read/count 658 | // letters (e.g. and accented e always counts as 1 character). 
659 | assert_eq!(count_characters(word_with_combined_accents), 8); 660 | assert_eq!(count_characters(word_with_two_char_accents), 8); 661 | 662 | let emojis = "😀😃😄😁😆"; 663 | assert_eq!(count_characters(emojis), 5); 664 | } 665 | 666 | #[test] 667 | fn can_accurately_count_characters_of_nfc_and_nfkd_normalized_words() { 668 | let word_with_combined_accents = "sécréter"; 669 | let word_with_two_char_accents = "sécréter"; 670 | assert_eq!( 671 | normalize_unicode(word_with_combined_accents, "nfc") 672 | .unwrap() 673 | .chars() 674 | .count(), 675 | normalize_unicode(word_with_two_char_accents, "nfc") 676 | .unwrap() 677 | .chars() 678 | .count() 679 | ); 680 | assert_eq!( 681 | normalize_unicode(word_with_combined_accents, "nfkd") 682 | .unwrap() 683 | .chars() 684 | .count(), 685 | normalize_unicode(word_with_two_char_accents, "nfkd") 686 | .unwrap() 687 | .chars() 688 | .count() 689 | ); 690 | } 691 | 692 | #[test] 693 | fn can_reverse_list() { 694 | let list = vec![ 695 | "hotdog".to_string(), 696 | "hamburger".to_string(), 697 | "alligator".to_string(), 698 | "😀😁😆".to_string(), 699 | ]; 700 | let rev_list = reverse_all_words(&list); 701 | assert_eq!(rev_list, ["godtoh", "regrubmah", "rotagilla", "😆😁😀"]); 702 | } 703 | // Note: print_as_dice's final bool appears to toggle between 1-indexed die faces (false) and zero-based digits, with letters above 9 (true); the zero-padding width is derived from the list length. 704 | #[test] 705 | fn can_print_dice_rolls_of_base_6() { 706 | assert_eq!(print_as_dice(0, 6, 7776, false), "11111".to_string()); 707 | assert_eq!(print_as_dice(7775, 6, 7776, false), "66666".to_string()); 708 | assert_eq!(print_as_dice(2548, 6, 7776, false), "26555".to_string()); 709 | assert_eq!(print_as_dice(2548, 6, 7000, false), "26555".to_string()); 710 | } 711 | #[test] 712 | fn can_print_dice_rolls_of_base_2() { 713 | assert_eq!(print_as_dice(1, 2, 7776, true), "0000000000001".to_string()); 714 | assert_eq!(print_as_dice(127, 2, 128, true), "1111111".to_string()); 715 | } 716 | #[test] 717 | fn can_print_dice_rolls_of_base_20() { 718 | assert_eq!(print_as_dice(1000, 20, 8000, false), "03-11-01".to_string()); 719 | assert_eq!(print_as_dice(1000, 20, 8000, true), "2A0".to_string()); 720 | } 721 | } 722 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "anstream" 7 | version = "0.6.21" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" 10 | dependencies = [ 11 | "anstyle", 12 | "anstyle-parse", 13 | "anstyle-query", 14 | "anstyle-wincon", 15 | "colorchoice", 16 | "is_terminal_polyfill", 17 | "utf8parse", 18 | ] 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.13" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" 25 | 26 | [[package]] 27 | name = "anstyle-parse" 28 | version = "0.2.7" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" 31 | dependencies = [ 32 | "utf8parse", 33 | ] 34 | 35 | [[package]] 36 | name = "anstyle-query" 37 | version = "1.1.5" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" 40 | dependencies = [ 41 | "windows-sys", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-wincon" 46 | version = "3.0.11" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" 49 | dependencies = [ 50 | "anstyle", 51 | "once_cell_polyfill", 52 | "windows-sys", 53 | ] 54 | 55 | [[package]] 56 | name = "autocfg" 57 | version = "1.5.0" 58 | source = "registry+https://github.com/rust-lang/crates.io-index" 59 | checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" 60 | 61 | [[package]] 62 | name = "calendrical_calculations" 63 | version = "0.2.3" 64 | source = "registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "3a0b39595c6ee54a8d0900204ba4c401d0ab4eb45adaf07178e8d017541529e7" 66 | dependencies = [ 67 | "core_maths", 68 | "displaydoc", 69 | ] 70 | 71 | [[package]] 72 | name = "cfg-if" 73 | version = "1.0.4" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" 76 | 77 | [[package]] 78 | name = "clap" 79 | version = "4.5.53" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" 82 | dependencies = [ 83 | "clap_builder", 84 | "clap_derive", 85 | ] 86 | 87 | [[package]] 88 | name = "clap_builder" 89 | version = "4.5.53" 90 | source = "registry+https://github.com/rust-lang/crates.io-index" 91 | checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" 92 | dependencies = [ 93 | "anstream", 94 | "anstyle", 95 | "clap_lex", 96 | "strsim", 97 | ] 98 | 99 | [[package]] 100 | name = "clap_derive" 101 | version = "4.5.49" 102 | source = "registry+https://github.com/rust-lang/crates.io-index" 103 | checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" 104 | dependencies = [ 105 | "heck", 106 | "proc-macro2", 107 | "quote", 108 | "syn", 109 | ] 110 | 111 | [[package]] 112 | name = "clap_lex" 113 | version = "0.7.6" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" 116 | 117 | [[package]] 118 | name = "colorchoice" 119 | version = "1.0.4" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | 
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 122 | 123 | [[package]] 124 | name = "core_maths" 125 | version = "0.1.1" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "77745e017f5edba1a9c1d854f6f3a52dac8a12dd5af5d2f54aecf61e43d80d30" 128 | dependencies = [ 129 | "libm", 130 | ] 131 | 132 | [[package]] 133 | name = "displaydoc" 134 | version = "0.2.5" 135 | source = "registry+https://github.com/rust-lang/crates.io-index" 136 | checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" 137 | dependencies = [ 138 | "proc-macro2", 139 | "quote", 140 | "syn", 141 | ] 142 | 143 | [[package]] 144 | name = "either" 145 | version = "1.15.0" 146 | source = "registry+https://github.com/rust-lang/crates.io-index" 147 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 148 | 149 | [[package]] 150 | name = "fixed_decimal" 151 | version = "0.7.1" 152 | source = "registry+https://github.com/rust-lang/crates.io-index" 153 | checksum = "35eabf480f94d69182677e37571d3be065822acfafd12f2f085db44fbbcc8e57" 154 | dependencies = [ 155 | "displaydoc", 156 | "smallvec", 157 | "writeable", 158 | ] 159 | 160 | [[package]] 161 | name = "getrandom" 162 | version = "0.3.4" 163 | source = "registry+https://github.com/rust-lang/crates.io-index" 164 | checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" 165 | dependencies = [ 166 | "cfg-if", 167 | "libc", 168 | "r-efi", 169 | "wasip2", 170 | ] 171 | 172 | [[package]] 173 | name = "heck" 174 | version = "0.5.0" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 177 | 178 | [[package]] 179 | name = "icu" 180 | version = "2.1.1" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "67ab713dd86fa032cb5487f9ac3a85d47b5dcf4c7b8c7dd00210b3cadd6a6551" 183 | dependencies = [ 184 | "icu_calendar", 185 | "icu_casemap", 186 | "icu_collator", 187 | "icu_collections", 188 | "icu_datetime", 189 | "icu_decimal", 190 | "icu_experimental", 191 | "icu_list", 192 | "icu_locale", 193 | "icu_normalizer", 194 | "icu_pattern", 195 | "icu_plurals", 196 | "icu_properties", 197 | "icu_provider", 198 | "icu_segmenter", 199 | "icu_time", 200 | ] 201 | 202 | [[package]] 203 | name = "icu_calendar" 204 | version = "2.1.1" 205 | source = "registry+https://github.com/rust-lang/crates.io-index" 206 | checksum = "d6f0e52e009b6b16ba9c0693578796f2dd4aaa59a7f8f920423706714a89ac4e" 207 | dependencies = [ 208 | "calendrical_calculations", 209 | "displaydoc", 210 | "icu_calendar_data", 211 | "icu_locale", 212 | "icu_locale_core", 213 | "icu_provider", 214 | "ixdtf", 215 | "serde", 216 | "tinystr", 217 | "zerovec", 218 | ] 219 | 220 | [[package]] 221 | name = "icu_calendar_data" 222 | version = "2.1.1" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "527f04223b17edfe0bd43baf14a0cb1b017830db65f3950dc00224860a9a446d" 225 | 226 | [[package]] 227 | name = "icu_casemap" 228 | version = "2.1.1" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "d4ca9983e8bf51223c2f89014fa4eaa9e9b336c47f3af0d000538f86f841fba1" 231 | dependencies = [ 232 | "icu_casemap_data", 233 | "icu_collections", 234 | "icu_locale_core", 235 | "icu_properties", 236 | "icu_provider", 237 | "potential_utf", 238 | "writeable", 239 | "zerovec", 240 | ] 241 | 242 | [[package]] 243 | name = 
"icu_casemap_data" 244 | version = "2.1.1" 245 | source = "registry+https://github.com/rust-lang/crates.io-index" 246 | checksum = "98d4663d0f99b301033a19e0acf94e9d2fa4b107638580165e5a6ccc49ad1450" 247 | 248 | [[package]] 249 | name = "icu_collator" 250 | version = "2.1.1" 251 | source = "registry+https://github.com/rust-lang/crates.io-index" 252 | checksum = "32eed11a5572f1088b63fa21dc2e70d4a865e5739fc2d10abc05be93bae97019" 253 | dependencies = [ 254 | "icu_collator_data", 255 | "icu_collections", 256 | "icu_locale", 257 | "icu_locale_core", 258 | "icu_normalizer", 259 | "icu_properties", 260 | "icu_provider", 261 | "smallvec", 262 | "utf16_iter", 263 | "utf8_iter", 264 | "zerovec", 265 | ] 266 | 267 | [[package]] 268 | name = "icu_collator_data" 269 | version = "2.1.1" 270 | source = "registry+https://github.com/rust-lang/crates.io-index" 271 | checksum = "5ab06f0e83a613efddba3e4913e00e43ed4001fae651cb7d40fc7e66b83b6fb9" 272 | 273 | [[package]] 274 | name = "icu_collections" 275 | version = "2.1.1" 276 | source = "registry+https://github.com/rust-lang/crates.io-index" 277 | checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" 278 | dependencies = [ 279 | "displaydoc", 280 | "potential_utf", 281 | "serde", 282 | "yoke", 283 | "zerofrom", 284 | "zerovec", 285 | ] 286 | 287 | [[package]] 288 | name = "icu_datetime" 289 | version = "2.1.1" 290 | source = "registry+https://github.com/rust-lang/crates.io-index" 291 | checksum = "1b9d49f41ded8e63761b6b4c3120dfdc289415a1ed10107db6198eb311057ca5" 292 | dependencies = [ 293 | "displaydoc", 294 | "fixed_decimal", 295 | "icu_calendar", 296 | "icu_datetime_data", 297 | "icu_decimal", 298 | "icu_locale", 299 | "icu_locale_core", 300 | "icu_pattern", 301 | "icu_plurals", 302 | "icu_provider", 303 | "icu_time", 304 | "potential_utf", 305 | "tinystr", 306 | "writeable", 307 | "zerovec", 308 | ] 309 | 310 | [[package]] 311 | name = "icu_datetime_data" 312 | version = "2.1.1" 313 | source = "registry+https://github.com/rust-lang/crates.io-index" 314 | checksum = "0bf2a384725c67fcd32d27737bc7ba9dc5fe21311dfe3ba530f4b4d53e72bacc" 315 | 316 | [[package]] 317 | name = "icu_decimal" 318 | version = "2.1.1" 319 | source = "registry+https://github.com/rust-lang/crates.io-index" 320 | checksum = "a38c52231bc348f9b982c1868a2af3195199623007ba2c7650f432038f5b3e8e" 321 | dependencies = [ 322 | "fixed_decimal", 323 | "icu_decimal_data", 324 | "icu_locale", 325 | "icu_locale_core", 326 | "icu_provider", 327 | "serde", 328 | "writeable", 329 | "zerovec", 330 | ] 331 | 332 | [[package]] 333 | name = "icu_decimal_data" 334 | version = "2.1.1" 335 | source = "registry+https://github.com/rust-lang/crates.io-index" 336 | checksum = "2905b4044eab2dd848fe84199f9195567b63ab3a93094711501363f63546fef7" 337 | 338 | [[package]] 339 | name = "icu_experimental" 340 | version = "0.4.0" 341 | source = "registry+https://github.com/rust-lang/crates.io-index" 342 | checksum = "f4ffa4d60b9cb8b024082afaf9e94d853184e483ec69322c74dc437bf8a882a5" 343 | dependencies = [ 344 | "displaydoc", 345 | "either", 346 | "fixed_decimal", 347 | "icu_casemap", 348 | "icu_collections", 349 | "icu_decimal", 350 | "icu_experimental_data", 351 | "icu_list", 352 | "icu_locale", 353 | "icu_locale_core", 354 | "icu_normalizer", 355 | "icu_pattern", 356 | "icu_plurals", 357 | "icu_properties", 358 | "icu_provider", 359 | "litemap", 360 | "num-bigint", 361 | "num-rational", 362 | "num-traits", 363 | "potential_utf", 364 | "smallvec", 365 | "tinystr", 366 | "writeable", 367 | "zerotrie", 368 
| "zerovec", 369 | ] 370 | 371 | [[package]] 372 | name = "icu_experimental_data" 373 | version = "0.4.0" 374 | source = "registry+https://github.com/rust-lang/crates.io-index" 375 | checksum = "2578ea93f0373bb28800f7d1100e7e771c4d248d0d3759250fed08fa27694139" 376 | 377 | [[package]] 378 | name = "icu_list" 379 | version = "2.1.1" 380 | source = "registry+https://github.com/rust-lang/crates.io-index" 381 | checksum = "d3a0b7b126e2fc42777d3c348611553d540bd3683caa39b387c5dd1036bb21a8" 382 | dependencies = [ 383 | "icu_list_data", 384 | "icu_locale", 385 | "icu_provider", 386 | "regex-automata", 387 | "serde", 388 | "writeable", 389 | "zerovec", 390 | ] 391 | 392 | [[package]] 393 | name = "icu_list_data" 394 | version = "2.1.1" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "51044c242fe2a882cc0a464314bbdb9f441556a1cb238fb527fc47355ec2827b" 397 | 398 | [[package]] 399 | name = "icu_locale" 400 | version = "2.1.1" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "532b11722e350ab6bf916ba6eb0efe3ee54b932666afec989465f9243fe6dd60" 403 | dependencies = [ 404 | "icu_collections", 405 | "icu_locale_core", 406 | "icu_locale_data", 407 | "icu_provider", 408 | "potential_utf", 409 | "tinystr", 410 | "zerovec", 411 | ] 412 | 413 | [[package]] 414 | name = "icu_locale_core" 415 | version = "2.1.1" 416 | source = "registry+https://github.com/rust-lang/crates.io-index" 417 | checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" 418 | dependencies = [ 419 | "displaydoc", 420 | "litemap", 421 | "serde", 422 | "tinystr", 423 | "writeable", 424 | "zerovec", 425 | ] 426 | 427 | [[package]] 428 | name = "icu_locale_data" 429 | version = "2.1.1" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "f03e2fcaefecdf05619f3d6f91740e79ab969b4dd54f77cbf546b1d0d28e3147" 432 | 433 | [[package]] 434 | name = "icu_normalizer" 435 | version = "2.1.1" 436 | source = "registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" 438 | dependencies = [ 439 | "icu_collections", 440 | "icu_normalizer_data", 441 | "icu_properties", 442 | "icu_provider", 443 | "smallvec", 444 | "utf16_iter", 445 | "utf8_iter", 446 | "write16", 447 | "zerovec", 448 | ] 449 | 450 | [[package]] 451 | name = "icu_normalizer_data" 452 | version = "2.1.1" 453 | source = "registry+https://github.com/rust-lang/crates.io-index" 454 | checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" 455 | 456 | [[package]] 457 | name = "icu_pattern" 458 | version = "0.4.1" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = "7a7ff8c0ff6f61cdce299dcb54f557b0a251adbc78f6f0c35a21332c452b4a1b" 461 | dependencies = [ 462 | "displaydoc", 463 | "either", 464 | "serde", 465 | "writeable", 466 | "yoke", 467 | "zerovec", 468 | ] 469 | 470 | [[package]] 471 | name = "icu_plurals" 472 | version = "2.1.1" 473 | source = "registry+https://github.com/rust-lang/crates.io-index" 474 | checksum = "4f9cfe49f5b1d1163cc58db451562339916a9ca5cbcaae83924d41a0bf839474" 475 | dependencies = [ 476 | "fixed_decimal", 477 | "icu_locale", 478 | "icu_plurals_data", 479 | "icu_provider", 480 | "zerovec", 481 | ] 482 | 483 | [[package]] 484 | name = "icu_plurals_data" 485 | version = "2.1.1" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = 
"f018a98dccf7f0eb02ba06ac0ff67d102d8ded80734724305e924de304e12ff0" 488 | 489 | [[package]] 490 | name = "icu_properties" 491 | version = "2.1.1" 492 | source = "registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "e93fcd3157766c0c8da2f8cff6ce651a31f0810eaa1c51ec363ef790bbb5fb99" 494 | dependencies = [ 495 | "icu_collections", 496 | "icu_locale_core", 497 | "icu_properties_data", 498 | "icu_provider", 499 | "serde", 500 | "zerotrie", 501 | "zerovec", 502 | ] 503 | 504 | [[package]] 505 | name = "icu_properties_data" 506 | version = "2.1.1" 507 | source = "registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "02845b3647bb045f1100ecd6480ff52f34c35f82d9880e029d329c21d1054899" 509 | 510 | [[package]] 511 | name = "icu_provider" 512 | version = "2.1.1" 513 | source = "registry+https://github.com/rust-lang/crates.io-index" 514 | checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" 515 | dependencies = [ 516 | "displaydoc", 517 | "icu_locale_core", 518 | "serde", 519 | "stable_deref_trait", 520 | "writeable", 521 | "yoke", 522 | "zerofrom", 523 | "zerotrie", 524 | "zerovec", 525 | ] 526 | 527 | [[package]] 528 | name = "icu_segmenter" 529 | version = "2.1.1" 530 | source = "registry+https://github.com/rust-lang/crates.io-index" 531 | checksum = "43da5e7e9b540df15e53ca27f69b50e36e01b652584b40b3335ed65d18303834" 532 | dependencies = [ 533 | "core_maths", 534 | "icu_collections", 535 | "icu_locale", 536 | "icu_provider", 537 | "icu_segmenter_data", 538 | "potential_utf", 539 | "utf8_iter", 540 | "zerovec", 541 | ] 542 | 543 | [[package]] 544 | name = "icu_segmenter_data" 545 | version = "2.1.1" 546 | source = "registry+https://github.com/rust-lang/crates.io-index" 547 | checksum = "6ebbb7321d9e21d25f5660366cb6c08201d0175898a3a6f7a41ee9685af21c80" 548 | 549 | [[package]] 550 | name = "icu_time" 551 | version = "2.1.1" 552 | source = "registry+https://github.com/rust-lang/crates.io-index" 553 | checksum = "8242b00da3b3b6678f731437a11c8833a43c821ae081eca60ba1b7579d45b6d8" 554 | dependencies = [ 555 | "calendrical_calculations", 556 | "displaydoc", 557 | "icu_calendar", 558 | "icu_locale_core", 559 | "icu_provider", 560 | "icu_time_data", 561 | "ixdtf", 562 | "serde", 563 | "zerotrie", 564 | "zerovec", 565 | ] 566 | 567 | [[package]] 568 | name = "icu_time_data" 569 | version = "2.1.1" 570 | source = "registry+https://github.com/rust-lang/crates.io-index" 571 | checksum = "3e10b0e5e87a2c84bd5fa407705732052edebe69291d347d0c3033785470edbf" 572 | 573 | [[package]] 574 | name = "is_terminal_polyfill" 575 | version = "1.70.2" 576 | source = "registry+https://github.com/rust-lang/crates.io-index" 577 | checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" 578 | 579 | [[package]] 580 | name = "itertools" 581 | version = "0.14.0" 582 | source = "registry+https://github.com/rust-lang/crates.io-index" 583 | checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" 584 | dependencies = [ 585 | "either", 586 | ] 587 | 588 | [[package]] 589 | name = "itoa" 590 | version = "1.0.15" 591 | source = "registry+https://github.com/rust-lang/crates.io-index" 592 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 593 | 594 | [[package]] 595 | name = "ixdtf" 596 | version = "0.6.4" 597 | source = "registry+https://github.com/rust-lang/crates.io-index" 598 | checksum = "84de9d95a6d2547d9b77ee3f25fa0ee32e3c3a6484d47a55adebc0439c077992" 599 | 600 | [[package]] 601 | name = "libc" 602 | version = 
"0.2.178" 603 | source = "registry+https://github.com/rust-lang/crates.io-index" 604 | checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" 605 | 606 | [[package]] 607 | name = "libm" 608 | version = "0.2.15" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" 611 | 612 | [[package]] 613 | name = "litemap" 614 | version = "0.8.1" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" 617 | 618 | [[package]] 619 | name = "memchr" 620 | version = "2.7.6" 621 | source = "registry+https://github.com/rust-lang/crates.io-index" 622 | checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" 623 | 624 | [[package]] 625 | name = "num-bigint" 626 | version = "0.4.6" 627 | source = "registry+https://github.com/rust-lang/crates.io-index" 628 | checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" 629 | dependencies = [ 630 | "num-integer", 631 | "num-traits", 632 | ] 633 | 634 | [[package]] 635 | name = "num-integer" 636 | version = "0.1.46" 637 | source = "registry+https://github.com/rust-lang/crates.io-index" 638 | checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" 639 | dependencies = [ 640 | "num-traits", 641 | ] 642 | 643 | [[package]] 644 | name = "num-rational" 645 | version = "0.4.2" 646 | source = "registry+https://github.com/rust-lang/crates.io-index" 647 | checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" 648 | dependencies = [ 649 | "num-bigint", 650 | "num-integer", 651 | "num-traits", 652 | ] 653 | 654 | [[package]] 655 | name = "num-traits" 656 | version = "0.2.19" 657 | source = "registry+https://github.com/rust-lang/crates.io-index" 658 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 659 | dependencies = [ 660 | "autocfg", 661 | ] 662 | 663 | [[package]] 664 | name = "once_cell_polyfill" 665 | version = "1.70.2" 666 | source = "registry+https://github.com/rust-lang/crates.io-index" 667 | checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" 668 | 669 | [[package]] 670 | name = "potential_utf" 671 | version = "0.1.4" 672 | source = "registry+https://github.com/rust-lang/crates.io-index" 673 | checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" 674 | dependencies = [ 675 | "serde_core", 676 | "writeable", 677 | "zerovec", 678 | ] 679 | 680 | [[package]] 681 | name = "ppv-lite86" 682 | version = "0.2.21" 683 | source = "registry+https://github.com/rust-lang/crates.io-index" 684 | checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" 685 | dependencies = [ 686 | "zerocopy", 687 | ] 688 | 689 | [[package]] 690 | name = "proc-macro2" 691 | version = "1.0.103" 692 | source = "registry+https://github.com/rust-lang/crates.io-index" 693 | checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8" 694 | dependencies = [ 695 | "unicode-ident", 696 | ] 697 | 698 | [[package]] 699 | name = "quote" 700 | version = "1.0.42" 701 | source = "registry+https://github.com/rust-lang/crates.io-index" 702 | checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f" 703 | dependencies = [ 704 | "proc-macro2", 705 | ] 706 | 707 | [[package]] 708 | name = "r-efi" 709 | version = "5.3.0" 710 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 711 | checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" 712 | 713 | [[package]] 714 | name = "radix_fmt" 715 | version = "1.0.0" 716 | source = "registry+https://github.com/rust-lang/crates.io-index" 717 | checksum = "ce082a9940a7ace2ad4a8b7d0b1eac6aa378895f18be598230c5f2284ac05426" 718 | 719 | [[package]] 720 | name = "rand" 721 | version = "0.9.2" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" 724 | dependencies = [ 725 | "rand_chacha", 726 | "rand_core", 727 | ] 728 | 729 | [[package]] 730 | name = "rand_chacha" 731 | version = "0.9.0" 732 | source = "registry+https://github.com/rust-lang/crates.io-index" 733 | checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" 734 | dependencies = [ 735 | "ppv-lite86", 736 | "rand_core", 737 | ] 738 | 739 | [[package]] 740 | name = "rand_core" 741 | version = "0.9.3" 742 | source = "registry+https://github.com/rust-lang/crates.io-index" 743 | checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" 744 | dependencies = [ 745 | "getrandom", 746 | ] 747 | 748 | [[package]] 749 | name = "regex-automata" 750 | version = "0.4.13" 751 | source = "registry+https://github.com/rust-lang/crates.io-index" 752 | checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" 753 | 754 | [[package]] 755 | name = "ryu" 756 | version = "1.0.20" 757 | source = "registry+https://github.com/rust-lang/crates.io-index" 758 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 759 | 760 | [[package]] 761 | name = "serde" 762 | version = "1.0.228" 763 | source = "registry+https://github.com/rust-lang/crates.io-index" 764 | checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" 765 | dependencies = [ 766 | "serde_core", 767 | "serde_derive", 768 | ] 769 | 770 | [[package]] 771 | name = "serde_core" 772 | version = "1.0.228" 773 | source = "registry+https://github.com/rust-lang/crates.io-index" 774 | checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" 775 | dependencies = [ 776 | "serde_derive", 777 | ] 778 | 779 | [[package]] 780 | name = "serde_derive" 781 | version = "1.0.228" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" 784 | dependencies = [ 785 | "proc-macro2", 786 | "quote", 787 | "syn", 788 | ] 789 | 790 | [[package]] 791 | name = "serde_json" 792 | version = "1.0.145" 793 | source = "registry+https://github.com/rust-lang/crates.io-index" 794 | checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" 795 | dependencies = [ 796 | "itoa", 797 | "memchr", 798 | "ryu", 799 | "serde", 800 | "serde_core", 801 | ] 802 | 803 | [[package]] 804 | name = "smallvec" 805 | version = "1.15.1" 806 | source = "registry+https://github.com/rust-lang/crates.io-index" 807 | checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" 808 | 809 | [[package]] 810 | name = "stable_deref_trait" 811 | version = "1.2.1" 812 | source = "registry+https://github.com/rust-lang/crates.io-index" 813 | checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" 814 | 815 | [[package]] 816 | name = "strsim" 817 | version = "0.11.1" 818 | source = "registry+https://github.com/rust-lang/crates.io-index" 819 | checksum 
= "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 820 | 821 | [[package]] 822 | name = "syn" 823 | version = "2.0.111" 824 | source = "registry+https://github.com/rust-lang/crates.io-index" 825 | checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" 826 | dependencies = [ 827 | "proc-macro2", 828 | "quote", 829 | "unicode-ident", 830 | ] 831 | 832 | [[package]] 833 | name = "synstructure" 834 | version = "0.13.2" 835 | source = "registry+https://github.com/rust-lang/crates.io-index" 836 | checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" 837 | dependencies = [ 838 | "proc-macro2", 839 | "quote", 840 | "syn", 841 | ] 842 | 843 | [[package]] 844 | name = "tidy" 845 | version = "0.3.17" 846 | dependencies = [ 847 | "clap", 848 | "icu", 849 | "itertools", 850 | "memchr", 851 | "radix_fmt", 852 | "rand", 853 | "serde", 854 | "serde_json", 855 | "unicode-normalization", 856 | "unicode-segmentation", 857 | ] 858 | 859 | [[package]] 860 | name = "tinystr" 861 | version = "0.8.2" 862 | source = "registry+https://github.com/rust-lang/crates.io-index" 863 | checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" 864 | dependencies = [ 865 | "displaydoc", 866 | "serde_core", 867 | "zerovec", 868 | ] 869 | 870 | [[package]] 871 | name = "tinyvec" 872 | version = "1.10.0" 873 | source = "registry+https://github.com/rust-lang/crates.io-index" 874 | checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" 875 | dependencies = [ 876 | "tinyvec_macros", 877 | ] 878 | 879 | [[package]] 880 | name = "tinyvec_macros" 881 | version = "0.1.1" 882 | source = "registry+https://github.com/rust-lang/crates.io-index" 883 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 884 | 885 | [[package]] 886 | name = "unicode-ident" 887 | version = "1.0.22" 888 | source = "registry+https://github.com/rust-lang/crates.io-index" 889 | checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" 890 | 891 | [[package]] 892 | name = "unicode-normalization" 893 | version = "0.1.25" 894 | source = "registry+https://github.com/rust-lang/crates.io-index" 895 | checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" 896 | dependencies = [ 897 | "tinyvec", 898 | ] 899 | 900 | [[package]] 901 | name = "unicode-segmentation" 902 | version = "1.12.0" 903 | source = "registry+https://github.com/rust-lang/crates.io-index" 904 | checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" 905 | 906 | [[package]] 907 | name = "utf16_iter" 908 | version = "1.0.5" 909 | source = "registry+https://github.com/rust-lang/crates.io-index" 910 | checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" 911 | 912 | [[package]] 913 | name = "utf8_iter" 914 | version = "1.0.4" 915 | source = "registry+https://github.com/rust-lang/crates.io-index" 916 | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" 917 | 918 | [[package]] 919 | name = "utf8parse" 920 | version = "0.2.2" 921 | source = "registry+https://github.com/rust-lang/crates.io-index" 922 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 923 | 924 | [[package]] 925 | name = "wasip2" 926 | version = "1.0.1+wasi-0.2.4" 927 | source = "registry+https://github.com/rust-lang/crates.io-index" 928 | checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" 929 | dependencies = [ 930 | "wit-bindgen", 931 | ] 932 
| 933 | [[package]] 934 | name = "windows-link" 935 | version = "0.2.1" 936 | source = "registry+https://github.com/rust-lang/crates.io-index" 937 | checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 938 | 939 | [[package]] 940 | name = "windows-sys" 941 | version = "0.61.2" 942 | source = "registry+https://github.com/rust-lang/crates.io-index" 943 | checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" 944 | dependencies = [ 945 | "windows-link", 946 | ] 947 | 948 | [[package]] 949 | name = "wit-bindgen" 950 | version = "0.46.0" 951 | source = "registry+https://github.com/rust-lang/crates.io-index" 952 | checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" 953 | 954 | [[package]] 955 | name = "write16" 956 | version = "1.0.0" 957 | source = "registry+https://github.com/rust-lang/crates.io-index" 958 | checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" 959 | 960 | [[package]] 961 | name = "writeable" 962 | version = "0.6.2" 963 | source = "registry+https://github.com/rust-lang/crates.io-index" 964 | checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" 965 | dependencies = [ 966 | "either", 967 | ] 968 | 969 | [[package]] 970 | name = "yoke" 971 | version = "0.8.1" 972 | source = "registry+https://github.com/rust-lang/crates.io-index" 973 | checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" 974 | dependencies = [ 975 | "stable_deref_trait", 976 | "yoke-derive", 977 | "zerofrom", 978 | ] 979 | 980 | [[package]] 981 | name = "yoke-derive" 982 | version = "0.8.1" 983 | source = "registry+https://github.com/rust-lang/crates.io-index" 984 | checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" 985 | dependencies = [ 986 | "proc-macro2", 987 | "quote", 988 | "syn", 989 | "synstructure", 990 | ] 991 | 992 | [[package]] 993 | name = "zerocopy" 994 | version = "0.8.31" 995 | source = "registry+https://github.com/rust-lang/crates.io-index" 996 | checksum = "fd74ec98b9250adb3ca554bdde269adf631549f51d8a8f8f0a10b50f1cb298c3" 997 | dependencies = [ 998 | "zerocopy-derive", 999 | ] 1000 | 1001 | [[package]] 1002 | name = "zerocopy-derive" 1003 | version = "0.8.31" 1004 | source = "registry+https://github.com/rust-lang/crates.io-index" 1005 | checksum = "d8a8d209fdf45cf5138cbb5a506f6b52522a25afccc534d1475dad8e31105c6a" 1006 | dependencies = [ 1007 | "proc-macro2", 1008 | "quote", 1009 | "syn", 1010 | ] 1011 | 1012 | [[package]] 1013 | name = "zerofrom" 1014 | version = "0.1.6" 1015 | source = "registry+https://github.com/rust-lang/crates.io-index" 1016 | checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" 1017 | dependencies = [ 1018 | "zerofrom-derive", 1019 | ] 1020 | 1021 | [[package]] 1022 | name = "zerofrom-derive" 1023 | version = "0.1.6" 1024 | source = "registry+https://github.com/rust-lang/crates.io-index" 1025 | checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" 1026 | dependencies = [ 1027 | "proc-macro2", 1028 | "quote", 1029 | "syn", 1030 | "synstructure", 1031 | ] 1032 | 1033 | [[package]] 1034 | name = "zerotrie" 1035 | version = "0.2.3" 1036 | source = "registry+https://github.com/rust-lang/crates.io-index" 1037 | checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" 1038 | dependencies = [ 1039 | "displaydoc", 1040 | "yoke", 1041 | "zerofrom", 1042 | ] 1043 | 1044 | [[package]] 1045 | name = "zerovec" 1046 | version = "0.11.5" 1047 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 1048 | checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" 1049 | dependencies = [ 1050 | "serde", 1051 | "yoke", 1052 | "zerofrom", 1053 | "zerovec-derive", 1054 | ] 1055 | 1056 | [[package]] 1057 | name = "zerovec-derive" 1058 | version = "0.11.2" 1059 | source = "registry+https://github.com/rust-lang/crates.io-index" 1060 | checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" 1061 | dependencies = [ 1062 | "proc-macro2", 1063 | "quote", 1064 | "syn", 1065 | ] 1066 | --------------------------------------------------------------------------------