├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── feature-request.md │ └── filtering-error.md └── workflows │ └── build.yml ├── .gitignore ├── Cargo.toml ├── LICENSE-MIT ├── Makefile ├── README.md ├── examples ├── advanced.rs ├── analyze.rs └── censor.rs ├── fuzz ├── .gitignore ├── Cargo.toml └── fuzz_targets │ └── fuzz.rs ├── pages ├── .gitignore ├── Cargo.toml ├── Trunk.prod.toml ├── index.html └── src │ └── main.rs └── src ├── banned.rs ├── banned_chars.txt ├── buffer_proxy_iterator.rs ├── censor.rs ├── character_analyzer.rs ├── character_widths.bin ├── context.rs ├── dictionary_blacklist.txt ├── dictionary_common_valid_short.txt ├── dictionary_extra.txt ├── false_positive_finder.rs ├── false_positives.txt ├── feature_cell.rs ├── lib.rs ├── mtch.rs ├── pii.rs ├── profanity.csv ├── replacement_finder.rs ├── replacements.csv ├── replacements.rs ├── replacements_extra.csv ├── safe.txt ├── test_broken.txt ├── test_negative.txt ├── test_positive.txt ├── test_safe.txt ├── trace.rs ├── trie.rs ├── typ.rs ├── unicode_fonts.txt └── width.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [finnbear] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Request a new feature 4 | title: Feature request 5 | labels: feature 6 | assignees: finnbear 7 | 8 | --- 9 | 10 | ### Motivation 11 | 12 | ### Summary 13 | 14 | ### Alternatives 15 | 16 | 17 | 18 | ### Context 19 | 20 | I am using `rustrict` version `X.Y.Z` (if not latest version) 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/filtering-error.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Filtering Error 3 | about: Easily report false positive detections or false negative detections 4 | title: Filtering error (false positive and/or false negative) 5 | labels: bug 6 | assignees: finnbear 7 | 8 | --- 9 | 10 | ### False Positives 11 | The following shouldn't have been detected, but was: 12 | ``` 13 | 14 | ``` 15 | 16 | 17 | 18 | ### False Negatives 19 | The following should have been detected, but wasn't: 20 | ``` 21 | 22 | ``` 23 | 24 | 25 | ### Context 26 | 27 | I am using `rustrict` version `X.Y.Z` (if not latest version) 28 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | concurrency: 16 | group: "pages" 17 | cancel-in-progress: false 18 | 19 | env: 20 | CARGO_TERM_COLOR: always 21 | 22 | jobs: 23 | build: 24 | environment: 25 | name: github-pages 26 | url: ${{ steps.deployment.outputs.page_url }} 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v3 31 | with: 32 | persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token. 33 | fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. 
34 | - name: Install Rust 35 | uses: actions-rs/toolchain@v1 36 | with: 37 | toolchain: nightly 38 | override: true 39 | components: rustfmt, clippy 40 | - name: Download Testing Data 41 | run: curl https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv --output test.csv 42 | - name: Test (context, pii, serde) 43 | run: cargo test --release --features context,pii,serde 44 | - name: Test (context, width) 45 | run: cargo test --release --features context,width 46 | - name: Add wasm32 target 47 | run: rustup target add wasm32-unknown-unknown 48 | - name: Install Trunk 49 | uses: baptiste0928/cargo-install@v2 50 | with: 51 | crate: trunk 52 | version: 0.21.1 53 | - name: Build Pages 54 | run: cd pages && trunk --config Trunk.prod.toml build --release --filehash=false 55 | - name: Setup Pages 56 | uses: actions/configure-pages@v3 57 | - name: Upload artifact 58 | uses: actions/upload-pages-artifact@v3 59 | with: 60 | path: './pages/dist/' 61 | - name: Deploy to GitHub Pages 62 | id: deployment 63 | uses: actions/deploy-pages@v4 64 | fuzz: 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v2 68 | - uses: actions-rs/toolchain@v1 69 | with: 70 | toolchain: nightly 71 | override: true 72 | - name: Install cargo-fuzz 73 | uses: baptiste0928/cargo-install@v3 74 | with: 75 | crate: cargo-fuzz 76 | locked: false 77 | - name: Fuzz 78 | run: RUST_BACKTRACE=1 cargo fuzz run fuzz -- -max_total_time=900 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | 12 | # Downloads (run `make downloads` to get). Only required for testing and false positive finding. 
13 | test.csv 14 | src/dictionary.txt 15 | src/dictionary_common.txt 16 | src/unicode_confusables.txt 17 | 18 | # Downloads not covered under `make downloads` 19 | ttf/ 20 | 21 | .idea 22 | *.iml 23 | .vscode/ -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustrict" 3 | authors = ["Finn Bear"] 4 | version = "0.7.35" 5 | edition = "2021" 6 | license = "MIT OR Apache-2.0" 7 | repository = "https://github.com/finnbear/rustrict/" 8 | description = "rustrict is a profanity filter for Rust" 9 | exclude = ["fuzz/"] 10 | 11 | [lib] 12 | name = "rustrict" 13 | path = "src/lib.rs" 14 | 15 | [[bin]] 16 | name = "false_positive_finder" 17 | path = "src/false_positive_finder.rs" 18 | required-features = ["find_false_positives"] 19 | 20 | [[bin]] 21 | name = "replacement_finder" 22 | path = "src/replacement_finder.rs" 23 | required-features = ["find_replacements"] 24 | 25 | [[bin]] 26 | name = "character_analyzer" 27 | path = "src/character_analyzer.rs" 28 | required-features = ["imageproc", "image", "rusttype", "unicode-width", "walkdir", "rayon"] 29 | 30 | [[bin]] 31 | name = "trace" 32 | path = "src/trace.rs" 33 | required-features = ["trace"] 34 | 35 | [features] 36 | default = ["censor", "context"] 37 | censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"] 38 | context = ["censor", "strsim"] 39 | customize = ["censor"] 40 | width = ["lazy_static", "itertools"] 41 | pii = ["lazy_static", "regex"] 42 | find_false_positives = ["censor", "regex", "indicatif", "rayon"] 43 | find_replacements = ["csv"] 44 | trace = ["censor"] 45 | trace_full = ["trace"] 46 | serde = ["dep:serde", "arrayvec/serde"] 47 | 48 | [package.metadata.docs.rs] 49 | features = ["censor", "context", "customize", "width"] 50 | 51 | [profile.release] 52 | panic = 'abort' 53 | 54 | [dependencies] 55 | arrayvec = {version = "0.7", optional = true} 56 | finl_unicode = "1.2" 57 | unicode-normalization = {version = "0.1.22", optional = true} 58 | unicode-width = {version = "0.1", optional = true} 59 | bitflags = {version = "1.3", optional = true} 60 | lazy_static = {version = "1.4", optional = true} 61 | itertools = {version = "0.10", optional = true} 62 | rustc-hash = {version = "1.1", optional = true} 63 | regex = {version = "1.5", optional = true} 64 | indicatif = {version = "0.17.0-beta.1", optional = true} 65 | rayon = {version = "1.5", optional = true} 66 | doc-comment = "0.3.3" 67 | strsim = {version = "0.10.0", optional = true} 68 | csv = {version="1.1", optional = true} 69 | imageproc = {version = "0.22", optional = true} 70 | rusttype = {version = "0.9", optional = true} 71 | image = {version = "0.23.14", optional = true} 72 | walkdir = {version = "2", optional = true} 73 | serde = {version = "1", features=["derive"], optional = true} 74 | 75 | [dev-dependencies] 76 | rand = "0.8" 77 | csv = "1.1" 78 | censor_crate = { package = "censor", version = "0.3.0" } 79 | rustrict_old = { package = "rustrict", version = "0.7.24" } 80 | serial_test = "0.5" 81 | stfu_crate = { package = "stfu", version = "0.1.0" } 82 | profane_rs_crate = { package = "profane-rs", version = "0.0.4" } 83 | bincode = "1.3.3" 84 | serde_json = "1" 85 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 
2021 Finn Bear 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: fuzz 2 | 3 | all: test 4 | 5 | downloads: 6 | wget -O test.csv https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv 7 | wget -O src/dictionary.txt https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt 8 | wget -O src/dictionary_common.txt https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt 9 | wget -O src/unicode_confusables.txt https://www.unicode.org/Public/security/14.0.0/confusables.txt 10 | # TODO: ttf fonts 11 | 12 | false_positives: 13 | cargo run --bin false_positive_finder --release --features censor,regex,indicatif,rayon,find_false_positives 14 | 15 | replacements: 16 | cargo run --bin replacement_finder --features find_replacements 17 | 18 | widths: 19 | cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width 20 | 21 | test: 22 | cargo test --release --features width,pii,serde -- --nocapture 23 | 24 | compare: 25 | COMPARE=1 make test 26 | 27 | table: 28 | cargo test --release -- accuracy --nocapture 29 | 30 | # Skips accuracy analysis so finishes faster. 31 | test_debug: 32 | cargo test --features pii -- --nocapture 33 | 34 | fuzz: 35 | cargo fuzz run fuzz 36 | 37 | test_customize: 38 | cargo test --release --features customize --no-default-features -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rustrict 2 | 3 | [![Documentation](https://docs.rs/rustrict/badge.svg)](https://docs.rs/rustrict) 4 | [![crates.io](https://img.shields.io/crates/v/rustrict.svg)](https://crates.io/crates/rustrict) 5 | [![Build](https://github.com/finnbear/rustrict/actions/workflows/build.yml/badge.svg)](https://github.com/finnbear/rustrict/actions/workflows/build.yml) 6 | [![Test Page](https://img.shields.io/badge/Test-page-green)](https://finnbear.github.io/rustrict/) 7 | 8 | 9 | `rustrict` is a profanity filter for Rust. 10 | 11 | Disclaimer: Multiple source files (`.txt`, `.csv`, `.rs` test cases) contain profanity. Viewer discretion is advised. 
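To use the crate, add it to your `Cargo.toml`. A minimal dependency declaration is shown below; the `censor` and `context` features are enabled by default, and optional features such as `customize`, `width`, `pii`, and `serde` can be enabled as needed (see the `[features]` table in `Cargo.toml`):

```toml
[dependencies]
# Default features enable censoring and the per-user `context` helper.
rustrict = "0.7"

# Or opt into additional functionality explicitly, for example:
# rustrict = { version = "0.7", features = ["customize", "width", "pii"] }
```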
12 | 13 | ## Features 14 | 15 | - Multiple types (profane, offensive, sexual, mean, spam) 16 | - Multiple levels (mild, moderate, severe) 17 | - Resistant to evasion 18 | - Alternative spellings (like "fck") 19 | - Repeated characters (like "craaaap") 20 | - Confusable characters (like 'ᑭ', '𝕡', and '🅿') 21 | - Spacing (like "c r_a-p") 22 | - Accents (like "pÓöp") 23 | - Bidirectional Unicode ([related reading](https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html)) 24 | - Self-censoring (like "f*ck") 25 | - Safe phrase list for known bad actors 26 | - Censors invalid Unicode characters 27 | - Battle-tested in [Mk48.io](https://mk48.io) 28 | - Resistant to false positives 29 | - One word (like "**ass**assin") 30 | - Two words (like "pu**sh it**") 31 | - Flexible 32 | - Censor and/or analyze 33 | - Input `&str` or `Iterator<Item = char>` 34 | - Can track per-user state with `context` feature 35 | - Can add words with the `customize` feature 36 | - Accurately reports the width of Unicode via the `width` feature 37 | - Plenty of options 38 | - Performant 39 | - O(n) analysis and censoring 40 | - No `regex` (uses custom trie) 41 | - 3 MB/s in `release` mode 42 | - 100 KB/s in `debug` mode 43 | 44 | ## Limitations 45 | 46 | - Mostly English/emoji 47 | - Censoring removes most diacritics (accents) 48 | - Does not detect right-to-left profanity while analyzing, so... 49 | - Censoring forces Unicode to be left-to-right 50 | - Doesn't understand context 51 | - Not resistant to false positives affecting profanities added at runtime 52 | 53 | ## Usage 54 | 55 | ### Strings (`&str`) 56 | ```rust 57 | use rustrict::CensorStr; 58 | 59 | let censored: String = "hello crap".censor(); 60 | let inappropriate: bool = "f u c k".is_inappropriate(); 61 | 62 | assert_eq!(censored, "hello c***"); 63 | assert!(inappropriate); 64 | ``` 65 | 66 | ### Iterators (`Iterator<Item = char>`) 67 | 68 | ```rust 69 | use rustrict::CensorIter; 70 | 71 | let censored: String = "hello crap".chars().censor().collect(); 72 | 73 | assert_eq!(censored, "hello c***"); 74 | ``` 75 | 76 | ### Advanced 77 | 78 | By constructing a `Censor`, one can avoid scanning text multiple times to get a censored `String` and/or 79 | answer multiple `is` queries. This also opens up more customization options (defaults are below). 80 | 81 | ```rust 82 | use rustrict::{Censor, Type}; 83 | 84 | let (censored, analysis) = Censor::from_str("123 Crap") 85 | .with_censor_threshold(Type::INAPPROPRIATE) 86 | .with_censor_first_character_threshold(Type::OFFENSIVE & Type::SEVERE) 87 | .with_ignore_false_positives(false) 88 | .with_ignore_self_censoring(false) 89 | .with_censor_replacement('*') 90 | .censor_and_analyze(); 91 | 92 | assert_eq!(censored, "123 C***"); 93 | assert!(analysis.is(Type::INAPPROPRIATE)); 94 | assert!(analysis.isnt(Type::PROFANE & Type::SEVERE | Type::SEXUAL)); 95 | ``` 96 | 97 | If you cannot afford to let anything slip through, or have reason to believe a particular user 98 | is trying to evade the filter, you can check if their input matches a [short list of safe strings](src/safe.txt): 99 | 100 | ```rust 101 | use rustrict::{CensorStr, Type}; 102 | 103 | // Figure out if a user is trying to evade the filter. 104 | assert!("pron".is(Type::EVASIVE)); 105 | assert!("porn".isnt(Type::EVASIVE)); 106 | 107 | // Only let safe messages through.
108 | assert!("Hello there!".is(Type::SAFE)); 109 | assert!("nice work.".is(Type::SAFE)); 110 | assert!("yes".is(Type::SAFE)); 111 | assert!("NVM".is(Type::SAFE)); 112 | assert!("gtg".is(Type::SAFE)); 113 | assert!("not a common phrase".isnt(Type::SAFE)); 114 | ``` 115 | 116 | If you want to add custom profanities or safe words, enable the `customize` feature. 117 | 118 | ```rust 119 | #[cfg(feature = "customize")] 120 | { 121 | use rustrict::{add_word, CensorStr, Type}; 122 | 123 | // You must take care not to call these when the crate is being 124 | // used in any other way (to avoid concurrent mutation). 125 | unsafe { 126 | add_word("reallyreallybadword", (Type::PROFANE & Type::SEVERE) | Type::MEAN); 127 | add_word("mybrandname", Type::SAFE); 128 | } 129 | 130 | assert!("Reallllllyreallllllybaaaadword".is(Type::PROFANE)); 131 | assert!("MyBrandName".is(Type::SAFE)); 132 | } 133 | ``` 134 | 135 | If your use-case is chat moderation, and you store data on a per-user basis, you can use `rustrict::Context` as a reference implementation: 136 | 137 | ```rust 138 | #[cfg(feature = "context")] 139 | { 140 | use rustrict::{BlockReason, Context}; 141 | use std::time::Duration; 142 | 143 | pub struct User { 144 | context: Context, 145 | } 146 | 147 | let mut bob = User { 148 | context: Context::default() 149 | }; 150 | 151 | // Ok messages go right through. 152 | assert_eq!(bob.context.process(String::from("hello")), Ok(String::from("hello"))); 153 | 154 | // Bad words are censored. 155 | assert_eq!(bob.context.process(String::from("crap")), Ok(String::from("c***"))); 156 | 157 | // Can take user reports (After many reports or inappropriate messages, 158 | // will only let known safe messages through.) 159 | for _ in 0..5 { 160 | bob.context.report(); 161 | } 162 | 163 | // If many bad words are used or reports are made, the first letter of 164 | // future bad words starts getting censored too. 165 | assert_eq!(bob.context.process(String::from("crap")), Ok(String::from("****"))); 166 | 167 | // Can manually mute. 168 | bob.context.mute_for(Duration::from_secs(2)); 169 | assert!(matches!(bob.context.process(String::from("anything")), Err(BlockReason::Muted(_)))); 170 | } 171 | ``` 172 | 173 | ## Comparison 174 | 175 | To compare filters, the first 100,000 items of [this list](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv) 176 | is used as a dataset. Positive accuracy is the percentage of profanity detected as profanity. Negative accuracy is the percentage of clean text detected as clean. 177 | 178 | | Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time | 179 | |-------|----------|-------------------|-------------------|------| 180 | | [rustrict](https://crates.io/crates/rustrict) | 80.00% | 94.01% | 76.50% | 9s | 181 | | [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s | 182 | | [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s | 183 | | [profane-rs](https://crates.io/crates/profane-rs) | 80.47% | 73.79% | 82.14% | 52s | 184 | 185 | ## Development 186 | 187 | [![Build](https://github.com/finnbear/rustrict/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/finnbear/rustrict/actions/workflows/build.yml) 188 | 189 | If you make an adjustment that would affect false positives, such as adding profanity, 190 | you will need to run `false_positive_finder`: 191 | 1. Run `make downloads` to download the required word lists and dictionaries 192 | 2. 
Run `make false_positives` to automatically find false positives 193 | 194 | If you modify `replacements_extra.csv`, run `make replacements` to rebuild `replacements.csv`. 195 | 196 | Finally, run `make test` for a full test or `make test_debug` for a fast test. 197 | 198 | ## License 199 | 200 | Licensed under either of 201 | 202 | * Apache License, Version 2.0 203 | ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 204 | * MIT license 205 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 206 | 207 | at your option. 208 | 209 | ## Contribution 210 | 211 | Unless you explicitly state otherwise, any contribution intentionally submitted 212 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 213 | dual licensed as above, without any additional terms or conditions. 214 | -------------------------------------------------------------------------------- /examples/advanced.rs: -------------------------------------------------------------------------------- 1 | use rustrict::{Censor, Type}; 2 | 3 | fn main() { 4 | let (censored, analysis) = Censor::from_str("123 Crap") 5 | .with_censor_first_character_threshold(Type::OFFENSIVE & Type::SEVERE) 6 | .with_ignore_false_positives(false) 7 | .with_censor_replacement('?') 8 | .censor_and_analyze(); 9 | 10 | assert_eq!(censored, "123 C???"); 11 | assert!(analysis.is(Type::INAPPROPRIATE)); 12 | assert!(analysis.isnt(Type::PROFANE & Type::SEVERE | Type::SEXUAL)); 13 | } 14 | -------------------------------------------------------------------------------- /examples/analyze.rs: -------------------------------------------------------------------------------- 1 | use rustrict::{CensorStr, Type}; 2 | 3 | fn main() { 4 | show_analysis("Helló world!"); 5 | show_analysis("Hello shit world ass"); 6 | show_analysis("assassin push it"); 7 | show_analysis("$#1t f-u_c_k βιτ⊂η d u m b a s s"); 8 | } 9 | 10 | fn show_analysis(text: &str) { 11 | println!("\"{}\" is mean? {}", text, text.is(Type::MEAN)); 12 | } 13 | -------------------------------------------------------------------------------- /examples/censor.rs: -------------------------------------------------------------------------------- 1 | use rustrict::CensorStr; 2 | 3 | fn main() { 4 | // Okay words are unaffected (with the exception of having their accents removed). 5 | show_censor("Helló world!"); 6 | 7 | // Bad words are censored. 8 | show_censor("Hello shit world ass"); 9 | 10 | // False positives are avoided. 11 | show_censor("assassin push it"); 12 | 13 | // Obfuscation is mostly ignored. 14 | show_censor("$#1t f-u_c_k βιτ⊂η d u m b a s s"); 15 | } 16 | 17 | fn show_censor(text: &str) { 18 | println!("{} -> {}", text, text.censor()); 19 | } 20 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rustrict-fuzz" 3 | version = "0.0.0" 4 | authors = ["Automatically generated"] 5 | publish = false 6 | edition = "2018" 7 | 8 | [package.metadata] 9 | cargo-fuzz = true 10 | 11 | [dependencies] 12 | libfuzzer-sys = "0.4" 13 | 14 | [dependencies.rustrict] 15 | path = ".." 
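# Depend on the parent crate by path so fuzzing always exercises the local sources,
# and enable the optional features that the fuzz target calls into.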
16 | features = ["pii", "width"] 17 | 18 | # Prevent this from interfering with workspaces 19 | [workspace] 20 | members = ["."] 21 | 22 | [[bin]] 23 | name = "fuzz" 24 | path = "fuzz_targets/fuzz.rs" 25 | test = false 26 | doc = false 27 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | use libfuzzer_sys::fuzz_target; 3 | use rustrict::{Censor, Context, Type}; 4 | 5 | fuzz_target!(|data: &[u8]| { 6 | if !data.is_empty() { 7 | let flags = data[0]; 8 | let input = &data[1..]; 9 | 10 | if let Ok(text) = std::str::from_utf8(input) { 11 | let _ = rustrict::width_str(text); 12 | let _ = rustrict::width_str_max_unbroken(text, rustrict::WordBreak::BreakAll); 13 | let _ = rustrict::trim_to_width(text, 10); 14 | let _ = rustrict::censor_and_analyze_pii(text); 15 | 16 | let (_censored, _analysis) = Censor::from_str(text) 17 | .with_ignore_self_censoring(flag(flags, 0)) 18 | .with_ignore_false_positives(flag(flags, 1)) 19 | .with_censor_threshold(if flag(flags, 2) { 20 | Type::INAPPROPRIATE 21 | } else { 22 | Type::SPAM 23 | }) 24 | .with_censor_first_character_threshold(if flag(flags, 3) { 25 | Type::INAPPROPRIATE 26 | } else { 27 | Type::SPAM 28 | }) 29 | .with_censor_replacement(if flag(flags, 4) { '#' } else { '*' }) 30 | .censor_and_analyze(); 31 | 32 | let mut ctx = Context::new(); 33 | 34 | for _ in 0..3 { 35 | let _ = ctx.process(String::from(text)); 36 | let _ = ctx.process(String::from("hi")); 37 | let _ = ctx.process(String::from(text)); 38 | } 39 | } 40 | } 41 | }); 42 | 43 | fn flag(flags: u8, index: u8) -> bool { 44 | ((flags >> index) & 1) == 1 45 | } 46 | -------------------------------------------------------------------------------- /pages/.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | target/ -------------------------------------------------------------------------------- /pages/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pages" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | rustrict = { path = "..", features = ["trace_full", "width", "pii"] } 8 | yew = { version = "0.21", features = ["csr"] } 9 | 10 | [dependencies.web-sys] 11 | version = "0.3" 12 | features = [ 13 | 'HtmlInputElement', 14 | 'HtmlTextAreaElement', 15 | ] 16 | 17 | [profile.release] 18 | codegen-units = 1 19 | lto = true 20 | opt-level = "z" 21 | panic = "abort" 22 | strip = "debuginfo" -------------------------------------------------------------------------------- /pages/Trunk.prod.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "index.html" 3 | release = true 4 | public_url = "/rustrict/" -------------------------------------------------------------------------------- /pages/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Rustrict 6 | 7 | 8 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /pages/src/main.rs: -------------------------------------------------------------------------------- 1 | use web_sys::{HtmlInputElement, window, InputEvent, HtmlTextAreaElement, wasm_bindgen::JsCast}; 2 | use yew::{html, Html, Callback, function_component, TargetCast}; 3 | use rustrict::{censor_and_analyze_pii, Censor, 
WordBreak}; 4 | 5 | #[function_component(App)] 6 | fn app() -> Html { 7 | let oninput = Callback::from(move |event: InputEvent| { 8 | if let Some(input) = event.target_dyn_into::() { 9 | let uncensored = input.value(); 10 | let (uncensored, pii) = censor_and_analyze_pii(&uncensored); 11 | let analysis_element = window().unwrap().document().unwrap().get_element_by_id("analysis").unwrap(); 12 | let censored_element = window().unwrap().document().unwrap().get_element_by_id("censored").unwrap().dyn_into::().unwrap(); 13 | if uncensored.is_empty() { 14 | analysis_element.set_inner_html("N/A"); 15 | censored_element.set_value(""); 16 | } else { 17 | let mut censor = Censor::from_str(&uncensored); 18 | let (censored, analysis) = censor.censor_and_analyze(); 19 | let count = censor.total_matches(); 20 | let detections = censor.detections(); 21 | let width = rustrict::width_str(&uncensored); 22 | let max_unbroken = rustrict::width_str_max_unbroken(&uncensored, WordBreak::BreakAll); 23 | let result = format!("{analysis:?} (width={width}, max-unbroken={max_unbroken}, count={count}, detections={detections:?}, pii={pii:?})"); 24 | analysis_element.set_inner_html(&result); 25 | censored_element.set_value(&censored); 26 | } 27 | } 28 | }); 29 | html! {<> 30 |

{"Rustrict"}

31 |

{"Input"}

32 | 38 |

{"Analysis"}

39 |

{"N/A"}

40 |

{"Output"}

41 | 49 | } 50 | } 51 | 52 | /* 53 | 75 | */ 76 | 77 | fn main() { 78 | yew::Renderer::::new().render(); 79 | } -------------------------------------------------------------------------------- /src/banned.rs: -------------------------------------------------------------------------------- 1 | use crate::feature_cell::FeatureCell; 2 | use crate::Set; 3 | use lazy_static::lazy_static; 4 | use std::ops::Deref; 5 | 6 | lazy_static! { 7 | pub(crate) static ref BANNED: FeatureCell = FeatureCell::new(Banned( 8 | include_str!("banned_chars.txt") 9 | .lines() 10 | .filter(|s| s.starts_with("U+")) 11 | .map(|s| { 12 | u32::from_str_radix(&s[2..], 16) 13 | .ok() 14 | .and_then(char::from_u32) 15 | .unwrap() 16 | }) 17 | // If you care about width, you probably also care about height. 18 | .chain(if cfg!(feature = "width") { 19 | ['\u{A9C1}', '\u{A9C2}'].as_slice().into_iter().copied() 20 | } else { 21 | [].as_slice().into_iter().copied() 22 | }) 23 | .collect() 24 | )); 25 | } 26 | 27 | /// Set of character to strip from input without replacement. 28 | #[derive(Clone, Debug)] 29 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 30 | pub struct Banned(Set); 31 | 32 | impl Default for Banned { 33 | fn default() -> Self { 34 | BANNED.deref().deref().clone() 35 | } 36 | } 37 | 38 | impl Banned { 39 | /// Empty. 40 | pub fn new() -> Self { 41 | Self(Default::default()) 42 | } 43 | 44 | /// Allows direct mutable access to the global default set of banned characters. 45 | /// 46 | /// # Safety 47 | /// 48 | /// You must manually avoid concurrent access/censoring. 49 | #[cfg(feature = "customize")] 50 | #[cfg_attr(doc, doc(cfg(feature = "customize")))] 51 | pub unsafe fn customize_default() -> &'static mut Self { 52 | BANNED.get_mut() 53 | } 54 | 55 | pub(crate) fn contains(&self, c: char) -> bool { 56 | self.0.contains(&c) 57 | } 58 | 59 | /// Adds a banned character. 60 | pub fn insert(&mut self, c: char) { 61 | self.0.insert(c); 62 | } 63 | 64 | /// Removes a banned character. 65 | pub fn remove(&mut self, c: char) { 66 | self.0.remove(&c); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/banned_chars.txt: -------------------------------------------------------------------------------- 1 | # https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html 2 | U+202A 3 | U+202B 4 | U+202C 5 | U+202D 6 | U+202E 7 | U+2066 8 | U+2067 9 | U+2068 10 | U+2069 11 | 12 | # Very small Arabic ligature 13 | U+FC60 -------------------------------------------------------------------------------- /src/buffer_proxy_iterator.rs: -------------------------------------------------------------------------------- 1 | use std::collections::VecDeque; 2 | use std::ops::RangeInclusive; 3 | 4 | /// This iterator buffers characters until they can be determined to be clean of profanity. 5 | pub(crate) struct BufferProxyIterator> { 6 | iter: I, 7 | /// The index into iter of the start of buffer. 8 | buffer_start_position: usize, 9 | /// Staging area (to possibly censor). 10 | buffer: VecDeque, 11 | } 12 | 13 | impl> BufferProxyIterator { 14 | pub fn new(iter: I) -> Self { 15 | BufferProxyIterator { 16 | iter, 17 | buffer_start_position: 0, 18 | buffer: VecDeque::new(), 19 | } 20 | } 21 | 22 | /// Returns index of the last character read, or None if nothing has been read yet. 23 | pub fn index(&self) -> Option { 24 | if self.buffer_start_position + self.buffer.len() == 0 { 25 | // Didn't read anything yet. 
26 | return None; 27 | } 28 | Some(self.buffer_start_position + self.buffer.len() - 1) 29 | } 30 | 31 | /// Returns index of the next character that can be spied, or empty if no characters can be spied. 32 | pub fn spy_next_index(&self) -> Option { 33 | if self.buffer.is_empty() { 34 | None 35 | } else { 36 | Some(self.buffer_start_position) 37 | } 38 | } 39 | 40 | /// Spies one one more character. 41 | pub fn spy_next(&mut self) -> Option { 42 | let ret = self.buffer.pop_front(); 43 | if ret.is_some() { 44 | self.buffer_start_position += 1; 45 | } 46 | ret 47 | } 48 | 49 | /// Censors a given range (must be fully resident in the buffer). 50 | pub fn censor(&mut self, range: RangeInclusive, replacement: char) { 51 | let start = self.buffer_start_position; 52 | for i in range { 53 | self.buffer[i - start] = replacement; 54 | } 55 | } 56 | } 57 | 58 | impl> Iterator for BufferProxyIterator { 59 | type Item = I::Item; 60 | 61 | fn next(&mut self) -> Option { 62 | let ret = self.iter.next(); 63 | if let Some(val) = ret.as_ref() { 64 | self.buffer.push_back(*val); 65 | } 66 | ret 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/character_analyzer.rs: -------------------------------------------------------------------------------- 1 | #![feature(binary_heap_into_iter_sorted)] 2 | 3 | use image::{GrayImage, Luma, Rgb, RgbImage}; 4 | use imageproc::drawing::draw_text_mut; 5 | use rayon::prelude::{IntoParallelIterator, ParallelIterator}; 6 | use rusttype::{Font, Point, Scale}; 7 | use std::ffi::OsStr; 8 | use std::fs::OpenOptions; 9 | use std::io::{BufWriter, Write}; 10 | use std::sync::Mutex; 11 | use walkdir::WalkDir; 12 | 13 | /// Output file has the following format: 14 | /// - One byte storing the length in 10ths of an `m` of all omitted characters. 
15 | /// - For each character (sorted by character) 16 | /// - Character in UTF-8 17 | /// - Length in 10ths of an `m` as a byte 18 | fn main() { 19 | let fonts: Vec = WalkDir::new("./src/ttf") 20 | .into_iter() 21 | .map(|r| r.unwrap()) 22 | .filter(|d| d.path().extension() == Some(OsStr::new("ttf"))) 23 | .map(|d| { 24 | let bytes = std::fs::read(d.path()).unwrap(); 25 | Font::try_from_vec(bytes).unwrap() 26 | }) 27 | .collect(); 28 | 29 | struct Output { 30 | histogram: [usize; 256], 31 | tab: Vec<(char, u8)>, 32 | } 33 | 34 | impl Output { 35 | pub fn push(&mut self, c: char, max_width: u8) { 36 | self.histogram[max_width as usize] += 1; 37 | self.tab.push((c, max_width)); 38 | } 39 | } 40 | 41 | let output = Mutex::new(Output { 42 | histogram: [0; 256], 43 | tab: Vec::new(), 44 | }); 45 | 46 | (0..=char::MAX as u32).into_par_iter().for_each(|u| { 47 | if let Some(c) = char::from_u32(u) { 48 | let max_width = match c { 49 | '🐿' => 20, 50 | '𒐫' => 80, 51 | '𒈙' => 35, 52 | '༺' | '༻' => 25, 53 | _ => { 54 | let max_width = (max_width(c, &fonts) as f32 / 100f32).round() as u16; 55 | if max_width > u8::MAX as u16 { 56 | panic!("{}", c); 57 | } 58 | max_width as u8 59 | } 60 | }; 61 | 62 | output.lock().unwrap().push(c, max_width); 63 | 64 | //println!("{} -> {}", c, max_width); 65 | } 66 | }); 67 | 68 | let mut output = output.into_inner().unwrap(); 69 | 70 | output.tab.sort_by_key(|&(c, _)| c); 71 | 72 | let mut mode = 0; 73 | let mut mode_n = 0; 74 | for (i, &n) in output.histogram.iter().enumerate() { 75 | let i = i as u8; 76 | println!("{}, {}", i, n); 77 | if n > mode_n { 78 | mode = i; 79 | mode_n = n; 80 | } 81 | } 82 | 83 | println!("Mode: {}", mode); 84 | 85 | let output_file = OpenOptions::new() 86 | .create(true) 87 | .write(true) 88 | .open("./src/character_widths.bin") 89 | .unwrap(); 90 | let mut buffered = BufWriter::new(output_file); 91 | 92 | buffered.write_all(&[mode]).unwrap(); 93 | 94 | for (c, max_width) in output.tab { 95 | if max_width == mode { 96 | continue; 97 | } 98 | let mut tmp = [0u8; 4]; 99 | let s = c.encode_utf8(&mut tmp); 100 | buffered.write_all(s.as_bytes()).unwrap(); 101 | buffered.write_all(&[max_width as u8]).unwrap(); 102 | 103 | if max_width > 60 { 104 | println!("character '{}' has width {}", c, max_width); 105 | } 106 | } 107 | 108 | buffered.flush().unwrap(); 109 | } 110 | 111 | /// Computes max width in milli-m's. 112 | fn max_width(c: char, fonts: &[Font]) -> usize { 113 | use unicode_width::UnicodeWidthChar; 114 | let mut max_width = c.width().map(|w| w * 1000).unwrap_or(0); 115 | for font in fonts { 116 | let width = width(c, font); 117 | max_width = max_width.max(width); 118 | } 119 | max_width 120 | } 121 | 122 | /// Computes with in milli-m's. 
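/// (Measured as the horizontal extent of the glyph's pixel bounding box when laid out at a fixed scale.)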
123 | fn width(c: char, font: &Font) -> usize { 124 | let mut tmp = [0u8; 4]; 125 | let s = c.encode_utf8(&mut tmp); 126 | 127 | let mut min = i32::MAX; 128 | let mut max = i32::MIN; 129 | 130 | font.layout(s, Scale::uniform(1344.0), Point::default()) 131 | .for_each(|i| { 132 | if let Some(b) = i.pixel_bounding_box() { 133 | min = min.min(b.min.x); 134 | max = max.max(b.max.x); 135 | } else if false { 136 | i.draw(|x, _y, _c| { 137 | min = min.min(x as i32); 138 | max = max.max(x as i32); 139 | }) 140 | } 141 | }); 142 | 143 | max.checked_sub(min).unwrap_or(0) as usize 144 | } 145 | 146 | fn render(c: char, font: &Font, resolution: u32) { 147 | let mut image = GrayImage::new(resolution, resolution); 148 | 149 | let height = resolution as f32; 150 | let scale = Scale { 151 | x: height, 152 | y: height, 153 | }; 154 | 155 | let mut tmp = [0u8; 4]; 156 | let text = c.encode_utf8(&mut tmp); 157 | draw_text_mut(&mut image, Luma([255u8]), 0, 0, scale, &font, text); 158 | 159 | let _ = image.save("image.png").unwrap(); 160 | } 161 | -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | use crate::{trim_whitespace, Censor, Type}; 2 | 3 | use crate::censor::should_skip_censor; 4 | use std::collections::VecDeque; 5 | use std::fmt::{self, Debug, Display, Formatter}; 6 | use std::num::{NonZeroU16, NonZeroUsize}; 7 | use std::time::{Duration, Instant}; 8 | 9 | /// Context is useful for taking moderation actions on a per-user basis i.e. each user would get 10 | /// their own Context. 11 | /// 12 | /// # Recommendation 13 | /// 14 | /// Use this as a reference implementation e.g. by copying and adapting it. 15 | #[derive(Clone)] 16 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] 17 | #[cfg_attr(doc, doc(cfg(feature = "context")))] 18 | pub struct Context { 19 | history: VecDeque<(String, Time)>, 20 | burst_used: u8, 21 | suspicion: u8, 22 | reports: u8, 23 | total: u16, 24 | total_inappropriate: u16, 25 | muted_until: Option