├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── feature-request.md
│   │   └── filtering-error.md
│   └── workflows
│       └── build.yml
├── .gitignore
├── Cargo.toml
├── LICENSE-MIT
├── Makefile
├── README.md
├── examples
│   ├── advanced.rs
│   ├── analyze.rs
│   └── censor.rs
├── fuzz
│   ├── .gitignore
│   ├── Cargo.toml
│   └── fuzz_targets
│       └── fuzz.rs
├── pages
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── Trunk.prod.toml
│   ├── index.html
│   └── src
│       └── main.rs
└── src
    ├── banned.rs
    ├── banned_chars.txt
    ├── buffer_proxy_iterator.rs
    ├── censor.rs
    ├── character_analyzer.rs
    ├── character_widths.bin
    ├── context.rs
    ├── dictionary_blacklist.txt
    ├── dictionary_common_valid_short.txt
    ├── dictionary_extra.txt
    ├── false_positive_finder.rs
    ├── false_positives.txt
    ├── feature_cell.rs
    ├── lib.rs
    ├── mtch.rs
    ├── pii.rs
    ├── profanity.csv
    ├── replacement_finder.rs
    ├── replacements.csv
    ├── replacements.rs
    ├── replacements_extra.csv
    ├── safe.txt
    ├── test_broken.txt
    ├── test_negative.txt
    ├── test_positive.txt
    ├── test_safe.txt
    ├── trace.rs
    ├── trie.rs
    ├── typ.rs
    ├── unicode_fonts.txt
    └── width.rs
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [finnbear]
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Request a new feature
4 | title: Feature request
5 | labels: feature
6 | assignees: finnbear
7 |
8 | ---
9 |
10 | ### Motivation
11 |
12 | ### Summary
13 |
14 | ### Alternatives
15 |
16 |
17 |
18 | ### Context
19 |
20 | I am using `rustrict` version `X.Y.Z` (if not latest version)
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/filtering-error.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Filtering Error
3 | about: Easily report false positive detections or false negative detections
4 | title: Filtering error (false positive and/or false negative)
5 | labels: bug
6 | assignees: finnbear
7 |
8 | ---
9 |
10 | ### False Positives
11 | The following shouldn't have been detected, but was:
12 | ```
13 |
14 | ```
15 |
16 |
17 |
18 | ### False Negatives
19 | The following should have been detected, but wasn't:
20 | ```
21 |
22 | ```
23 |
24 |
25 | ### Context
26 |
27 | I am using `rustrict` version `X.Y.Z` (if not latest version)
28 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | pull_request:
7 | branches: [ master ]
8 | workflow_dispatch:
9 |
10 | permissions:
11 | contents: read
12 | pages: write
13 | id-token: write
14 |
15 | concurrency:
16 | group: "pages"
17 | cancel-in-progress: false
18 |
19 | env:
20 | CARGO_TERM_COLOR: always
21 |
22 | jobs:
23 | build:
24 | environment:
25 | name: github-pages
26 | url: ${{ steps.deployment.outputs.page_url }}
27 | runs-on: ubuntu-latest
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v3
31 | with:
32 | persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
33 | fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
34 | - name: Install Rust
35 | uses: actions-rs/toolchain@v1
36 | with:
37 | toolchain: nightly
38 | override: true
39 | components: rustfmt, clippy
40 | - name: Download Testing Data
41 | run: curl https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv --output test.csv
42 | - name: Test (context, pii, serde)
43 | run: cargo test --release --features context,pii,serde
44 | - name: Test (context, width)
45 | run: cargo test --release --features context,width
46 | - name: Add wasm32 target
47 | run: rustup target add wasm32-unknown-unknown
48 | - name: Install Trunk
49 | uses: baptiste0928/cargo-install@v2
50 | with:
51 | crate: trunk
52 | version: 0.21.1
53 | - name: Build Pages
54 | run: cd pages && trunk --config Trunk.prod.toml build --release --filehash=false
55 | - name: Setup Pages
56 | uses: actions/configure-pages@v3
57 | - name: Upload artifact
58 | uses: actions/upload-pages-artifact@v3
59 | with:
60 | path: './pages/dist/'
61 | - name: Deploy to GitHub Pages
62 | id: deployment
63 | uses: actions/deploy-pages@v4
64 | fuzz:
65 | runs-on: ubuntu-latest
66 | steps:
67 | - uses: actions/checkout@v2
68 | - uses: actions-rs/toolchain@v1
69 | with:
70 | toolchain: nightly
71 | override: true
72 | - name: Install cargo-fuzz
73 | uses: baptiste0928/cargo-install@v3
74 | with:
75 | crate: cargo-fuzz
76 | locked: false
77 | - name: Fuzz
78 | run: RUST_BACKTRACE=1 cargo fuzz run fuzz -- -max_total_time=900
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Generated by Cargo
2 | # will have compiled files and executables
3 | /target/
4 |
5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
7 | Cargo.lock
8 |
9 | # These are backup files generated by rustfmt
10 | **/*.rs.bk
11 |
12 | # Downloads (run `make downloads` to get). Only required for testing and false positive finding.
13 | test.csv
14 | src/dictionary.txt
15 | src/dictionary_common.txt
16 | src/unicode_confusables.txt
17 |
18 | # Downloads not covered under `make downloads`
19 | ttf/
20 |
21 | .idea
22 | *.iml
23 | .vscode/
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rustrict"
3 | authors = ["Finn Bear"]
4 | version = "0.7.35"
5 | edition = "2021"
6 | license = "MIT OR Apache-2.0"
7 | repository = "https://github.com/finnbear/rustrict/"
8 | description = "rustrict is a profanity filter for Rust"
9 | exclude = ["fuzz/"]
10 |
11 | [lib]
12 | name = "rustrict"
13 | path = "src/lib.rs"
14 |
15 | [[bin]]
16 | name = "false_positive_finder"
17 | path = "src/false_positive_finder.rs"
18 | required-features = ["find_false_positives"]
19 |
20 | [[bin]]
21 | name = "replacement_finder"
22 | path = "src/replacement_finder.rs"
23 | required-features = ["find_replacements"]
24 |
25 | [[bin]]
26 | name = "character_analyzer"
27 | path = "src/character_analyzer.rs"
28 | required-features = ["imageproc", "image", "rusttype", "unicode-width", "walkdir", "rayon"]
29 |
30 | [[bin]]
31 | name = "trace"
32 | path = "src/trace.rs"
33 | required-features = ["trace"]
34 |
35 | [features]
36 | default = ["censor", "context"]
37 | censor = ["arrayvec", "bitflags", "lazy_static", "itertools", "unicode-normalization", "rustc-hash"]
38 | context = ["censor", "strsim"]
39 | customize = ["censor"]
40 | width = ["lazy_static", "itertools"]
41 | pii = ["lazy_static", "regex"]
42 | find_false_positives = ["censor", "regex", "indicatif", "rayon"]
43 | find_replacements = ["csv"]
44 | trace = ["censor"]
45 | trace_full = ["trace"]
46 | serde = ["dep:serde", "arrayvec/serde"]
47 |
48 | [package.metadata.docs.rs]
49 | features = ["censor", "context", "customize", "width"]
50 |
51 | [profile.release]
52 | panic = 'abort'
53 |
54 | [dependencies]
55 | arrayvec = {version = "0.7", optional = true}
56 | finl_unicode = "1.2"
57 | unicode-normalization = {version = "0.1.22", optional = true}
58 | unicode-width = {version = "0.1", optional = true}
59 | bitflags = {version = "1.3", optional = true}
60 | lazy_static = {version = "1.4", optional = true}
61 | itertools = {version = "0.10", optional = true}
62 | rustc-hash = {version = "1.1", optional = true}
63 | regex = {version = "1.5", optional = true}
64 | indicatif = {version = "0.17.0-beta.1", optional = true}
65 | rayon = {version = "1.5", optional = true}
66 | doc-comment = "0.3.3"
67 | strsim = {version = "0.10.0", optional = true}
68 | csv = {version="1.1", optional = true}
69 | imageproc = {version = "0.22", optional = true}
70 | rusttype = {version = "0.9", optional = true}
71 | image = {version = "0.23.14", optional = true}
72 | walkdir = {version = "2", optional = true}
73 | serde = {version = "1", features=["derive"], optional = true}
74 |
75 | [dev-dependencies]
76 | rand = "0.8"
77 | csv = "1.1"
78 | censor_crate = { package = "censor", version = "0.3.0" }
79 | rustrict_old = { package = "rustrict", version = "0.7.24" }
80 | serial_test = "0.5"
81 | stfu_crate = { package = "stfu", version = "0.1.0" }
82 | profane_rs_crate = { package = "profane-rs", version = "0.0.4" }
83 | bincode = "1.3.3"
84 | serde_json = "1"
85 |
--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Finn Bear
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: fuzz
2 |
3 | all: test
4 |
5 | downloads:
6 | wget -O test.csv https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv
7 | wget -O src/dictionary.txt https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt
8 | wget -O src/dictionary_common.txt https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt
9 | wget -O src/unicode_confusables.txt https://www.unicode.org/Public/security/14.0.0/confusables.txt
10 | # TODO: ttf fonts
11 |
12 | false_positives:
13 | cargo run --bin false_positive_finder --release --features censor,regex,indicatif,rayon,find_false_positives
14 |
15 | replacements:
16 | cargo run --bin replacement_finder --features find_replacements
17 |
18 | widths:
19 | cargo run --bin character_analyzer --release --features imageproc,image,rusttype,walkdir,rayon,unicode-width
20 |
21 | test:
22 | cargo test --release --features width,pii,serde -- --nocapture
23 |
24 | compare:
25 | COMPARE=1 make test
26 |
27 | table:
28 | cargo test --release -- accuracy --nocapture
29 |
30 | # Skips accuracy analysis, so it finishes faster.
31 | test_debug:
32 | cargo test --features pii -- --nocapture
33 |
34 | fuzz:
35 | cargo fuzz run fuzz
36 |
37 | test_customize:
38 | cargo test --release --features customize --no-default-features
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # rustrict
2 |
3 | [Documentation](https://docs.rs/rustrict)
4 | [crates.io](https://crates.io/crates/rustrict)
5 | [Build](https://github.com/finnbear/rustrict/actions/workflows/build.yml)
6 | [Demo](https://finnbear.github.io/rustrict/)
7 |
8 |
9 | `rustrict` is a profanity filter for Rust.
10 |
11 | Disclaimer: Multiple source files (`.txt`, `.csv`, `.rs` test cases) contain profanity. Viewer discretion is advised.
12 |
13 | ## Features
14 |
15 | - Multiple types (profane, offensive, sexual, mean, spam)
16 | - Multiple levels (mild, moderate, severe)
17 | - Resistant to evasion
18 | - Alternative spellings (like "fck")
19 | - Repeated characters (like "craaaap")
20 | - Confusable characters (like 'ᑭ', '𝕡', and '🅿')
21 | - Spacing (like "c r_a-p")
22 | - Accents (like "pÓöp")
23 | - Bidirectional Unicode ([related reading](https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html))
24 | - Self-censoring (like "f*ck")
25 | - Safe phrase list for known bad actors
26 | - Censors invalid Unicode characters
27 | - Battle-tested in [Mk48.io](https://mk48.io)
28 | - Resistant to false positives
29 | - One word (like "**ass**assin")
30 | - Two words (like "pu**sh it**")
31 | - Flexible
32 | - Censor and/or analyze
33 | - Input `&str` or `Iterator<Item = char>`
34 | - Can track per-user state with `context` feature
35 | - Can add words with the `customize` feature
36 | - Accurately reports the width of Unicode via the `width` feature
37 | - Plenty of options
38 | - Performant
39 | - O(n) analysis and censoring
40 | - No `regex` (uses custom trie)
41 | - 3 MB/s in `release` mode
42 | - 100 KB/s in `debug` mode
43 |
44 | ## Limitations
45 |
46 | - Mostly English/emoji
47 | - Censoring removes most diacritics (accents)
48 | - Does not detect right-to-left profanity while analyzing, so...
49 | - Censoring forces Unicode to be left-to-right
50 | - Doesn't understand context
51 | - Not resistant to false positives affecting profanities added at runtime
52 |
53 | ## Usage
54 |
55 | ### Strings (`&str`)
56 | ```rust
57 | use rustrict::CensorStr;
58 |
59 | let censored: String = "hello crap".censor();
60 | let inappropriate: bool = "f u c k".is_inappropriate();
61 |
62 | assert_eq!(censored, "hello c***");
63 | assert!(inappropriate);
64 | ```
65 |
66 | ### Iterators (`Iterator`)
67 |
68 | ```rust
69 | use rustrict::CensorIter;
70 |
71 | let censored: String = "hello crap".chars().censor().collect();
72 |
73 | assert_eq!(censored, "hello c***");
74 | ```
75 |
76 | ### Advanced
77 |
78 | By constructing a `Censor`, one can avoid scanning text multiple times to get a censored `String` and/or
79 | answer multiple `is` queries. This also opens up more customization options (defaults are below).
80 |
81 | ```rust
82 | use rustrict::{Censor, Type};
83 |
84 | let (censored, analysis) = Censor::from_str("123 Crap")
85 | .with_censor_threshold(Type::INAPPROPRIATE)
86 | .with_censor_first_character_threshold(Type::OFFENSIVE & Type::SEVERE)
87 | .with_ignore_false_positives(false)
88 | .with_ignore_self_censoring(false)
89 | .with_censor_replacement('*')
90 | .censor_and_analyze();
91 |
92 | assert_eq!(censored, "123 C***");
93 | assert!(analysis.is(Type::INAPPROPRIATE));
94 | assert!(analysis.isnt(Type::PROFANE & Type::SEVERE | Type::SEXUAL));
95 | ```
96 |
97 | If you cannot afford to let anything slip through, or have reason to believe a particular user
98 | is trying to evade the filter, you can check if their input matches a [short list of safe strings](src/safe.txt):
99 |
100 | ```rust
101 | use rustrict::{CensorStr, Type};
102 |
103 | // Figure out if a user is trying to evade the filter.
104 | assert!("pron".is(Type::EVASIVE));
105 | assert!("porn".isnt(Type::EVASIVE));
106 |
107 | // Only let safe messages through.
108 | assert!("Hello there!".is(Type::SAFE));
109 | assert!("nice work.".is(Type::SAFE));
110 | assert!("yes".is(Type::SAFE));
111 | assert!("NVM".is(Type::SAFE));
112 | assert!("gtg".is(Type::SAFE));
113 | assert!("not a common phrase".isnt(Type::SAFE));
114 | ```
115 |
116 | If you want to add custom profanities or safe words, enable the `customize` feature.
117 |
118 | ```rust
119 | #[cfg(feature = "customize")]
120 | {
121 | use rustrict::{add_word, CensorStr, Type};
122 |
123 | // You must take care not to call these when the crate is being
124 | // used in any other way (to avoid concurrent mutation).
125 | unsafe {
126 | add_word("reallyreallybadword", (Type::PROFANE & Type::SEVERE) | Type::MEAN);
127 | add_word("mybrandname", Type::SAFE);
128 | }
129 |
130 | assert!("Reallllllyreallllllybaaaadword".is(Type::PROFANE));
131 | assert!("MyBrandName".is(Type::SAFE));
132 | }
133 | ```
134 |
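
The `customize` feature also exposes the set of characters that are silently stripped from input (see [src/banned.rs](src/banned.rs)). The following is a hedged sketch, assuming `Banned` is re-exported at the crate root; the character inserted is purely illustrative:

```rust
#[cfg(feature = "customize")]
{
    use rustrict::Banned;

    // Same caveat as `add_word`: avoid mutating the default set while the
    // filter is in use elsewhere (to avoid concurrent mutation).
    unsafe {
        // Strip an additional character (here, a zero-width space) from all input.
        Banned::customize_default().insert('\u{200B}');
    }
}
```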
135 | If your use-case is chat moderation, and you store data on a per-user basis, you can use `rustrict::Context` as a reference implementation:
136 |
137 | ```rust
138 | #[cfg(feature = "context")]
139 | {
140 | use rustrict::{BlockReason, Context};
141 | use std::time::Duration;
142 |
143 | pub struct User {
144 | context: Context,
145 | }
146 |
147 | let mut bob = User {
148 | context: Context::default()
149 | };
150 |
151 | // Ok messages go right through.
152 | assert_eq!(bob.context.process(String::from("hello")), Ok(String::from("hello")));
153 |
154 | // Bad words are censored.
155 | assert_eq!(bob.context.process(String::from("crap")), Ok(String::from("c***")));
156 |
157 | // Can take user reports. (After many reports or inappropriate messages,
158 | // only known safe messages will be let through.)
159 | for _ in 0..5 {
160 | bob.context.report();
161 | }
162 |
163 | // If many bad words are used or reports are made, the first letter of
164 | // future bad words starts getting censored too.
165 | assert_eq!(bob.context.process(String::from("crap")), Ok(String::from("****")));
166 |
167 | // Can manually mute.
168 | bob.context.mute_for(Duration::from_secs(2));
169 | assert!(matches!(bob.context.process(String::from("anything")), Err(BlockReason::Muted(_))));
170 | }
171 | ```
172 |
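The `width` and `pii` features add a few standalone helpers (the same ones exercised by the fuzz target and the demo page). Below is a minimal sketch, assuming both features are enabled; return types are deliberately left inferred here:

```rust
#[cfg(all(feature = "width", feature = "pii"))]
{
    use rustrict::WordBreak;

    // Approximate the rendered width of a string, and the width of its
    // longest unbroken run (useful against layout-based abuse).
    let _width = rustrict::width_str("hello world");
    let _longest = rustrict::width_str_max_unbroken("hello world", WordBreak::BreakAll);

    // Trim a string so that it fits within a given width.
    let _trimmed = rustrict::trim_to_width("hello world", 10);

    // Censor profanity and detect personally identifiable information in one pass.
    let (_censored, _pii) = rustrict::censor_and_analyze_pii("contact me at someone@example.com");
}
```
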
173 | ## Comparison
174 |
175 | To compare filters, the first 100,000 items of [this list](https://raw.githubusercontent.com/vzhou842/profanity-check/master/profanity_check/data/clean_data.csv)
176 | are used as a dataset. Positive accuracy is the percentage of profanity detected as profanity. Negative accuracy is the percentage of clean text detected as clean.
177 |
178 | | Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
179 | |-------|----------|-------------------|-------------------|------|
180 | | [rustrict](https://crates.io/crates/rustrict) | 80.00% | 94.01% | 76.50% | 9s |
181 | | [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |
182 | | [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s |
183 | | [profane-rs](https://crates.io/crates/profane-rs) | 80.47% | 73.79% | 82.14% | 52s |
184 |
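For reference, the gist of such a comparison can be sketched with the `csv` dev-dependency. This is not the crate's actual test harness, and the column layout of `test.csv` is an assumption here:

```rust,no_run
use rustrict::CensorStr;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `test.csv` is downloaded by `make downloads`.
    let mut reader = csv::Reader::from_path("test.csv")?;
    let (mut tp, mut tn, mut fp, mut fal_n) = (0u32, 0u32, 0u32, 0u32);
    for record in reader.records().take(100_000) {
        let record = record?;
        let text = &record[0]; // assumed: first column is the text
        let labeled_profane = &record[1] == "1"; // assumed: second column is the label
        match (text.is_inappropriate(), labeled_profane) {
            (true, true) => tp += 1,
            (false, false) => tn += 1,
            (true, false) => fp += 1,
            (false, true) => fal_n += 1,
        }
    }
    println!(
        "positive accuracy: {:.2}%, negative accuracy: {:.2}%",
        100.0 * tp as f64 / (tp + fal_n) as f64,
        100.0 * tn as f64 / (tn + fp) as f64
    );
    Ok(())
}
```
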
185 | ## Development
186 |
187 | [Build status](https://github.com/finnbear/rustrict/actions/workflows/build.yml)
188 |
189 | If you make an adjustment that would affect false positives, such as adding profanity,
190 | you will need to run `false_positive_finder`:
191 | 1. Run `make downloads` to download the required word lists and dictionaries
192 | 2. Run `make false_positives` to automatically find false positives
193 |
194 | If you modify `replacements_extra.csv`, run `make replacements` to rebuild `replacements.csv`.
195 |
196 | Finally, run `make test` for a full test or `make test_debug` for a fast test.
197 |
198 | ## License
199 |
200 | Licensed under either of
201 |
202 | * Apache License, Version 2.0
203 | ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
204 | * MIT license
205 | ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
206 |
207 | at your option.
208 |
209 | ## Contribution
210 |
211 | Unless you explicitly state otherwise, any contribution intentionally submitted
212 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be
213 | dual licensed as above, without any additional terms or conditions.
214 |
--------------------------------------------------------------------------------
/examples/advanced.rs:
--------------------------------------------------------------------------------
1 | use rustrict::{Censor, Type};
2 |
3 | fn main() {
4 | let (censored, analysis) = Censor::from_str("123 Crap")
5 | .with_censor_first_character_threshold(Type::OFFENSIVE & Type::SEVERE)
6 | .with_ignore_false_positives(false)
7 | .with_censor_replacement('?')
8 | .censor_and_analyze();
9 |
10 | assert_eq!(censored, "123 C???");
11 | assert!(analysis.is(Type::INAPPROPRIATE));
12 | assert!(analysis.isnt(Type::PROFANE & Type::SEVERE | Type::SEXUAL));
13 | }
14 |
--------------------------------------------------------------------------------
/examples/analyze.rs:
--------------------------------------------------------------------------------
1 | use rustrict::{CensorStr, Type};
2 |
3 | fn main() {
4 | show_analysis("Helló world!");
5 | show_analysis("Hello shit world ass");
6 | show_analysis("assassin push it");
7 | show_analysis("$#1t f-u_c_k βιτ⊂η d u m b a s s");
8 | }
9 |
10 | fn show_analysis(text: &str) {
11 | println!("\"{}\" is mean? {}", text, text.is(Type::MEAN));
12 | }
13 |
--------------------------------------------------------------------------------
/examples/censor.rs:
--------------------------------------------------------------------------------
1 | use rustrict::CensorStr;
2 |
3 | fn main() {
4 | // Okay words are unaffected (with the exception of having their accents removed).
5 | show_censor("Helló world!");
6 |
7 | // Bad words are censored.
8 | show_censor("Hello shit world ass");
9 |
10 | // False positives are avoided.
11 | show_censor("assassin push it");
12 |
13 | // Obfuscation is mostly ignored.
14 | show_censor("$#1t f-u_c_k βιτ⊂η d u m b a s s");
15 | }
16 |
17 | fn show_censor(text: &str) {
18 | println!("{} -> {}", text, text.censor());
19 | }
20 |
--------------------------------------------------------------------------------
/fuzz/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | corpus
3 | artifacts
4 |
--------------------------------------------------------------------------------
/fuzz/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "rustrict-fuzz"
3 | version = "0.0.0"
4 | authors = ["Automatically generated"]
5 | publish = false
6 | edition = "2018"
7 |
8 | [package.metadata]
9 | cargo-fuzz = true
10 |
11 | [dependencies]
12 | libfuzzer-sys = "0.4"
13 |
14 | [dependencies.rustrict]
15 | path = ".."
16 | features = ["pii", "width"]
17 |
18 | # Prevent this from interfering with workspaces
19 | [workspace]
20 | members = ["."]
21 |
22 | [[bin]]
23 | name = "fuzz"
24 | path = "fuzz_targets/fuzz.rs"
25 | test = false
26 | doc = false
27 |
--------------------------------------------------------------------------------
/fuzz/fuzz_targets/fuzz.rs:
--------------------------------------------------------------------------------
1 | #![no_main]
2 | use libfuzzer_sys::fuzz_target;
3 | use rustrict::{Censor, Context, Type};
4 |
5 | fuzz_target!(|data: &[u8]| {
6 | if !data.is_empty() {
7 | let flags = data[0];
8 | let input = &data[1..];
9 |
10 | if let Ok(text) = std::str::from_utf8(input) {
11 | let _ = rustrict::width_str(text);
12 | let _ = rustrict::width_str_max_unbroken(text, rustrict::WordBreak::BreakAll);
13 | let _ = rustrict::trim_to_width(text, 10);
14 | let _ = rustrict::censor_and_analyze_pii(text);
15 |
16 | let (_censored, _analysis) = Censor::from_str(text)
17 | .with_ignore_self_censoring(flag(flags, 0))
18 | .with_ignore_false_positives(flag(flags, 1))
19 | .with_censor_threshold(if flag(flags, 2) {
20 | Type::INAPPROPRIATE
21 | } else {
22 | Type::SPAM
23 | })
24 | .with_censor_first_character_threshold(if flag(flags, 3) {
25 | Type::INAPPROPRIATE
26 | } else {
27 | Type::SPAM
28 | })
29 | .with_censor_replacement(if flag(flags, 4) { '#' } else { '*' })
30 | .censor_and_analyze();
31 |
32 | let mut ctx = Context::new();
33 |
34 | for _ in 0..3 {
35 | let _ = ctx.process(String::from(text));
36 | let _ = ctx.process(String::from("hi"));
37 | let _ = ctx.process(String::from(text));
38 | }
39 | }
40 | }
41 | });
42 |
43 | fn flag(flags: u8, index: u8) -> bool {
44 | ((flags >> index) & 1) == 1
45 | }
46 |
--------------------------------------------------------------------------------
/pages/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | target/
--------------------------------------------------------------------------------
/pages/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pages"
3 | version = "0.1.0"
4 | edition = "2021"
5 |
6 | [dependencies]
7 | rustrict = { path = "..", features = ["trace_full", "width", "pii"] }
8 | yew = { version = "0.21", features = ["csr"] }
9 |
10 | [dependencies.web-sys]
11 | version = "0.3"
12 | features = [
13 | 'HtmlInputElement',
14 | 'HtmlTextAreaElement',
15 | ]
16 |
17 | [profile.release]
18 | codegen-units = 1
19 | lto = true
20 | opt-level = "z"
21 | panic = "abort"
22 | strip = "debuginfo"
--------------------------------------------------------------------------------
/pages/Trunk.prod.toml:
--------------------------------------------------------------------------------
1 | [build]
2 | target = "index.html"
3 | release = true
4 | public_url = "/rustrict/"
--------------------------------------------------------------------------------
/pages/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Rustrict
6 |
7 |
8 |
14 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/pages/src/main.rs:
--------------------------------------------------------------------------------
1 | use web_sys::{HtmlInputElement, window, InputEvent, HtmlTextAreaElement, wasm_bindgen::JsCast};
2 | use yew::{html, Html, Callback, function_component, TargetCast};
3 | use rustrict::{censor_and_analyze_pii, Censor, WordBreak};
4 |
5 | #[function_component(App)]
6 | fn app() -> Html {
7 | let oninput = Callback::from(move |event: InputEvent| {
8 | if let Some(input) = event.target_dyn_into::() {
9 | let uncensored = input.value();
10 | let (uncensored, pii) = censor_and_analyze_pii(&uncensored);
11 | let analysis_element = window().unwrap().document().unwrap().get_element_by_id("analysis").unwrap();
12 | let censored_element = window().unwrap().document().unwrap().get_element_by_id("censored").unwrap().dyn_into::().unwrap();
13 | if uncensored.is_empty() {
14 | analysis_element.set_inner_html("N/A");
15 | censored_element.set_value("");
16 | } else {
17 | let mut censor = Censor::from_str(&uncensored);
18 | let (censored, analysis) = censor.censor_and_analyze();
19 | let count = censor.total_matches();
20 | let detections = censor.detections();
21 | let width = rustrict::width_str(&uncensored);
22 | let max_unbroken = rustrict::width_str_max_unbroken(&uncensored, WordBreak::BreakAll);
23 | let result = format!("{analysis:?} (width={width}, max-unbroken={max_unbroken}, count={count}, detections={detections:?}, pii={pii:?})");
24 | analysis_element.set_inner_html(&result);
25 | censored_element.set_value(&censored);
26 | }
27 | }
28 | });
29 | html! {<>
30 | {"Rustrict"}
31 | {"Input"}
32 |
38 | {"Analysis"}
39 | {"N/A"}
40 | {"Output"}
41 |
49 | >}
50 | }
51 |
52 | /*
53 |
75 | */
76 |
77 | fn main() {
78 | yew::Renderer::::new().render();
79 | }
--------------------------------------------------------------------------------
/src/banned.rs:
--------------------------------------------------------------------------------
1 | use crate::feature_cell::FeatureCell;
2 | use crate::Set;
3 | use lazy_static::lazy_static;
4 | use std::ops::Deref;
5 |
6 | lazy_static! {
7 | pub(crate) static ref BANNED: FeatureCell<Banned> = FeatureCell::new(Banned(
8 | include_str!("banned_chars.txt")
9 | .lines()
10 | .filter(|s| s.starts_with("U+"))
11 | .map(|s| {
12 | u32::from_str_radix(&s[2..], 16)
13 | .ok()
14 | .and_then(char::from_u32)
15 | .unwrap()
16 | })
17 | // If you care about width, you probably also care about height.
18 | .chain(if cfg!(feature = "width") {
19 | ['\u{A9C1}', '\u{A9C2}'].as_slice().into_iter().copied()
20 | } else {
21 | [].as_slice().into_iter().copied()
22 | })
23 | .collect()
24 | ));
25 | }
26 |
27 | /// Set of characters to strip from input without replacement.
28 | #[derive(Clone, Debug)]
29 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
30 | pub struct Banned(Set);
31 |
32 | impl Default for Banned {
33 | fn default() -> Self {
34 | BANNED.deref().deref().clone()
35 | }
36 | }
37 |
38 | impl Banned {
39 | /// Empty.
40 | pub fn new() -> Self {
41 | Self(Default::default())
42 | }
43 |
44 | /// Allows direct mutable access to the global default set of banned characters.
45 | ///
46 | /// # Safety
47 | ///
48 | /// You must manually avoid concurrent access/censoring.
49 | #[cfg(feature = "customize")]
50 | #[cfg_attr(doc, doc(cfg(feature = "customize")))]
51 | pub unsafe fn customize_default() -> &'static mut Self {
52 | BANNED.get_mut()
53 | }
54 |
55 | pub(crate) fn contains(&self, c: char) -> bool {
56 | self.0.contains(&c)
57 | }
58 |
59 | /// Adds a banned character.
60 | pub fn insert(&mut self, c: char) {
61 | self.0.insert(c);
62 | }
63 |
64 | /// Removes a banned character.
65 | pub fn remove(&mut self, c: char) {
66 | self.0.remove(&c);
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/banned_chars.txt:
--------------------------------------------------------------------------------
1 | # https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html
2 | U+202A
3 | U+202B
4 | U+202C
5 | U+202D
6 | U+202E
7 | U+2066
8 | U+2067
9 | U+2068
10 | U+2069
11 |
12 | # Very small Arabic ligature
13 | U+FC60
--------------------------------------------------------------------------------
/src/buffer_proxy_iterator.rs:
--------------------------------------------------------------------------------
1 | use std::collections::VecDeque;
2 | use std::ops::RangeInclusive;
3 |
4 | /// This iterator buffers characters until they can be determined to be clean of profanity.
5 | pub(crate) struct BufferProxyIterator<I: Iterator<Item = char>> {
6 | iter: I,
7 | /// The index into iter of the start of buffer.
8 | buffer_start_position: usize,
9 | /// Staging area (to possibly censor).
10 | buffer: VecDeque<char>,
11 | }
12 |
13 | impl<I: Iterator<Item = char>> BufferProxyIterator<I> {
14 | pub fn new(iter: I) -> Self {
15 | BufferProxyIterator {
16 | iter,
17 | buffer_start_position: 0,
18 | buffer: VecDeque::new(),
19 | }
20 | }
21 |
22 | /// Returns index of the last character read, or None if nothing has been read yet.
23 | pub fn index(&self) -> Option<usize> {
24 | if self.buffer_start_position + self.buffer.len() == 0 {
25 | // Didn't read anything yet.
26 | return None;
27 | }
28 | Some(self.buffer_start_position + self.buffer.len() - 1)
29 | }
30 |
31 | /// Returns index of the next character that can be spied, or empty if no characters can be spied.
32 | pub fn spy_next_index(&self) -> Option<usize> {
33 | if self.buffer.is_empty() {
34 | None
35 | } else {
36 | Some(self.buffer_start_position)
37 | }
38 | }
39 |
40 | /// Spies one more character.
41 | pub fn spy_next(&mut self) -> Option<char> {
42 | let ret = self.buffer.pop_front();
43 | if ret.is_some() {
44 | self.buffer_start_position += 1;
45 | }
46 | ret
47 | }
48 |
49 | /// Censors a given range (must be fully resident in the buffer).
50 | pub fn censor(&mut self, range: RangeInclusive<usize>, replacement: char) {
51 | let start = self.buffer_start_position;
52 | for i in range {
53 | self.buffer[i - start] = replacement;
54 | }
55 | }
56 | }
57 |
58 | impl<I: Iterator<Item = char>> Iterator for BufferProxyIterator<I> {
59 | type Item = I::Item;
60 |
61 | fn next(&mut self) -> Option<Self::Item> {
62 | let ret = self.iter.next();
63 | if let Some(val) = ret.as_ref() {
64 | self.buffer.push_back(*val);
65 | }
66 | ret
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/character_analyzer.rs:
--------------------------------------------------------------------------------
1 | #![feature(binary_heap_into_iter_sorted)]
2 |
3 | use image::{GrayImage, Luma, Rgb, RgbImage};
4 | use imageproc::drawing::draw_text_mut;
5 | use rayon::prelude::{IntoParallelIterator, ParallelIterator};
6 | use rusttype::{Font, Point, Scale};
7 | use std::ffi::OsStr;
8 | use std::fs::OpenOptions;
9 | use std::io::{BufWriter, Write};
10 | use std::sync::Mutex;
11 | use walkdir::WalkDir;
12 |
13 | /// Output file has the following format:
14 | /// - One byte storing the width, in 10ths of an `m`, of all omitted characters.
15 | /// - For each character (sorted by character):
16 | ///   - The character, encoded in UTF-8
17 | ///   - Its width, in 10ths of an `m`, as a byte
18 | fn main() {
19 | let fonts: Vec = WalkDir::new("./src/ttf")
20 | .into_iter()
21 | .map(|r| r.unwrap())
22 | .filter(|d| d.path().extension() == Some(OsStr::new("ttf")))
23 | .map(|d| {
24 | let bytes = std::fs::read(d.path()).unwrap();
25 | Font::try_from_vec(bytes).unwrap()
26 | })
27 | .collect();
28 |
29 | struct Output {
30 | histogram: [usize; 256],
31 | tab: Vec<(char, u8)>,
32 | }
33 |
34 | impl Output {
35 | pub fn push(&mut self, c: char, max_width: u8) {
36 | self.histogram[max_width as usize] += 1;
37 | self.tab.push((c, max_width));
38 | }
39 | }
40 |
41 | let output = Mutex::new(Output {
42 | histogram: [0; 256],
43 | tab: Vec::new(),
44 | });
45 |
46 | (0..=char::MAX as u32).into_par_iter().for_each(|u| {
47 | if let Some(c) = char::from_u32(u) {
48 | let max_width = match c {
49 | '🐿' => 20,
50 | '𒐫' => 80,
51 | '𒈙' => 35,
52 | '༺' | '༻' => 25,
53 | _ => {
54 | let max_width = (max_width(c, &fonts) as f32 / 100f32).round() as u16;
55 | if max_width > u8::MAX as u16 {
56 | panic!("{}", c);
57 | }
58 | max_width as u8
59 | }
60 | };
61 |
62 | output.lock().unwrap().push(c, max_width);
63 |
64 | //println!("{} -> {}", c, max_width);
65 | }
66 | });
67 |
68 | let mut output = output.into_inner().unwrap();
69 |
70 | output.tab.sort_by_key(|&(c, _)| c);
71 |
72 | let mut mode = 0;
73 | let mut mode_n = 0;
74 | for (i, &n) in output.histogram.iter().enumerate() {
75 | let i = i as u8;
76 | println!("{}, {}", i, n);
77 | if n > mode_n {
78 | mode = i;
79 | mode_n = n;
80 | }
81 | }
82 |
83 | println!("Mode: {}", mode);
84 |
85 | let output_file = OpenOptions::new()
86 | .create(true)
87 | .write(true)
88 | .open("./src/character_widths.bin")
89 | .unwrap();
90 | let mut buffered = BufWriter::new(output_file);
91 |
92 | buffered.write_all(&[mode]).unwrap();
93 |
94 | for (c, max_width) in output.tab {
95 | if max_width == mode {
96 | continue;
97 | }
98 | let mut tmp = [0u8; 4];
99 | let s = c.encode_utf8(&mut tmp);
100 | buffered.write_all(s.as_bytes()).unwrap();
101 | buffered.write_all(&[max_width as u8]).unwrap();
102 |
103 | if max_width > 60 {
104 | println!("character '{}' has width {}", c, max_width);
105 | }
106 | }
107 |
108 | buffered.flush().unwrap();
109 | }
110 |
111 | /// Computes max width in milli-m's.
112 | fn max_width(c: char, fonts: &[Font]) -> usize {
113 | use unicode_width::UnicodeWidthChar;
114 | let mut max_width = c.width().map(|w| w * 1000).unwrap_or(0);
115 | for font in fonts {
116 | let width = width(c, font);
117 | max_width = max_width.max(width);
118 | }
119 | max_width
120 | }
121 |
122 | /// Computes width in milli-m's.
123 | fn width(c: char, font: &Font) -> usize {
124 | let mut tmp = [0u8; 4];
125 | let s = c.encode_utf8(&mut tmp);
126 |
127 | let mut min = i32::MAX;
128 | let mut max = i32::MIN;
129 |
130 | font.layout(s, Scale::uniform(1344.0), Point::default())
131 | .for_each(|i| {
132 | if let Some(b) = i.pixel_bounding_box() {
133 | min = min.min(b.min.x);
134 | max = max.max(b.max.x);
135 | } else if false {
136 | i.draw(|x, _y, _c| {
137 | min = min.min(x as i32);
138 | max = max.max(x as i32);
139 | })
140 | }
141 | });
142 |
143 | max.checked_sub(min).unwrap_or(0) as usize
144 | }
145 |
146 | fn render(c: char, font: &Font, resolution: u32) {
147 | let mut image = GrayImage::new(resolution, resolution);
148 |
149 | let height = resolution as f32;
150 | let scale = Scale {
151 | x: height,
152 | y: height,
153 | };
154 |
155 | let mut tmp = [0u8; 4];
156 | let text = c.encode_utf8(&mut tmp);
157 | draw_text_mut(&mut image, Luma([255u8]), 0, 0, scale, &font, text);
158 |
159 | let _ = image.save("image.png").unwrap();
160 | }
161 |
--------------------------------------------------------------------------------
/src/context.rs:
--------------------------------------------------------------------------------
1 | use crate::{trim_whitespace, Censor, Type};
2 |
3 | use crate::censor::should_skip_censor;
4 | use std::collections::VecDeque;
5 | use std::fmt::{self, Debug, Display, Formatter};
6 | use std::num::{NonZeroU16, NonZeroUsize};
7 | use std::time::{Duration, Instant};
8 |
9 | /// Context is useful for taking moderation actions on a per-user basis i.e. each user would get
10 | /// their own Context.
11 | ///
12 | /// # Recommendation
13 | ///
14 | /// Use this as a reference implementation e.g. by copying and adapting it.
15 | #[derive(Clone)]
16 | #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
17 | #[cfg_attr(doc, doc(cfg(feature = "context")))]
18 | pub struct Context {
19 | history: VecDeque<(String, Time)>,
20 | burst_used: u8,
21 | suspicion: u8,
22 | reports: u8,
23 | total: u16,
24 | total_inappropriate: u16,
25 | muted_until: Option