├── .commitlintrc.json
├── .eslintignore
├── .eslintrc.yml
├── .gitattributes
├── .github
├── CODE_OF_CONDUCT.md
├── ISSUE_TEMPLATE
│ ├── bug_report.yml
│ └── feature_request.yml
├── PULL_REQUEST_TEMPLATE.md
├── problem-matchers
│ └── tsc.json
└── workflows
│ ├── codeql-analysis.yml
│ └── continuous-integration.yml
├── .gitignore
├── .prettierignore
├── .prettierrc.json
├── .vscode
└── settings.json
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.md
├── README.md
├── docs
├── README.md
├── guide
│ ├── README.md
│ ├── censoring-text.md
│ ├── datasets.md
│ ├── matchers.md
│ ├── patterns.md
│ └── transformers.md
└── reference
│ ├── .nojekyll
│ ├── README.md
│ ├── classes
│ ├── DataSet.md
│ ├── ParserError.md
│ ├── PhraseBuilder.md
│ ├── RegExpMatcher.md
│ └── TextCensor.md
│ ├── enums
│ └── SyntaxKind.md
│ └── interfaces
│ ├── BlacklistedTerm.md
│ ├── BoundaryAssertionNode.md
│ ├── CollapseDuplicatesTransformerOptions.md
│ ├── LiteralNode.md
│ ├── MatchPayload.md
│ ├── Matcher.md
│ ├── OptionalNode.md
│ ├── ParsedPattern.md
│ ├── PhraseContainer.md
│ ├── ProcessedCollapseDuplicatesTransformerOptions.md
│ ├── RegExpMatcherOptions.md
│ └── WildcardNode.md
├── examples
├── extending-datasets.js
└── repl.js
├── jest.config.ts
├── package.json
├── pnpm-lock.yaml
├── renovate.json
├── scripts
├── english-words.txt
└── search-words.js
├── src
├── censor
│ ├── BuiltinStrategies.ts
│ └── TextCensor.ts
├── dataset
│ └── DataSet.ts
├── index.ts
├── matcher
│ ├── BlacklistedTerm.ts
│ ├── IntervalCollection.ts
│ ├── MatchPayload.ts
│ ├── Matcher.ts
│ └── regexp
│ │ └── RegExpMatcher.ts
├── pattern
│ ├── Nodes.ts
│ ├── Parser.ts
│ ├── ParserError.ts
│ ├── Pattern.ts
│ └── Util.ts
├── preset
│ └── english.ts
├── transformer
│ ├── TransformerSet.ts
│ ├── Transformers.ts
│ ├── collapse-duplicates
│ │ ├── index.ts
│ │ └── transformer.ts
│ ├── remap-characters
│ │ └── index.ts
│ ├── resolve-confusables
│ │ ├── confusables.ts
│ │ └── index.ts
│ ├── resolve-leetspeak
│ │ ├── dictionary.ts
│ │ └── index.ts
│ ├── skip-non-alphabetic
│ │ └── index.ts
│ └── to-ascii-lowercase
│ │ └── index.ts
├── tsconfig.json
└── util
│ ├── Char.ts
│ ├── CharacterIterator.ts
│ └── Interval.ts
├── test
├── censor
│ ├── BuiltinStrategies.test.ts
│ └── TextCensor.test.ts
├── dataset
│ └── DataSet.test.ts
├── jest.setup.ts
├── matcher
│ ├── BlacklistedTerm.test.ts
│ ├── IntervalCollection.test.ts
│ ├── MatchPayload.test.ts
│ └── regexp
│ │ └── RegExpMatcher.test.ts
├── pattern
│ ├── Parser.test.ts
│ ├── ParserError.test.ts
│ ├── Pattern.test.ts
│ └── Util.test.ts
├── transformer
│ ├── TransformerSet.test.ts
│ ├── Transformers.test.ts
│ ├── collapse-duplicates
│ │ ├── index.test.ts
│ │ └── transformer.test.ts
│ ├── remap-characters
│ │ └── index.test.ts
│ ├── resolve-confusables
│ │ └── index.test.ts
│ ├── resolve-leetspeak
│ │ └── index.test.ts
│ ├── skip-non-alphabetic
│ │ └── index.test.ts
│ └── to-ascii-lowercase
│ │ └── index.test.ts
├── tsconfig.json
└── util
│ ├── Char.test.ts
│ ├── CharacterIterator.fuzz.test.ts
│ ├── CharacterIterator.test.ts
│ └── Interval.test.ts
├── tsconfig.base.json
├── tsconfig.eslint.json
└── typedoc.json
/.commitlintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["@commitlint/config-angular"],
3 | "rules": {
4 | "scope-case": [2, "always", "lowerCase"],
5 | "type-enum": [
6 | 2,
7 | "always",
8 | ["chore", "build", "ci", "docs", "feat", "fix", "perf", "refactor", "revert", "style", "test"]
9 | ]
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 | coverage
4 | scripts
5 | examples
6 |
--------------------------------------------------------------------------------
/.eslintrc.yml:
--------------------------------------------------------------------------------
1 | ---
2 | extends:
3 | - plugin:@typescript-eslint/recommended-type-checked
4 | - plugin:jest/recommended
5 | - plugin:jest/style
6 |
7 | plugins:
8 | - jest
9 | - prettier
10 |
11 | parserOptions:
12 | project:
13 | - tsconfig.eslint.json
14 | - src/tsconfig.json
15 | - test/tsconfig.json
16 |
17 | rules:
18 | prettier/prettier:
19 | - error
20 | - endOfLine: auto
21 | no-duplicate-imports: off
22 | curly:
23 | - error
24 | - multi-line
25 | import/extensions: off
26 | id-length: off
27 | tsdoc/syntax: off
28 | '@typescript-eslint/restrict-plus-operands': off
29 | '@typescript-eslint/no-explicit-any': off
30 | '@typescript-eslint/no-unsafe-enum-comparison': off
31 | '@typescript-eslint/consistent-type-definitions':
32 | - error
33 | - interface
34 | '@typescript-eslint/prefer-literal-enum-member':
35 | - error
36 | - allowBitwiseExpressions: true
37 | '@typescript-eslint/no-use-before-define': off
38 | # The following rule conflicts with Prettier in certain cases.
39 | # Also see https://github.com/typescript-eslint/typescript-eslint/issues/1824.
40 | '@typescript-eslint/indent': off
41 | '@typescript-eslint/no-misused-promises':
42 | - error
43 | - checksVoidReturn: false
44 | '@typescript-eslint/no-unnecessary-condition':
45 | - error
46 | - allowConstantLoopConditions: true
47 | '@typescript-eslint/no-throw-literal': off
48 | '@typescript-eslint/naming-convention':
49 | - error
50 | - selector:
51 | - enumMember
52 | - typeAlias
53 | - interface
54 | - enum
55 | - class
56 | format:
57 | - PascalCase
58 | leadingUnderscore: forbid
59 | trailingUnderscore: forbid
60 |
61 | - selector:
62 | - method
63 | - accessor
64 | - parameterProperty
65 | format:
66 | - strictCamelCase
67 |
68 | - selector:
69 | - property
70 | format:
71 | - strictCamelCase
72 | - PascalCase
73 | filter:
74 | regex: '\d+'
75 | match: false
76 | leadingUnderscore: allow
77 | trailingUnderscore: forbid
78 |
79 | - selector: typeParameter
80 | format:
81 | - PascalCase
82 |
83 | - selector: variable
84 | format:
85 | - strictCamelCase
86 | - UPPER_CASE
87 | leadingUnderscore: allow
88 | trailingUnderscore: forbid
89 |
90 | - selector: function
91 | format:
92 | - strictCamelCase
93 | leadingUnderscore: forbid
94 | trailingUnderscore: forbid
95 | '@typescript-eslint/member-ordering':
96 | - error
97 | - default:
98 | - public-static-field
99 | - protected-static-field
100 | - private-static-field
101 | - static-field
102 | - public-static-method
103 | - protected-static-method
104 | - private-static-method
105 | - static-method
106 | - signature
107 | - public-instance-field
108 | - protected-instance-field
109 | - private-instance-field
110 | - instance-field
111 | - public-constructor
112 | - protected-constructor
113 | - private-constructor
114 | - constructor
115 | - public-instance-method
116 | - protected-instance-method
117 | - private-instance-method
118 | - instance-method
119 | '@typescript-eslint/consistent-type-imports': error
120 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | - Using welcoming and inclusive language
12 | - Being respectful of differing viewpoints and experiences
13 | - Gracefully accepting constructive criticism
14 | - Focusing on what is best for the community
15 | - Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | - Trolling, insulting/derogatory comments, and personal or political attacks
21 | - Public or private harassment
22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | - Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [jo3.l.dev@outlook.com](mailto:jo3.l.dev@outlook.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://contributor-covenant.org/version/1/4][version]
44 |
45 | [homepage]: https://contributor-covenant.org
46 | [version]: https://contributor-covenant.org/version/1/4/
47 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
1 | name: Bug report
2 | description: Create an issue about a possible bug
3 | title: 'bug: '
4 | labels: [bug]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Please first [check through existing issues](https://github.com/jo3-l/obscenity/issues) in case your problem
10 | has already been raised.
11 | - type: textarea
12 | attributes:
13 | label: Expected behavior
14 | description: What were you expecting to happen?
15 | placeholder: Using foo, I expected bar to happen.
16 | validations:
17 | required: true
18 | - type: textarea
19 | attributes:
20 | label: Actual behavior
21 | description: What happened instead?
22 | placeholder: Instead, baz happened.
23 | validations:
24 | required: true
25 | - type: markdown
26 | attributes:
27 | value: |
28 | Including more detail in your bug report will expedite the review
29 | process. A minimal reproducible example is preferred.
30 | - type: textarea
31 | attributes:
32 | label: Minimal reproducible example
33 | description: A [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) that demonstrates the problem.
34 | placeholder: |
35 | import { RegExpMatcher } from 'obscenity';
36 | // ...
37 | render: typescript
38 | - type: textarea
39 | attributes:
40 | label: Steps to reproduce
41 | description: Provide steps to reproduce the problem.
42 | placeholder: |
43 | 1. Run foo
44 | 2. ...
45 | 3. See error
46 | - type: textarea
47 | attributes:
48 | label: Additional context
49 | description: |
50 | Links? References? Anything that will give us more context about the issue you are encountering?
51 |
52 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in.
53 | - type: textarea
54 | attributes:
55 | label: Node.js version
56 | description: What version of Node are you using?
57 | placeholder: v16.7.0
58 | validations:
59 | required: true
60 | - type: textarea
61 | attributes:
62 | label: Obscenity version
63 | description: |
64 | What version of the library are you using?
65 |
66 | Tip: You can get this using 'npm ls obscenity', 'yarn list obscenity' or 'pnpm ls obscenity' (depending on which package manager you're using).
67 | placeholder: v0.1.0
68 | validations:
69 | required: true
70 | - type: checkboxes
71 | attributes:
72 | label: Priority
73 | description: What should the priority of this issue be?
74 | options:
75 | - label: Low
76 | - label: Medium
77 | - label: High
78 | validations:
79 | required: true
80 | - type: checkboxes
81 | attributes:
82 | label: Terms
83 | description: 'By submitting this issue, you confirm the following:'
84 | options:
85 | - label: I agree to follow the project's Code of Conduct.
86 | required: true
87 | - label: I have searched existing issues for similar reports.
88 | required: true
89 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yml:
--------------------------------------------------------------------------------
1 | name: Feature request
2 | description: Suggest an idea for the repository
3 | title: 'request: '
4 | labels: [enhancement]
5 | body:
6 | - type: markdown
7 | attributes:
8 | value: |
9 | Please first [check through existing issues](https://github.com/jo3-l/obscenity/issues) in case your request
10 | has already been raised.
11 | - type: textarea
12 | attributes:
13 | label: Description
14 | description: Is your feature request related to a problem? Please describe.
15 | validations:
16 | required: true
17 | - type: textarea
18 | attributes:
19 | label: Solution
20 | description: Explain the solution you would like to see. Please also provide alternatives to your solution.
21 | validations:
22 | required: true
23 | - type: checkboxes
24 | attributes:
25 | label: Code of Conduct
26 | description: By submitting this issue, you agree to follow our [Code of Conduct](../CODE_OF_CONDUCT.md)
27 | options:
28 | - label: I agree to follow this project's Code of Conduct.
29 | required: true
30 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | **Type of change:**
2 |
3 | - [ ] Refactor
4 | - [ ] Performance improvement
5 | - [ ] New feature
6 | - [ ] Bug fix
7 | - [ ] Other (please describe):
8 |
9 | **Please describe the changes this PR makes and why it should be merged:**
10 |
11 | **Status:**
12 |
13 | - [ ] I've added/modified unit tests relevant to my change / not needed
14 | - [ ] This PR contains breaking changes
15 | - [ ] This PR doesn't include changes to the code
16 |
--------------------------------------------------------------------------------
/.github/problem-matchers/tsc.json:
--------------------------------------------------------------------------------
1 | {
2 | "problemMatcher": [
3 | {
4 | "owner": "tsc",
5 | "pattern": [
6 | {
7 | "regexp": "^(?:\\s+\\d+\\>)?([^\\s].*)\\((\\d+),(\\d+)\\)\\s*:\\s+(error|warning|info)\\s+(\\w{1,2}\\d+)\\s*:\\s*(.*)$",
8 | "file": 1,
9 | "line": 2,
10 | "column": 3,
11 | "severity": 4,
12 | "code": 5,
13 | "message": 6
14 | }
15 | ]
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | name: CodeQL Analysis
2 |
3 | on:
4 | push:
5 | pull_request:
6 | schedule:
7 | - cron: '0 0 * * 1'
8 |
9 | jobs:
10 | codeql-build:
11 | name: CodeQL analysis
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - name: Checkout repository
16 | uses: actions/checkout@v4
17 |
18 | - name: Initialize CodeQL
19 | uses: github/codeql-action/init@v3
20 |
21 | - name: Auto-build
22 | uses: github/codeql-action/autobuild@v3
23 |
24 | - name: Perform CodeQL analysis
25 | uses: github/codeql-action/analyze@v3
26 |
--------------------------------------------------------------------------------
/.github/workflows/continuous-integration.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | pull_request:
6 |
7 | jobs:
8 | run-eslint:
9 | name: Run ESLint
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - name: Checkout repository
14 | uses: actions/checkout@v4
15 |
16 | - name: Install pnpm
17 | uses: pnpm/action-setup@v4
18 |
19 | - name: Install Node.js v22
20 | uses: actions/setup-node@v4
21 | with:
22 | node-version: 22
23 | cache: pnpm
24 |
25 | - name: Install dependencies
26 | run: pnpm install --frozen-lockfile
27 |
28 | - name: Run ESLint
29 | run: pnpm lint
30 |
31 | unit-tests:
32 | name: Run unit tests
33 | runs-on: ubuntu-latest
34 | strategy:
35 | matrix:
36 | node-version: [18.x, 20.x, 22.x]
37 |
38 | steps:
39 | - name: Checkout repository
40 | uses: actions/checkout@v4
41 |
42 | - name: Install pnpm
43 | uses: pnpm/action-setup@v4
44 |
45 | - name: Install Node.js
46 | uses: actions/setup-node@v4
47 | with:
48 | node-version: ${{ matrix.node-version }}
49 | cache: pnpm
50 |
51 | - name: Install dependencies
52 | run: pnpm install --frozen-lockfile
53 |
54 | - name: Run unit tests
55 | run: pnpm test:ci
56 |
57 | - name: Upload coverage
58 | uses: codecov/codecov-action@v5
59 |
60 | build-project:
61 | name: Compile source code
62 | runs-on: ubuntu-latest
63 |
64 | steps:
65 | - name: Checkout repository
66 | uses: actions/checkout@v4
67 |
68 | - name: Install pnpm
69 | uses: pnpm/action-setup@v4
70 |
71 | - name: Install Node.js v22
72 | uses: actions/setup-node@v4
73 | with:
74 | node-version: 22
75 | cache: pnpm
76 |
77 | - name: Install dependencies
78 | run: pnpm install --frozen-lockfile
79 |
80 | - name: Register problem matcher
81 | run: echo "##[add-matcher].github/problem-matchers/tsc.json"
82 |
83 | - name: Compile TypeScript
84 | run: pnpm build
85 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Deps
2 | node_modules
3 |
4 | # Build artifacts
5 | dist
6 |
7 | # Coverage
8 | coverage
9 | *.lcov
10 | .nyc_output
11 |
12 | # Logs
13 | pnpm-debug.log*
14 | *.log
15 |
16 | # IDE
17 | .idea
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | dist
3 |
--------------------------------------------------------------------------------
/.prettierrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "printWidth": 120,
3 | "useTabs": true,
4 | "singleQuote": true,
5 | "quoteProps": "as-needed",
6 | "trailingComma": "all",
7 | "endOfLine": "auto",
8 | "overrides": [
9 | {
10 | "files": ["README.md"],
11 | "options": {
12 | "printWidth": 80
13 | }
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "typescript.tsdk": "node_modules/typescript/lib"
3 | }
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines.
4 |
5 | ### [0.4.3](https://github.com/jo3-l/obscenity/compare/v0.4.2...v0.4.3) (2025-01-26)
6 |
7 |
8 | ### Bug Fixes
9 |
10 | * **preset/english:** match 'shit' at end of word ([0299b49](https://github.com/jo3-l/obscenity/commit/0299b4978dec6d218a4e004fe20962a79500fe7c)), closes [#47](https://github.com/jo3-l/obscenity/issues/47)
11 |
12 | ### [0.4.2](https://github.com/jo3-l/obscenity/compare/v0.4.1...v0.4.2) (2025-01-18)
13 |
14 |
15 | ### Features
16 |
17 | * add more characters to leet transformer ([#78](https://github.com/jo3-l/obscenity/issues/78)) ([fa673e6](https://github.com/jo3-l/obscenity/commit/fa673e66226e13388401274610e7d1bd0801ade0))
18 |
19 |
20 | ### Bug Fixes
21 |
22 | * **censor:** don't generate the same character twice in a row ([#85](https://github.com/jo3-l/obscenity/issues/85)) ([58f2715](https://github.com/jo3-l/obscenity/commit/58f271556aa878e619457054f8a2f423e8b574ca)), closes [#82](https://github.com/jo3-l/obscenity/issues/82)
23 | * **preset/english:** add word boundary to 'shit' pattern ([9554e7c](https://github.com/jo3-l/obscenity/commit/9554e7cc7b796f64a80baa272ed3e49ad03466a3)), closes [#93](https://github.com/jo3-l/obscenity/issues/93)
24 | * **preset/english:** whitelist "fick" ([#88](https://github.com/jo3-l/obscenity/issues/88)) ([40f66fb](https://github.com/jo3-l/obscenity/commit/40f66fb17524f49b1e4be6a2fe1037f3e1b468c2))
25 |
26 | ### [0.4.1](https://github.com/jo3-l/obscenity/compare/v0.4.0...v0.4.1) (2024-12-03)
27 |
28 |
29 | ### Bug Fixes
30 |
31 | * **preset/english:** add "fickle" to whitelist ([#87](https://github.com/jo3-l/obscenity/issues/87)) ([da754da](https://github.com/jo3-l/obscenity/commit/da754da8d42cf4b36534141b2ceafaa4810b99b5))
32 | * **preset/english:** remove erroneous patterns for `dick` ([e43d502](https://github.com/jo3-l/obscenity/commit/e43d50260d8f3c55374bd1da65be0dff33a1fd6d)), closes [#86](https://github.com/jo3-l/obscenity/issues/86)
33 |
34 | ## [0.4.0](https://github.com/jo3-l/obscenity/compare/v0.3.1...v0.4.0) (2024-08-02)
35 |
36 |
37 | ### ⚠ BREAKING CHANGES
38 |
39 | * **regexp-matcher:** Passing an empty whitelisted term to the RegExpMatcher will result in a runtime error.
40 |
41 | This was unsupported previously and likely did not work correctly. Make it a real error.
42 |
43 | ### Bug Fixes
44 |
45 | * **regexp-matcher:** advance index correctly in whitelist matcher ([ebf95ad](https://github.com/jo3-l/obscenity/commit/ebf95add62be8297f693ca6d8aafefc10afc1a8b)), closes [#49](https://github.com/jo3-l/obscenity/issues/49)
46 | * **regexp-matcher:** correctly remap to original indices in all cases ([3a49579](https://github.com/jo3-l/obscenity/commit/3a49579f3c242d3e159e88707df090e3f6dc0121)), closes [#71](https://github.com/jo3-l/obscenity/issues/71)
47 | * **regexp-matcher:** reject empty whitelist terms ([9a46113](https://github.com/jo3-l/obscenity/commit/9a461130b98920e22d5acf92650146ae48d2226b))
48 |
49 | ### [0.3.1](https://github.com/jo3-l/obscenity/compare/v0.3.0...v0.3.1) (2024-07-17)
50 |
51 | ## [0.3.0](https://github.com/jo3-l/obscenity/compare/v0.2.2...v0.3.0) (2024-07-17)
52 |
53 |
54 | ### ⚠ BREAKING CHANGES
55 |
56 | * The library no longer exports a version constant.
57 |
58 | * drop version constant ([2810674](https://github.com/jo3-l/obscenity/commit/2810674de20d82d7372c617d2e8ef76e911f27ad))
59 |
60 | ### [0.2.2](https://github.com/jo3-l/obscenity/compare/v0.2.1...v0.2.2) (2024-07-17)
61 |
62 |
63 | ### Features
64 |
65 | * **english-preset:** add more blacklisted terms ([#50](https://github.com/jo3-l/obscenity/issues/50)) ([4653de5](https://github.com/jo3-l/obscenity/commit/4653de51e63bd3457daca57316c2b2c851752072))
66 |
67 |
68 | ### Bug Fixes
69 |
70 | * **english-preset:** whitelist 'kung-fu' ([d60b4f4](https://github.com/jo3-l/obscenity/commit/d60b4f4b766592785ba7c9c51d6d0607c5f26c57)), closes [#67](https://github.com/jo3-l/obscenity/issues/67)
71 |
72 | ### [0.2.1](https://github.com/jo3-l/obscenity/compare/v0.2.0...v0.2.1) (2024-03-03)
73 |
74 |
75 | ### Features
76 |
77 | * **english-preset:** add more blacklisted terms ([#50](https://github.com/jo3-l/obscenity/issues/50)) ([c189595](https://github.com/jo3-l/obscenity/commit/c189595b09554899aeead3dd070d36f8f3269150))
78 |
79 | ## [0.2.0](https://github.com/jo3-l/obscenity/compare/v0.1.4...v0.2.0) (2024-01-05)
80 |
81 |
82 | ### ⚠ BREAKING CHANGES
83 |
84 | * **english-preset:** Using the default English preset, Obscenity will no longer strip non-alphabetic characters from the input text before matching.
85 |
86 | This addresses a class of egregious false negatives in previous versions (see #23), but introduces a regression where cases such as 'f u c k' (with the space) will no longer be detected by default. We expect to provide a more comprehensive fix in the next minor release.
87 |
88 | If desired, it remains possible to revert to the previous behavior by providing a custom set of transformers to the matcher.
89 | * **matchers:** The NfaMatcher class has been removed. Use the RegExpMatcher instead.
90 |
91 | ### Features
92 |
93 | * **english-preset:** blacklist 'shit' by default ([b0d90aa](https://github.com/jo3-l/obscenity/commit/b0d90aa4b7dd6d15a2105490f1d2b0c87e58bdcf)), closes [#47](https://github.com/jo3-l/obscenity/issues/47)
94 |
95 |
96 | ### Bug Fixes
97 |
98 | * **english-preset:** don't include skip-non-alphabetic transformer ([620c721](https://github.com/jo3-l/obscenity/commit/620c721662c3ddd8d8ca8838861b9c4ba3ea66e7)), closes [#23](https://github.com/jo3-l/obscenity/issues/23) [#46](https://github.com/jo3-l/obscenity/issues/46)
99 | * **english-preset:** remove extraneous patterns for n-word ([e135be5](https://github.com/jo3-l/obscenity/commit/e135be58510149db9b678801a2e6e3468b3bd4bb)), closes [#48](https://github.com/jo3-l/obscenity/issues/48)
100 | * **pkg:** ensure types resolve in ESM ([718da8a](https://github.com/jo3-l/obscenity/commit/718da8a7399c0dcf948fbe8041714ad6d61c9f73)), closes [#44](https://github.com/jo3-l/obscenity/issues/44)
101 |
102 |
103 | * **matchers:** remove NfaMatcher ([b69c21d](https://github.com/jo3-l/obscenity/commit/b69c21d178ac5e3270fd35d2b876263045a67d81))
104 |
105 | ### [0.1.4](https://github.com/jo3-l/obscenity/compare/v0.1.1...v0.1.4) (2023-06-06)
106 |
107 | ### Bug Fixes
108 |
109 | - **matchers:** gracefully handle empty patterns ([#31](https://github.com/jo3-l/obscenity/issues/31)) ([79cfa63](https://github.com/jo3-l/obscenity/commit/79cfa630c964be79d1dc16eb0e5d65af4d68e7ab))
110 |
111 | ### 0.1.1, 0.1.2, 0.1.3
112 |
113 | Versions skipped due to temporary issue with release workflow.
114 |
115 | ## 0.1.0 (2021-08-27)
116 |
117 | Initial release.
118 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | If you wish to contribute to Obscenity, feel free to fork the repository and submit a pull request. We use [ESLint](https://eslint.org/) and [Prettier](https://prettier.io/) to enforce a consistent code style and catch possible issues; setting up relevant plugins for your editor of choice is highly recommended.
4 |
5 | ## Setup
6 |
7 | **Prerequisites:** Node.js (preferably the latest LTS release; any version >= 18 will work, matching the versions tested in CI), and [pnpm](https://pnpm.io/) for managing packages.
8 |
9 | 1. Fork & clone the main repository.
10 | 2. Create a new branch for your changes: `git checkout -b feat/my-feature`.
11 | 3. Run `pnpm install` to install all dependencies.
12 | 4. Make your changes.
13 | 5. Run `pnpm lint` and `pnpm style` to lint and format the code. Then, run `pnpm test` to make sure all the tests are still passing after your change.
14 | 6. Commit your changes (make sure you follow our commit convention, which is based off [Angular's commit message guidelines](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/conventional-changelog-angular)).
15 | 7. Submit a [pull request](https://github.com/jo3-l/obscenity/pulls).
16 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # The MIT License (MIT)
2 |
3 | Copyright © 2021 Joe L.
4 |
5 | Permission is hereby granted, free of charge, to any person
6 | obtaining a copy of this software and associated documentation
7 | files (the “Software”), to deal in the Software without
8 | restriction, including without limitation the rights to use,
9 | copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the
11 | Software is furnished to do so, subject to the following
12 | conditions:
13 |
14 | The above copyright notice and this permission notice shall be
15 | included in all copies or substantial portions of the Software.
16 |
17 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 | OTHER DEALINGS IN THE SOFTWARE.
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Obscenity
2 |
3 | > Robust, extensible profanity filter for Node.js.
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | ## Why Obscenity?
12 |
13 | - **Accurate:** Though Obscenity is far from perfect (as with all profanity filters), it makes reducing false positives as simple as possible: adding whitelisted phrases is as easy as adding a new string to an array, and using word boundaries is equally simple.
14 | - **Robust:** Obscenity's transformer-based design allows it to match on variants of phrases other libraries are typically unable to, e.g. `fuuuuuuuckkk`, `ʃṳ𝒸𝗄`, `wordsbeforefuckandafter` and so on. There's no need to manually write out all the variants either: just adding the pattern `fuck` will match all of the cases above by default.
15 | - **Extensible:** With Obscenity, you aren't locked into anything - removing phrases that you don't agree with from the default set of words is trivial, as is disabling any transformations you don't like (perhaps you feel that leet-speak decoding is too error-prone for you).
16 |
17 | ## Installation
18 |
19 | ```shell
20 | $ npm install obscenity
21 | $ yarn add obscenity
22 | $ pnpm add obscenity
23 | ```
24 |
25 | ## Example usage
26 |
27 | First, import Obscenity:
28 |
29 | ```javascript
30 | const {
31 | RegExpMatcher,
32 | TextCensor,
33 | englishDataset,
34 | englishRecommendedTransformers,
35 | } = require('obscenity');
36 | ```
37 |
38 | Or, in TypeScript/ESM:
39 |
40 | ```typescript
41 | import {
42 | RegExpMatcher,
43 | TextCensor,
44 | englishDataset,
45 | englishRecommendedTransformers,
46 | } from 'obscenity';
47 | ```
48 |
49 | Now, we can create a new matcher using the English preset.
50 |
51 | ```javascript
52 | const matcher = new RegExpMatcher({
53 | ...englishDataset.build(),
54 | ...englishRecommendedTransformers,
55 | });
56 | ```
57 |
58 | Now, we can use our matcher to search for profanities in text. Here are two examples of what you can do:
59 |
60 | **Check if there are any matches in some text:**
61 |
62 | ```javascript
63 | if (matcher.hasMatch('fuck you')) {
64 | console.log('The input text contains profanities.');
65 | }
66 | // The input text contains profanities.
67 | ```
68 |
69 | **Output the positions of all matches along with the original word used:**
70 |
71 | ```javascript
72 | // Pass "true" as the "sorted" parameter so the matches are sorted by their position.
73 | const matches = matcher.getAllMatches('ʃ𝐟ʃὗƈk ỹоứ 𝔟ⁱẗ𝙘ɦ', true);
74 | for (const match of matches) {
75 | const { phraseMetadata, startIndex, endIndex } =
76 | englishDataset.getPayloadWithPhraseMetadata(match);
77 | console.log(
78 | `Match for word ${phraseMetadata.originalWord} found between ${startIndex} and ${endIndex}.`,
79 | );
80 | }
81 | // Match for word fuck found between 0 and 6.
82 | // Match for word bitch found between 12 and 18.
83 | ```
84 |
85 | **Censoring matched text:**
86 |
87 | To censor text, we'll need to import another class: the `TextCensor`.
88 | Some other imports and creation of the matcher have been elided for simplicity.
89 |
90 | ```javascript
91 | const { TextCensor, ... } = require('obscenity');
92 | // ...
93 | const censor = new TextCensor();
94 | const input = 'fuck you little bitch';
95 | const matches = matcher.getAllMatches(input);
96 | console.log(censor.applyTo(input, matches));
97 | // %@$% you little **%@%
98 | ```
99 |
100 | This is just a small slice of what Obscenity can do: for more, check out the [documentation](#documentation).
101 |
102 | ## Accuracy
103 |
104 | > **Note:** As with all swear filters, Obscenity is not perfect (nor will it ever be). Use its output as a heuristic, and not as the sole judge of whether some content is appropriate or not.
105 |
106 | With the English preset, Obscenity (correctly) finds matches in all of the following texts:
107 |
108 | - you are a little **fuck**er
109 | - **fk** you
110 | - **ffuk** you
111 | - i like **a$$es**
112 | - ʃ𝐟ʃὗƈk ỹоứ
113 |
114 | ...and it **does not match** on the following:
115 |
116 | - the **pen is** mightier than the sword
117 | - i love banan**as s**o yeah
118 | - this song seems really b**anal**
119 | - g**rape**s are really yummy
120 |
121 | ## Documentation
122 |
123 | For a step-by-step guide on how to use Obscenity, check out the [guide](./docs/guide).
124 |
125 | Otherwise, refer to the [auto-generated API documentation](./docs/reference).
126 |
127 | ## Contributing
128 |
129 | Issues can be reported using the [issue tracker](https://github.com/jo3-l/obscenity/issues).
130 | If you'd like to submit a pull request, please read the [contribution guide](./CONTRIBUTING.md) first.
131 |
132 | ## Author
133 |
134 | **Obscenity** © [Joe L.](https://github.com/jo3-l/) under the MIT license. Authored and maintained by Joe L.
135 |
136 | > GitHub [@jo3-l](https://github.com/jo3-l)
137 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | Obscenity's documentation is composed of a [reference](./reference/) automatically generated using [TypeDoc](https://typedoc.org/) and a brief [guide](./guide/) comprising a number of hand-written articles.
4 |
--------------------------------------------------------------------------------
/docs/guide/README.md:
--------------------------------------------------------------------------------
1 | # Welcome
2 |
3 | Welcome to the Obscenity guide, a collection of articles that teach you how to use Obscenity in a step-by-step fashion.
4 |
5 | Though you can read it in any order you wish, we recommend the following order:
6 |
7 | - [Patterns](./patterns.md)
8 | - [Transformers](./transformers.md)
9 | - [Matchers](./matchers.md)
10 | - [Datasets](./datasets.md)
11 | - [Censoring text](./censoring-text.md)
12 |
--------------------------------------------------------------------------------
/docs/guide/censoring-text.md:
--------------------------------------------------------------------------------
1 | # Censoring Profane Phrases
2 |
3 | > Learn how to censor text with Obscenity's `TextCensor`.
4 |
5 | A common strategy to deal with content containing banned phrases is to _censor_
6 | them by replacing the offending parts of the content with placeholders.
7 |
8 | Obscenity's `TextCensor` class makes this simple. Consider the following basic
9 | example:
10 |
11 | ```typescript
12 | import { TextCensor, RegExpMatcher, englishDataset, englishRecommendedTransformers } from 'obscenity';
13 |
14 | const matcher = new RegExpMatcher({ ...englishDataset.build(), ...englishRecommendedTransformers });
15 | const censor = new TextCensor(); // (1)
16 |
17 | const text = 'f u c k you!';
18 | const matches = matcher.getAllMatches(text);
19 | console.log(censor.applyTo(text, matches)); // (2)
20 | //> "@$** you!"
21 | ```
22 |
23 | We start by constructing a `TextCensor` (1). Then, we apply this censor to a
24 | piece of content by invoking the `applyTo` method with the original text along
25 | with the set of matches (2).
26 |
27 | Note that in the above example the offending content has been replaced with
28 | [grawlix](https://en.wikipedia.org/wiki/Grawlix). However, if this is
29 | undesirable for your use-case, the replacement text can be easily customized
30 | by providing your own _censor strategy_.
31 |
32 | ## Censor Strategies
33 |
34 | A censor strategy specifies how to generate replacement text given a match.
35 | Under the hood, a censor strategy is simply a function that receives a _censor
36 | context_ and returns a replacement string.
37 |
38 | The most basic type of censor strategy simply returns a fixed replacement
39 | string:
40 |
41 | ```typescript
42 | const fudgeStrategy = () => 'fudge';
43 | ```
44 |
45 | To use this censor strategy, we use the `setStrategy` method on our
46 | `TextCensor`:
47 |
48 | ```typescript
49 | const censor = new TextCensor().setStrategy(fudgeStrategy);
50 | // ...
51 | console.log(censor.applyTo(text, matches));
52 | //> "fudge you!"
53 | ```
54 |
55 | We can also create more complex strategies that generate output dynamically
56 | based on the specific text matched. For instance, let us try writing a strategy
57 | that will generate a string of asterisks of varying length ⸺ `ass` should become
58 | `***`, `fuck` `****`, and so on. To do this, we can use the `matchLength` property
59 | of the censor context:
60 |
61 | ```typescript
62 | const asteriskStrategy = (ctx: CensorContext) => '*'.repeat(ctx.matchLength);
63 | ```
64 |
65 | which works as expected:
66 |
67 | ```typescript
68 | const censor = new TextCensor().setStrategy(asteriskStrategy);
69 | // ...
70 | console.log(censor.applyTo(text, matches));
71 | //> "**** you!"
72 | ```
73 |
74 | Other than the match length, censor contexts also include the following data:
75 |
76 | - All the properties of `MatchPayload`s, as `CensorContext` extends
77 | `MatchPayload`. Thus, `ctx.termId`, `ctx.startIndex`, `ctx.endIndex`, and so
78 | on are all accessible.
79 | - `input` ⸺ The input text.
80 | - `overlapsAtStart` ⸺ Whether the current match overlaps at the start with some other match.
81 | - `overlapsAtEnd` ⸺ Whether the current match overlaps at the end with some other match.
82 |
83 | ## Built-in Censor Strategies
84 |
85 | Obscenity exports the two censor strategies discussed in this article to save
86 | you the work of implementing them yourself:
87 |
88 | - `grawlixCensorStrategy()` ⸺ Generates grawlix; this is the default strategy.
89 | - `asteriskCensorStrategy()` ⸺ Generates repeated asterisks.
90 |
91 | In addition, a number of utilities are provided to aid in writing custom censor
92 | strategies:
93 |
94 | - `fixedPhraseCensorStrategy()` ⸺ Returns a censor strategy that produces a
95 | fixed phrase. For example, `fixedPhraseCensorStrategy('fudge')` always returns
96 | `fudge`.
97 | - `fixedCharCensorStrategy()` ⸺ Returns a censor strategy that produces the
98 | input character repeated an appropriate number of times. For example,
99 | `fixedCharCensorStrategy('$')` might return `$`, `$$`, `$$$`, and so on.
100 | - `randomCharFromSetCensorStrategy()` ⸺ Returns a censor strategy that produces
101 | random characters from the set of characters given, repeated an appropriate
102 | number of times. For example, `randomCharFromSetCensorStrategy('%&')` might
103 | return `%%&`, `&%&&`, and so on.
104 | - `keepStartCensorStrategy()` ⸺ Extends another censor strategy by keeping the
105 | first character matched. For example,
106 | `keepStartCensorStrategy(asteriskStrategy)` might produce `f***` as the
107 | replacement string.
108 | - `keepEndCensorStrategy()` ⸺ Same as above, but keeps the last character
109 | matched instead.
110 |
111 | ---
112 |
113 | Great, now you know all about censoring text (and if you've read the guide in
114 | order, all of Obscenity's features)! If you have further questions, consult the
115 | reference documentation.
116 |
--------------------------------------------------------------------------------
/docs/guide/datasets.md:
--------------------------------------------------------------------------------
1 | # Datasets
2 |
3 | > Learn about datasets, a way to organize blacklisted and whitelisted terms.
4 |
5 | Say that you want to know what the original word associated with a match was. You could do this with a giant chain of `if-else`s:
6 |
7 | ```typescript
8 | const patterns = [
9 | { id: 0, pattern: pattern`fck` },
10 | { id: 1, pattern: pattern`fuck` },
11 | { id: 2, pattern: pattern`bish` },
12 | { id: 3, pattern: pattern`bitch` },
13 | // ...
14 | ];
15 |
16 | const matcher = new RegExpMatcher({ ... });
17 | const payloads = matcher.getAllMatches(text);
18 | for (const payload of payloads) {
19 | if (payload.termId === 0 || payload.termId === 1) console.log('Original word: fuck');
20 | else if (payload.termId === 2 || payload.termId === 3) console.log('Original word: bitch');
21 | // ...
22 | }
23 | ```
24 |
25 | ...but clearly this becomes quite unmaintainable with many patterns. What's the solution?
26 |
27 | **Datasets** can come in handy here. They support creating groups of blacklisted/whitelisted terms ("phrases") and associating arbitrary metadata with them. To see what's meant by this, see the following example:
28 |
29 | ```typescript
30 | import { DataSet, pattern } from 'obscenity';
31 |
32 | const dataset = new DataSet<{ originalWord: string }>()
33 | // addPhrase() adds a new phrase to the dataset.
34 | .addPhrase((phrase) =>
35 | phrase
36 | // setMetadata() sets the metadata of the phrase.
37 | .setMetadata({ originalWord: 'fuck' })
38 | // addPattern() associates a pattern with the phrase.
39 | .addPattern(pattern`fck`)
40 | .addPattern(pattern`fuck`),
41 | )
42 | .addPhrase((phrase) =>
43 | phrase
44 | .setMetadata({ originalWord: 'bitch' })
45 | .addPattern(pattern`bish`)
46 | .addPattern(pattern`bitch`)
47 | // addWhitelistedTerm() associates a whitelisted term with the phrase.
48 | .addWhitelistedTerm('abish'),
49 | );
50 | ```
51 |
52 | To use our dataset with a matcher, we can call the `build()` method, which will produce an object structured like `{ blacklistedTerms, whitelistedTerms }`, which we can then use in the matcher options:
53 |
54 | ```typescript
55 | const built = dataset.build();
56 | const matcher = new RegExpMatcher({
57 | blacklistedTerms: built.blacklistedTerms,
58 | whitelistedTerms: built.whitelistedTerms,
59 | // Other options go here.
60 | });
61 |
62 | // Or, using spread notation:
63 | const matcher = new RegExpMatcher({
64 | ...built,
65 | // Other options go here.
66 | });
67 | ```
68 |
69 | But how does this help us solve the original problem (getting the original word from a match)? Simple. We can use the `getPayloadWithPhraseMetadata` method:
70 |
71 | ```typescript
72 | const payloads = matcher.getAllMatches(input);
73 | const payloadsWithMetadata = payloads.map(dataset.getPayloadWithPhraseMetadata);
74 | ```
75 |
76 | The `getPayloadWithPhraseMetadata` will return a copy of the original match payload with a new property added: `phraseMetadata`, which is the phrase metadata associated with the term that matched.
77 |
78 | So, to get the original word that matched for the first payload, we could just use the following:
79 |
80 | ```typescript
81 | const originalWord = payloadsWithMetadata[0].phraseMetadata!.originalWord;
82 | ```
83 |
84 | Though associating metadata with phrases is one of the main features of the `DataSet`, it's by no means the only one, as we'll see in the next section.
85 |
86 | ## Extending existing datasets
87 |
88 | Say that you would like to use the English preset, but you don't really agree with one of the words in there. That's simple to fix: we can just extend the dataset of English words:
89 |
90 | ```typescript
91 | const myDataset = new DataSet<{ originalWord: string }>()
92 | // addAll() adds all the data from the dataset passed.
93 | .addAll(englishDataset)
94 | // removePhrasesIf() removes phrases from the current dataset if the function provided
95 | // returns true.
96 | .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'bitch');
97 | ```
98 |
99 | The `addAll` method adds all the phrases and whitelisted terms from the dataset passed to the current one, making it easy to build on top of existing datasets.
100 |
101 | Using our new dataset is equally as simple:
102 |
103 | ```typescript
104 | const matcher = new RegExpMatcher({
105 | ...myDataset.build(),
106 | // Other options go here.
107 | });
108 | ```
109 |
110 | ---
111 |
112 | **Next up: [Censoring text](./censoring-text.md).**
113 |
--------------------------------------------------------------------------------
/docs/guide/matchers.md:
--------------------------------------------------------------------------------
1 | # Matchers
2 |
3 | > Learn about Obscenity's `Matcher` interface and its implementations.
4 |
5 | We've previously discussed patterns and transformers. It's time to learn about how to use Obscenity to search for blacklisted terms in text, while respecting whitelisted terms.
6 |
7 | To facilitate this, Obscenity provides the `RegExpMatcher`, which -- as the name suggests -- implements matching using regular expressions and string searching methods. At a high level, all it does is:
8 |
9 | ```
10 | apply transformations to text before matching whitelisted terms
11 | find whitelisted terms in text
12 |
13 | apply transformations to text before matching blacklisted terms
14 | for each blacklisted term
15 | for all matches of the blacklisted term in the text
16 | if a whitelisted term did not match this part of the text
17 | emit match
18 | ```
19 |
20 | For now, the `RegExpMatcher` is the only matcher implementation offered by Obscenity, though this may change in future versions.
21 |
22 | ## Providing matcher options
23 |
24 | Matchers support several options:
25 |
26 | - `blacklistedTerms` - a list of blacklisted terms. Blacklisted terms are objects with a unique ID that identify them and a pattern, e.g. `` { id: 0, pattern: pattern`my pattern` } ``.
27 |
28 | > **Tip:** If you only want to supply a list of patterns (as you don't care about knowing exactly which pattern matched), you can use the `assignIncrementingIds` utility:
29 | >
30 | > ```typescript
31 | > import { RegExpMatcher, assignIncrementingIds, pattern } from 'obscenity';
32 | >
33 | > const matcher = new RegExpMatcher({
34 | > blacklistedTerms: assignIncrementingIds([pattern`my pattern`]),
35 | > });
36 | > ```
37 |
38 | - `whitelistedTerms` - a list of whitelisted terms, which are just strings.
39 |
40 | - `blacklistMatcherTransformers` - a set of transformers that should be applied to the text before matching blacklisted terms. They will be applied in the order they are given.
41 |
42 | - `whitelistMatcherTransformers` - a set of transformers that should be applied to the text before matching whitelisted terms. They will be applied in the order they are given.
43 |
44 | ### Example
45 |
46 | ```typescript
47 | import { RegExpMatcher, pattern } from 'obscenity';
48 |
49 | const matcher = new RegExpMatcher({
50 | blacklistedTerms: [
51 | { id: 0, pattern: pattern`hi` },
52 | { id: 1, pattern: pattern`bye` },
53 | ],
54 | whitelistedTerms: ['achingly'],
55 | blacklistMatcherTransformers: [skipSpaces],
56 | whitelistMatcherTransformers: [],
57 | });
58 | ```
59 |
60 | This will match `hi` and `bye` (ignoring spaces) unless the `hi` is part of `achingly` (not ignoring spaces).
61 |
62 | ## Presets
63 |
64 | While coming up with your own list of blacklisted terms / whitelisted terms / transformers is a possibility, it does take quite a bit of time if you want to make sure you have few false positives and match as many variants as possible.
65 |
66 | To save you some work, Obscenity features _presets_, which are sets of blacklisted terms, whitelisted terms, and transformers. For example, to use the English preset:
67 |
68 | ```typescript
69 | import { RegExpMatcher, englishDataset, englishRecommendedTransformers } from 'obscenity';
70 |
71 | const matcher = new RegExpMatcher({
72 | ...englishDataset.build(),
73 | ...englishRecommendedTransformers,
74 | });
75 | ```
76 |
77 | ### Available presets
78 |
79 | The English preset is the only one available at the moment, but more may be added in the future.
80 |
81 | ## Using the matcher
82 |
83 | Now, we can use our matcher to answer some questions about our text. Namely, we can ask it whether the text contains any blacklisted terms, and where those blacklisted terms appeared.
84 |
85 | To check whether the text contains any blacklisted terms, we can use the `hasMatch()` method:
86 |
87 | ```typescript
88 | const hasMatch = matcher.hasMatch(input);
89 | ```
90 |
91 | This should be preferred if you do not need to know which terms matched and where they matched.
92 |
93 | If you do need to obtain more information about the matches, though, you can use `getAllMatches()`:
94 |
95 | ```typescript
96 | const payloads = matcher.getAllMatches(input);
97 | ```
98 |
99 | You may notice that the resulting list of matches is not sorted. That is, matches beginning at a higher index might come before matches beginning at a lower index.
100 | If having a sorted list of matches is a requirement for your code, you can pass `true` as an argument to `getAllMatches`.
101 |
102 | ```typescript
103 | const sortedPayloads = matcher.getAllMatches(input, true);
104 | ```
105 |
106 | `getAllMatches()` returns a list of match payloads, which contain four pieces of information:
107 |
108 | - `termId` - the ID of the term that matched;
109 | - `startIndex` - the start index of the match, inclusive;
110 | - `endIndex` - the end index of the match, inclusive;
111 | - `matchLength` - the number of characters that matched.
112 |
113 | The information emitted may not be enough for your use-case (perhaps you want to track the type of word that was used, what the original word was, etc.). If that's the case, be sure to check out the next article!
114 |
115 | ---
116 |
117 | **Next up: [Datasets](./datasets.md).**
118 |
--------------------------------------------------------------------------------
/docs/guide/patterns.md:
--------------------------------------------------------------------------------
1 | # Patterns
2 |
3 | > Learn about Obscenity's custom pattern syntax.
4 |
5 | **Patterns** are used to specify blacklisted words. To ease matching variations of words with only small changes, they support some special syntax (namely, wildcards, optional expressions, and boundary assertions). For example, the pattern `f?ck` matches `f`, then any character, then `ck`; and matches on `fuck`, `fbck`, `fyck`, `fack`, and so on.
6 |
7 | ## Why a custom pattern syntax?
8 |
9 | This might sound similar to [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions), which are widely used for similar purposes. Why not just use them instead of inventing some custom syntax? A few reasons:
10 |
11 | - Regular expressions are overkill for profanity filtering in most cases. Their expressive syntax is, for the most part, completely unneeded as most variations are normalized before matching (see the article on [transformers](./transformers.md)).
12 | - Not supporting all the features of regular expressions can make a more efficient implementation in certain cases. In addition to a simpler matcher implementation using regular expressions (ironically) and string searching methods, Obscenity also features a matcher implementation using finite automata techniques which searches for patterns in parallel, which may be useful if you have a large number of patterns.
13 |
14 | ## Pattern syntax
15 |
16 | Most characters match _literally_. That is, `a` matches an `a`, `book` matches `book`, and so on. However, there are three special expressions that are available:
17 |
18 | - **Wildcards:** A `?` matches any character.
19 | - **Optional expressions:** Wrapping an expression in a set of square brackets (`[]`) makes it _optional_: `a[bc]` matches either `a` or `abc`.
20 | - **Boundary assertions:** Placing a pipe (`|`) at the start or end of the pattern asserts position at a word boundary: `|tit` matches `tit` and `tits` but not `substitute`. Similarly, `chick|` matches
21 | `chick` but not `chicken`.
22 |
23 | A special character mentioned above can be escaped using a backslash (`\`): `\?` matches `?` instead of a wildcard.
24 |
25 | ## Using patterns with Obscenity
26 |
27 | A pattern may be created using the `parseRawPattern()` function:
28 |
29 | ```typescript
30 | import { parseRawPattern } from 'obscenity';
31 |
32 | const p = parseRawPattern('f?ck');
33 | ```
34 |
35 | However, it is usually more convenient to use the `pattern` [tagged template](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#tagged_templates):
36 |
37 | ```typescript
38 | import { pattern } from 'obscenity';
39 |
40 | const p = pattern`f?ck`;
41 | ```
42 |
43 | Note the lack of `()` when calling `pattern` and the usage of backticks.
44 |
45 | Due to how the `pattern` tagged template works internally, it is not necessary to double-escape backslashes:
46 |
47 | ```typescript
48 | import { pattern } from 'obscenity';
49 |
50 | const p = pattern`\[`;
51 | ```
52 |
53 | If you were using `parseRawPattern` instead, the following would be required:
54 |
55 | ```typescript
56 | const p = parseRawPattern('\\[');
57 | ```
58 |
59 | ---
60 |
61 | **Next up: [Transformers](./transformers.md).**
62 |
--------------------------------------------------------------------------------
/docs/guide/transformers.md:
--------------------------------------------------------------------------------
1 | # Transformers
2 |
3 | > Learn all about transformers: what they are, built-in transformers, and how to make your own.
4 |
5 | **Transformers** normalize text before it is passed to the matcher. For example, all of the following could be implemented using transformers:
6 |
7 | - Confusable character resolution: `Ἢἕļľᦞ ш٥ṟlᑰ!` -> `hello world`
8 | - Leet-speak resolution: `h3llo world` -> `hello world`
9 | - Duplicate character collapsing: `heeello world` -> `hello world`
10 |
11 | ## Simple transformers
12 |
13 | In their simplest form, transformers are just functions that map characters to other characters. For example, a transformer that changes `a` to `b` and keeps other characters intact might look like:
14 |
15 | ```typescript
16 | import { createSimpleTransformer } from 'obscenity';
17 |
18 | const a = 'a'.charCodeAt(0);
19 | const b = 'b'.charCodeAt(0);
20 | const changeAToB = createSimpleTransformer((c) => (c === a ? b : c));
21 | ```
22 |
23 | > **Note:** `createSimpleTransformer` is an adapter that returns the input function in a structure suitable for use with matchers, which are discussed in the next article. Don't forget to use it when creating transformers!
24 |
25 | > **Warning:** Note that as transformers take the character _code_ as input rather than a string, implementing the transformer like this:
26 | >
27 | > ```typescript
28 | > const changeAToB = createSimpleTransformer((c) => (c === 'a' ? 'b' : c));
29 | > ```
30 | >
31 | > ...is, unfortunately, incorrect, as `c` is a number and would never be equal to `'a'`. Remember to always use character codes when writing transformers!
32 |
33 | ### Removing characters
34 |
35 | Sometimes, changing characters isn't enough. Perhaps you want to completely ignore a character when matching. As an example, perhaps you want to skip the spaces in `f u c k` so it becomes `fuck`.
36 |
37 | To do this, simply return `undefined` from the transformer, which signifies that the character should be ignored. With this in mind, we can easily write a transformer that skips spaces:
38 |
39 | ```typescript
40 | import { createSimpleTransformer } from 'obscenity';
41 |
42 | const space = ' '.charCodeAt(0);
43 | const skipSpaces = createSimpleTransformer((c) => (c === space ? undefined : c));
44 | ```
45 |
46 | ## Stateful transformers
47 |
48 | The aforementioned type of transformer is inadequate if you want to implement more complicated transformers, though. For example, if we wanted to implement a transformer that collapses duplicate characters, we'd hit a roadblock quite quickly in that we need to store the last character to check whether it's a duplicate, but simple transformers provide no clear way to do so.
49 |
50 | This is where _stateful transformers_ come in handy. Stateful transformers are objects that implement the `StatefulTransformer` interface. More specifically, your object has to have the following methods:
51 |
52 | - `transform(char)`, which takes a character and returns the transformed character.
53 | - `reset()`, which resets any internal state the transformer has.
54 |
55 | With this in mind, we can now write a stateful transformer that ignores duplicate characters:
56 |
57 | ```typescript
58 | class CollapseDuplicates implements StatefulTransformer {
59 | private lastCharacter = -1;
60 |
61 | public transform(char: number) {
62 | if (char === this.lastCharacter) return undefined;
63 | this.lastCharacter = char;
64 | return char;
65 | }
66 |
67 | public reset() {
68 | this.lastCharacter = -1;
69 | }
70 | }
71 | ```
72 |
73 | Now, we can use the `createStatefulTransformer` adapter to get a structure suitable for use with matchers (discussed in the next article):
74 |
75 | ```typescript
76 | import { createStatefulTransformer } from 'obscenity';
77 |
78 | const collapseDuplicates = createStatefulTransformer(() => new CollapseDuplicates());
79 | ```
80 |
81 | ---
82 |
83 | Excellent, you now know all about transformers! Now, let's take a look at the various built-in transformers Obscenity provides out of the box.
84 |
85 | ## Built-in transformers
86 |
87 | Obscenity features a number of built-in transformers for common tasks.
88 |
89 | - **Collapsing duplicate characters** is implemented by the `collapseDuplicatesTransformer()`: `fuuuuuuuck` becomes `fuck`;
90 | - **Resolving confusable Unicode characters** is implemented by the `resolveConfusablesTransformer()`: `Ἢἕļľᦞ ш٥ṟlᑰ!` becomes `hello world!`;
91 | - **Resolving leet-speak** is implemented by the `resolveLeetSpeakTransformer()`: `h3llo world` becomes `hello world`;
92 | - **Skipping non-alphabetic characters** is implemented by the `skipNonAlphabeticTransformer()`: `f.u.c.. k` becomes `fuck`;
93 | - **Converting characters to lower-case** is implemented by the `toAsciiLowerCaseTransformer()`: `fUCk` becomes `fuck`.
94 |
95 | ---
96 |
97 | **Next up: [Matchers](./matchers.md).**
98 |
--------------------------------------------------------------------------------
/docs/reference/.nojekyll:
--------------------------------------------------------------------------------
1 | TypeDoc added this file to prevent GitHub Pages from using Jekyll. You can turn off this behavior by setting the `githubPages` option to false.
--------------------------------------------------------------------------------
/docs/reference/classes/DataSet.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / DataSet
2 |
3 | # Class: DataSet
4 |
5 | Holds phrases (groups of patterns and whitelisted terms), optionally
6 | associating metadata with them.
7 |
8 | ## Type parameters
9 |
10 | | Name | Description |
11 | | :------ | :------ |
12 | | `MetadataType` | Metadata type for phrases. Note that the metadata type is implicitly nullable. |
13 |
14 | ## Table of contents
15 |
16 | ### Constructors
17 |
18 | - [constructor](DataSet.md#constructor)
19 |
20 | ### Methods
21 |
22 | - [addAll](DataSet.md#addall)
23 | - [addPhrase](DataSet.md#addphrase)
24 | - [build](DataSet.md#build)
25 | - [getPayloadWithPhraseMetadata](DataSet.md#getpayloadwithphrasemetadata)
26 | - [removePhrasesIf](DataSet.md#removephrasesif)
27 |
28 | ## Constructors
29 |
30 | ### constructor
31 |
32 | • **new DataSet**<`MetadataType`\>()
33 |
34 | #### Type parameters
35 |
36 | | Name |
37 | | :------ |
38 | | `MetadataType` |
39 |
40 | ## Methods
41 |
42 | ### addAll
43 |
44 | ▸ **addAll**(`other`): [`DataSet`](DataSet.md)<`MetadataType`\>
45 |
46 | Adds all the phrases from the dataset provided to this one.
47 |
48 | **`Example`**
49 |
50 | ```typescript
51 | const customDataset = new DataSet().addAll(englishDataset);
52 | ```
53 |
54 | #### Parameters
55 |
56 | | Name | Type | Description |
57 | | :------ | :------ | :------ |
58 | | `other` | [`DataSet`](DataSet.md)<`MetadataType`\> | Other dataset. |
59 |
60 | #### Returns
61 |
62 | [`DataSet`](DataSet.md)<`MetadataType`\>
63 |
64 | #### Defined in
65 |
66 | [src/dataset/DataSet.ts:29](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L29)
67 |
68 | ___
69 |
70 | ### addPhrase
71 |
72 | ▸ **addPhrase**(`fn`): [`DataSet`](DataSet.md)<`MetadataType`\>
73 |
74 | Adds a phrase to this dataset.
75 |
76 | **`Example`**
77 |
78 | ```typescript
79 | const data = new DataSet<{ originalWord: string }>()
80 | .addPhrase((phrase) => phrase.setMetadata({ originalWord: 'fuck' })
81 | .addPattern(pattern`fuck`)
82 | .addPattern(pattern`f[?]ck`)
83 | .addWhitelistedTerm('Afck'))
84 | .build();
85 | ```
86 |
87 | #### Parameters
88 |
89 | | Name | Type | Description |
90 | | :------ | :------ | :------ |
91 | | `fn` | (`builder`: [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>) => [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> | A function that takes a [[PhraseBuilder]], adds patterns/whitelisted terms/metadata to it, and returns it. |
92 |
93 | #### Returns
94 |
95 | [`DataSet`](DataSet.md)<`MetadataType`\>
96 |
97 | #### Defined in
98 |
99 | [src/dataset/DataSet.ts:75](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L75)
100 |
101 | ___
102 |
103 | ### build
104 |
105 | ▸ **build**(): `Pick`<[`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md), ``"blacklistedTerms"`` \| ``"whitelistedTerms"``\>
106 |
107 | Returns the dataset in a format suitable for usage with the [[RegExpMatcher]].
108 |
109 | **`Example`**
110 |
111 | ```typescript
112 | // With the RegExpMatcher:
113 | const matcher = new RegExpMatcher({
114 | ...dataset.build(),
115 | // additional options here
116 | });
117 | ```
118 |
119 | #### Returns
120 |
121 | `Pick`<[`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md), ``"blacklistedTerms"`` \| ``"whitelistedTerms"``\>
122 |
123 | #### Defined in
124 |
125 | [src/dataset/DataSet.ts:118](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L118)
126 |
127 | ___
128 |
129 | ### getPayloadWithPhraseMetadata
130 |
131 | ▸ **getPayloadWithPhraseMetadata**(`payload`): [`MatchPayloadWithPhraseMetadata`](../README.md#matchpayloadwithphrasemetadata)<`MetadataType`\>
132 |
133 | Retrieves the phrase metadata associated with a pattern and returns a
134 | copy of the match payload with said metadata attached to it.
135 |
136 | **`Example`**
137 |
138 | ```typescript
139 | const matches = matcher.getAllMatches(input);
140 | const matchesWithPhraseMetadata = matches.map((match) => dataset.getPayloadWithPhraseMetadata(match));
141 | // Now we can access the 'phraseMetadata' property:
142 | const phraseMetadata = matchesWithPhraseMetadata[0].phraseMetadata;
143 | ```
144 |
145 | #### Parameters
146 |
147 | | Name | Type | Description |
148 | | :------ | :------ | :------ |
149 | | `payload` | [`MatchPayload`](../interfaces/MatchPayload.md) | Original match payload. |
150 |
151 | #### Returns
152 |
153 | [`MatchPayloadWithPhraseMetadata`](../README.md#matchpayloadwithphrasemetadata)<`MetadataType`\>
154 |
155 | #### Defined in
156 |
157 | [src/dataset/DataSet.ts:94](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L94)
158 |
159 | ___
160 |
161 | ### removePhrasesIf
162 |
163 | ▸ **removePhrasesIf**(`predicate`): [`DataSet`](DataSet.md)<`MetadataType`\>
164 |
165 | Removes phrases that match the predicate given.
166 |
167 | **`Example`**
168 |
169 | ```typescript
170 | const customDataset = new DataSet<{ originalWord: string }>()
171 | .addAll(englishDataset)
172 | .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck');
173 | ```
174 |
175 | #### Parameters
176 |
177 | | Name | Type | Description |
178 | | :------ | :------ | :------ |
179 | | `predicate` | (`phrase`: [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\>) => `boolean` | A predicate that determines whether or not a phrase should be removed. Return `true` to remove, `false` to keep. |
180 |
181 | #### Returns
182 |
183 | [`DataSet`](DataSet.md)<`MetadataType`\>
184 |
185 | #### Defined in
186 |
187 | [src/dataset/DataSet.ts:46](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L46)
188 |
--------------------------------------------------------------------------------
/docs/reference/classes/ParserError.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / ParserError
2 |
3 | # Class: ParserError
4 |
5 | Custom error thrown by the parser when syntactical errors are detected.
6 |
7 | ## Hierarchy
8 |
9 | - `Error`
10 |
11 | ↳ **`ParserError`**
12 |
13 | ## Table of contents
14 |
15 | ### Constructors
16 |
17 | - [constructor](ParserError.md#constructor)
18 |
19 | ### Properties
20 |
21 | - [column](ParserError.md#column)
22 | - [line](ParserError.md#line)
23 | - [message](ParserError.md#message)
24 | - [name](ParserError.md#name)
25 | - [stack](ParserError.md#stack)
26 |
27 | ## Constructors
28 |
29 | ### constructor
30 |
31 | • **new ParserError**(`message`, `line`, `column`)
32 |
33 | #### Parameters
34 |
35 | | Name | Type |
36 | | :------ | :------ |
37 | | `message` | `string` |
38 | | `line` | `number` |
39 | | `column` | `number` |
40 |
41 | #### Overrides
42 |
43 | Error.constructor
44 |
45 | #### Defined in
46 |
47 | [src/pattern/ParserError.ts:18](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L18)
48 |
49 | ## Properties
50 |
51 | ### column
52 |
53 | • `Readonly` **column**: `number`
54 |
55 | The column on which the error occurred (one-based).
56 | Note that surrogate pairs are counted as 1 column wide, not 2.
57 |
58 | #### Defined in
59 |
60 | [src/pattern/ParserError.ts:16](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L16)
61 |
62 | ___
63 |
64 | ### line
65 |
66 | • `Readonly` **line**: `number`
67 |
68 | The line on which the error occurred (one-based).
69 |
70 | #### Defined in
71 |
72 | [src/pattern/ParserError.ts:10](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L10)
73 |
74 | ___
75 |
76 | ### message
77 |
78 | • **message**: `string`
79 |
80 | #### Inherited from
81 |
82 | Error.message
83 |
84 | #### Defined in
85 |
86 | node_modules/.pnpm/typescript@5.2.2/node_modules/typescript/lib/lib.es5.d.ts:1068
87 |
88 | ___
89 |
90 | ### name
91 |
92 | • `Readonly` **name**: ``"ParserError"``
93 |
94 | #### Overrides
95 |
96 | Error.name
97 |
98 | #### Defined in
99 |
100 | [src/pattern/ParserError.ts:5](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L5)
101 |
102 | ___
103 |
104 | ### stack
105 |
106 | • `Optional` **stack**: `string`
107 |
108 | #### Inherited from
109 |
110 | Error.stack
111 |
112 | #### Defined in
113 |
114 | node_modules/.pnpm/typescript@5.2.2/node_modules/typescript/lib/lib.es5.d.ts:1069
115 |
--------------------------------------------------------------------------------
/docs/reference/classes/PhraseBuilder.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / PhraseBuilder
2 |
3 | # Class: PhraseBuilder
4 |
5 | Builder for phrases.
6 |
7 | ## Type parameters
8 |
9 | | Name |
10 | | :------ |
11 | | `MetadataType` |
12 |
13 | ## Table of contents
14 |
15 | ### Constructors
16 |
17 | - [constructor](PhraseBuilder.md#constructor)
18 |
19 | ### Methods
20 |
21 | - [addPattern](PhraseBuilder.md#addpattern)
22 | - [addWhitelistedTerm](PhraseBuilder.md#addwhitelistedterm)
23 | - [build](PhraseBuilder.md#build)
24 | - [setMetadata](PhraseBuilder.md#setmetadata)
25 |
26 | ## Constructors
27 |
28 | ### constructor
29 |
30 | • **new PhraseBuilder**<`MetadataType`\>()
31 |
32 | #### Type parameters
33 |
34 | | Name |
35 | | :------ |
36 | | `MetadataType` |
37 |
38 | ## Methods
39 |
40 | ### addPattern
41 |
42 | ▸ **addPattern**(`pattern`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
43 |
44 | Associates a pattern with this phrase.
45 |
46 | #### Parameters
47 |
48 | | Name | Type | Description |
49 | | :------ | :------ | :------ |
50 | | `pattern` | [`ParsedPattern`](../interfaces/ParsedPattern.md) | Pattern to add. |
51 |
52 | #### Returns
53 |
54 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
55 |
56 | #### Defined in
57 |
58 | [src/dataset/DataSet.ts:149](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L149)
59 |
60 | ___
61 |
62 | ### addWhitelistedTerm
63 |
64 | ▸ **addWhitelistedTerm**(`term`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
65 |
66 | Associates a whitelisted term with this phrase.
67 |
68 | #### Parameters
69 |
70 | | Name | Type | Description |
71 | | :------ | :------ | :------ |
72 | | `term` | `string` | Whitelisted term to add. |
73 |
74 | #### Returns
75 |
76 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
77 |
78 | #### Defined in
79 |
80 | [src/dataset/DataSet.ts:159](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L159)
81 |
82 | ___
83 |
84 | ### build
85 |
86 | ▸ **build**(): [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\>
87 |
88 | Builds the phrase, returning a [[PhraseContainer]] for use with the
89 | [[DataSet]].
90 |
91 | #### Returns
92 |
93 | [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\>
94 |
95 | #### Defined in
96 |
97 | [src/dataset/DataSet.ts:178](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L178)
98 |
99 | ___
100 |
101 | ### setMetadata
102 |
103 | ▸ **setMetadata**(`metadata?`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
104 |
105 | Associates some metadata with this phrase.
106 |
107 | #### Parameters
108 |
109 | | Name | Type | Description |
110 | | :------ | :------ | :------ |
111 | | `metadata?` | `MetadataType` | Metadata to use. |
112 |
113 | #### Returns
114 |
115 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>
116 |
117 | #### Defined in
118 |
119 | [src/dataset/DataSet.ts:169](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L169)
120 |
--------------------------------------------------------------------------------
/docs/reference/classes/RegExpMatcher.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / RegExpMatcher
2 |
3 | # Class: RegExpMatcher
4 |
5 | An implementation of the [[Matcher]] interface using regular expressions and
6 | string searching methods.
7 |
8 | ## Implements
9 |
10 | - [`Matcher`](../interfaces/Matcher.md)
11 |
12 | ## Table of contents
13 |
14 | ### Constructors
15 |
16 | - [constructor](RegExpMatcher.md#constructor)
17 |
18 | ### Methods
19 |
20 | - [getAllMatches](RegExpMatcher.md#getallmatches)
21 | - [hasMatch](RegExpMatcher.md#hasmatch)
22 |
23 | ## Constructors
24 |
25 | ### constructor
26 |
27 | • **new RegExpMatcher**(`options`)
28 |
29 | Creates a new [[RegExpMatcher]] with the options given.
30 |
31 | **`Example`**
32 |
33 | ```typescript
34 | // Use the options provided by the English preset.
35 | const matcher = new RegExpMatcher({
36 | ...englishDataset.build(),
37 | ...englishRecommendedTransformers,
38 | });
39 | ```
40 |
41 | **`Example`**
42 |
43 | ```typescript
44 | // Simple matcher that only has blacklisted patterns.
45 | const matcher = new RegExpMatcher({
46 | blacklistedTerms: assignIncrementingIds([
47 | pattern`fuck`,
48 | pattern`f?uck`, // wildcards (?)
49 | pattern`bitch`,
50 | pattern`b[i]tch` // optionals ([i] matches either "i" or "")
51 | ]),
52 | });
53 |
54 | // Check whether some string matches any of the patterns.
55 | const doesMatch = matcher.hasMatch('fuck you bitch');
56 | ```
57 |
58 | **`Example`**
59 |
60 | ```typescript
61 | // A more advanced example, with transformers and whitelisted terms.
62 | const matcher = new RegExpMatcher({
63 | blacklistedTerms: [
64 | { id: 1, pattern: pattern`penis` },
65 | { id: 2, pattern: pattern`fuck` },
66 | ],
67 | whitelistedTerms: ['pen is'],
68 | blacklistMatcherTransformers: [
69 | resolveConfusablesTransformer(), // '🅰' => 'a'
70 | resolveLeetSpeakTransformer(), // '$' => 's'
71 | foldAsciiCharCaseTransformer(), // case insensitive matching
72 | skipNonAlphabeticTransformer(), // 'f.u...c.k' => 'fuck'
73 | collapseDuplicatesTransformer(), // 'aaaa' => 'a'
74 | ],
75 | });
76 |
77 | // Output all matches.
78 | console.log(matcher.getAllMatches('fu.....uuuuCK the pen is mightier than the sword!'));
79 | ```
80 |
81 | #### Parameters
82 |
83 | | Name | Type | Description |
84 | | :------ | :------ | :------ |
85 | | `options` | [`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md) | Options to use. |
86 |
87 | #### Defined in
88 |
89 | [src/matcher/regexp/RegExpMatcher.ts:74](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L74)
90 |
91 | ## Methods
92 |
93 | ### getAllMatches
94 |
95 | ▸ **getAllMatches**(`input`, `sorted?`): [`MatchPayload`](../interfaces/MatchPayload.md)[]
96 |
97 | Returns all matches of blacklisted terms in the text.
98 |
99 | If you only need to check for the presence of a match, and do not need
100 | more specific information about the matches, use the `hasMatch()` method,
101 | which is typically more efficient.
102 |
103 | #### Parameters
104 |
105 | | Name | Type | Default value | Description |
106 | | :------ | :------ | :------ | :------ |
107 | | `input` | `string` | `undefined` | Text to find profanities in. |
108 | | `sorted` | `boolean` | `false` | Whether the resulting list of matches should be sorted using [[compareMatchByPositionAndId]]. Defaults to `false`. |
109 |
110 | #### Returns
111 |
112 | [`MatchPayload`](../interfaces/MatchPayload.md)[]
113 |
114 | A list of matches of the matcher on the text. The matches are
115 | guaranteed to be sorted if and only if the `sorted` parameter is `true`,
116 | otherwise, their order is unspecified.
117 |
118 | #### Implementation of
119 |
120 | [Matcher](../interfaces/Matcher.md).[getAllMatches](../interfaces/Matcher.md#getallmatches)
121 |
122 | #### Defined in
123 |
124 | [src/matcher/regexp/RegExpMatcher.ts:87](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L87)
125 |
126 | ___
127 |
128 | ### hasMatch
129 |
130 | ▸ **hasMatch**(`input`): `boolean`
131 |
132 | Checks whether there is a match for any blacklisted term in the text.
133 |
134 | This is typically more efficient than calling `getAllMatches` and
135 | checking the result, though it depends on the implementation.
136 |
137 | #### Parameters
138 |
139 | | Name | Type | Description |
140 | | :------ | :------ | :------ |
141 | | `input` | `string` | Text to check. |
142 |
143 | #### Returns
144 |
145 | `boolean`
146 |
147 | #### Implementation of
148 |
149 | [Matcher](../interfaces/Matcher.md).[hasMatch](../interfaces/Matcher.md#hasmatch)
150 |
151 | #### Defined in
152 |
153 | [src/matcher/regexp/RegExpMatcher.ts:120](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L120)
154 |
--------------------------------------------------------------------------------
/docs/reference/classes/TextCensor.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / TextCensor
2 |
3 | # Class: TextCensor
4 |
5 | Censors regions of text matched by a [[Matcher]], supporting flexible
6 | [[TextCensorStrategy | censoring strategies]].
7 |
8 | ## Table of contents
9 |
10 | ### Constructors
11 |
12 | - [constructor](TextCensor.md#constructor)
13 |
14 | ### Methods
15 |
16 | - [applyTo](TextCensor.md#applyto)
17 | - [setStrategy](TextCensor.md#setstrategy)
18 |
19 | ## Constructors
20 |
21 | ### constructor
22 |
23 | • **new TextCensor**()
24 |
25 | ## Methods
26 |
27 | ### applyTo
28 |
29 | ▸ **applyTo**(`input`, `matches`): `string`
30 |
31 | Applies the censoring strategy to the text, returning the censored text.
32 |
33 | **Overlapping regions**
34 |
35 | Overlapping regions are an annoying edge case to deal with when censoring
36 | text. There is no single best way to handle them, but the implementation
37 | of this method guarantees that overlapping regions will always be
38 | replaced, following the rules below:
39 |
40 | - Replacement text for matched regions will be generated in the order
41 | specified by [[compareMatchByPositionAndId]];
42 | - When generating replacements for regions that overlap at the start with
43 | some other region, the start index of the censor context passed to the
44 | censoring strategy will be the end index of the first region, plus one.
45 |
46 | #### Parameters
47 |
48 | | Name | Type | Description |
49 | | :------ | :------ | :------ |
50 | | `input` | `string` | Input text. |
51 | | `matches` | [`MatchPayload`](../interfaces/MatchPayload.md)[] | A list of matches. |
52 |
53 | #### Returns
54 |
55 | `string`
56 |
57 | The censored text.
58 |
59 | #### Defined in
60 |
61 | [src/censor/TextCensor.ts:66](https://github.com/jo3-l/obscenity/blob/0299b49/src/censor/TextCensor.ts#L66)
62 |
63 | ___
64 |
65 | ### setStrategy
66 |
67 | ▸ **setStrategy**(`strategy`): [`TextCensor`](TextCensor.md)
68 |
69 | Sets the censoring strategy, which is responsible for generating
70 | replacement text for regions of the text that should be censored.
71 |
72 | The default censoring strategy is the [[grawlixCensorStrategy]],
73 | generating text like `$%@*`. There are several other built-in strategies
74 | available:
75 | - [[keepStartCensorStrategy]] - extends another strategy and keeps the
76 | first character matched, e.g. `f***`.
77 | - [[keepEndCensorStrategy]] - extends another strategy and keeps the last
78 | character matched, e.g. `***k`.
79 | - [[asteriskCensorStrategy]] - replaces the text with asterisks, e.g.
80 | `****`.
81 | - [[grawlixCensorStrategy]] - the default strategy, discussed earlier.
82 |
83 | Note that since censoring strategies are just functions (see the
84 | documentation for [[TextCensorStrategy]]), it is relatively simple to
85 | create your own.
86 |
87 | To ease creation of common censoring strategies, we provide a number of
88 | utility functions:
89 | - [[fixedPhraseCensorStrategy]] - generates a fixed phrase, e.g. `fudge`.
90 | - [[fixedCharCensorStrategy]] - generates replacement strings constructed
91 | from the character given, repeated as many times as needed.
92 | - [[randomCharFromSetCensorStrategy]] - generates replacement strings
93 | made up of random characters from the set of characters provided.
94 |
95 | #### Parameters
96 |
97 | | Name | Type | Description |
98 | | :------ | :------ | :------ |
99 | | `strategy` | [`TextCensorStrategy`](../README.md#textcensorstrategy) | Text censoring strategy to use. |
100 |
101 | #### Returns
102 |
103 | [`TextCensor`](TextCensor.md)
104 |
105 | #### Defined in
106 |
107 | [src/censor/TextCensor.ts:41](https://github.com/jo3-l/obscenity/blob/0299b49/src/censor/TextCensor.ts#L41)
108 |
--------------------------------------------------------------------------------
/docs/reference/enums/SyntaxKind.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / SyntaxKind
2 |
3 | # Enumeration: SyntaxKind
4 |
5 | An enumeration of the kinds of nodes there are.
6 |
7 | ## Table of contents
8 |
9 | ### Enumeration Members
10 |
11 | - [BoundaryAssertion](SyntaxKind.md#boundaryassertion)
12 | - [Literal](SyntaxKind.md#literal)
13 | - [Optional](SyntaxKind.md#optional)
14 | - [Wildcard](SyntaxKind.md#wildcard)
15 |
16 | ## Enumeration Members
17 |
18 | ### BoundaryAssertion
19 |
20 | • **BoundaryAssertion** = ``3``
21 |
22 | #### Defined in
23 |
24 | [src/pattern/Nodes.ts:33](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L33)
25 |
26 | ___
27 |
28 | ### Literal
29 |
30 | • **Literal** = ``2``
31 |
32 | #### Defined in
33 |
34 | [src/pattern/Nodes.ts:32](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L32)
35 |
36 | ___
37 |
38 | ### Optional
39 |
40 | • **Optional** = ``0``
41 |
42 | #### Defined in
43 |
44 | [src/pattern/Nodes.ts:30](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L30)
45 |
46 | ___
47 |
48 | ### Wildcard
49 |
50 | • **Wildcard** = ``1``
51 |
52 | #### Defined in
53 |
54 | [src/pattern/Nodes.ts:31](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L31)
55 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/BlacklistedTerm.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / BlacklistedTerm
2 |
3 | # Interface: BlacklistedTerm
4 |
5 | Represents a blacklisted term.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [id](BlacklistedTerm.md#id)
12 | - [pattern](BlacklistedTerm.md#pattern)
13 |
14 | ## Properties
15 |
16 | ### id
17 |
18 | • **id**: `number`
19 |
20 | The identifier of the pattern; should be unique across all patterns.
21 |
22 | #### Defined in
23 |
24 | [src/matcher/BlacklistedTerm.ts:10](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/BlacklistedTerm.ts#L10)
25 |
26 | ___
27 |
28 | ### pattern
29 |
30 | • **pattern**: [`ParsedPattern`](ParsedPattern.md)
31 |
32 | The parsed pattern.
33 |
34 | #### Defined in
35 |
36 | [src/matcher/BlacklistedTerm.ts:15](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/BlacklistedTerm.ts#L15)
37 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/BoundaryAssertionNode.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / BoundaryAssertionNode
2 |
3 | # Interface: BoundaryAssertionNode
4 |
5 | A boundary assertion node.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [kind](BoundaryAssertionNode.md#kind)
12 |
13 | ## Properties
14 |
15 | ### kind
16 |
17 | • **kind**: [`BoundaryAssertion`](../enums/SyntaxKind.md#boundaryassertion)
18 |
19 | #### Defined in
20 |
21 | [src/pattern/Nodes.ts:72](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L72)
22 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/CollapseDuplicatesTransformerOptions.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / CollapseDuplicatesTransformerOptions
2 |
3 | # Interface: CollapseDuplicatesTransformerOptions
4 |
5 | Options for the collapse duplicates transformer.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [customThresholds](CollapseDuplicatesTransformerOptions.md#customthresholds)
12 | - [defaultThreshold](CollapseDuplicatesTransformerOptions.md#defaultthreshold)
13 |
14 | ## Properties
15 |
16 | ### customThresholds
17 |
18 | • `Optional` **customThresholds**: `Map`<`string`, `number`\>
19 |
20 | Custom thresholds for characters. If a character has an entry
21 | corresponding to it, the value of the entry will be used as the maximum
22 | length of character runs comprised of said character before they are
23 | collapsed.
24 |
25 | The intended use-case for this option is for characters which appear
26 | more than once in a row in patterns. For example, the pattern `book` has
27 | two `o`s in a row, and matches the word `book`. With this transformer, though,
28 | the input `book` would become `bok`, meaning the pattern `book` would no longer match it.
29 | The fix would be to add an entry corresponding to `o` that overrides its
30 | threshold to be `2`, with the effect of leaving `book` unchanged.
31 |
32 | **`Default`**
33 |
34 | ```ts
35 | new Map()
36 | ```
37 |
38 | #### Defined in
39 |
40 | [src/transformer/collapse-duplicates/index.ts:91](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L91)
41 |
42 | ___
43 |
44 | ### defaultThreshold
45 |
46 | • `Optional` **defaultThreshold**: `number`
47 |
48 | The maximum number of characters in a run that will be accepted before
49 | they will be collapsed.
50 |
51 | For example, if this value was `2`, `aa` would stay the same but `aaa`
52 | would be transformed to `aa`.
53 |
54 | **`Default`**
55 |
56 | ```ts
57 | 1
58 | ```
59 |
60 | #### Defined in
61 |
62 | [src/transformer/collapse-duplicates/index.ts:102](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L102)
63 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/LiteralNode.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / LiteralNode
2 |
3 | # Interface: LiteralNode
4 |
5 | A literal node.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [chars](LiteralNode.md#chars)
12 | - [kind](LiteralNode.md#kind)
13 |
14 | ## Properties
15 |
16 | ### chars
17 |
18 | • **chars**: `number`[]
19 |
20 | The code points that this literal matches.
21 |
22 | #### Defined in
23 |
24 | [src/pattern/Nodes.ts:63](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L63)
25 |
26 | ___
27 |
28 | ### kind
29 |
30 | • **kind**: [`Literal`](../enums/SyntaxKind.md#literal)
31 |
32 | #### Defined in
33 |
34 | [src/pattern/Nodes.ts:65](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L65)
35 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/MatchPayload.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / MatchPayload
2 |
3 | # Interface: MatchPayload
4 |
5 | Information emitted on a successful match.
6 |
7 | If you require more information about matches than what is provided here, see
8 | the [[DataSet]] class, which supports associating metadata with patterns.
9 |
10 | ## Table of contents
11 |
12 | ### Properties
13 |
14 | - [endIndex](MatchPayload.md#endindex)
15 | - [matchLength](MatchPayload.md#matchlength)
16 | - [startIndex](MatchPayload.md#startindex)
17 | - [termId](MatchPayload.md#termid)
18 |
19 | ## Properties
20 |
21 | ### endIndex
22 |
23 | • **endIndex**: `number`
24 |
25 | End index of the match, inclusive.
26 |
27 | If the last character of the pattern is a surrogate pair,
28 | then this points to the index of the low surrogate.
29 |
30 | #### Defined in
31 |
32 | [src/matcher/MatchPayload.ts:16](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L16)
33 |
34 | ___
35 |
36 | ### matchLength
37 |
38 | • **matchLength**: `number`
39 |
40 | Total number of code points that matched.
41 |
42 | #### Defined in
43 |
44 | [src/matcher/MatchPayload.ts:21](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L21)
45 |
46 | ___
47 |
48 | ### startIndex
49 |
50 | • **startIndex**: `number`
51 |
52 | Start index of the match, inclusive.
53 |
54 | #### Defined in
55 |
56 | [src/matcher/MatchPayload.ts:26](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L26)
57 |
58 | ___
59 |
60 | ### termId
61 |
62 | • **termId**: `number`
63 |
64 | ID of the blacklisted term that matched.
65 |
66 | #### Defined in
67 |
68 | [src/matcher/MatchPayload.ts:31](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L31)
69 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/Matcher.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / Matcher
2 |
3 | # Interface: Matcher
4 |
5 | Searches for blacklisted terms in text, ignoring parts matched by whitelisted
6 | terms.
7 |
8 | See:
9 | - [[RegExpMatcher]] for an implementation using regular expressions.
10 |
11 | ## Implemented by
12 |
13 | - [`RegExpMatcher`](../classes/RegExpMatcher.md)
14 |
15 | ## Table of contents
16 |
17 | ### Methods
18 |
19 | - [getAllMatches](Matcher.md#getallmatches)
20 | - [hasMatch](Matcher.md#hasmatch)
21 |
22 | ## Methods
23 |
24 | ### getAllMatches
25 |
26 | ▸ **getAllMatches**(`input`, `sorted?`): [`MatchPayload`](MatchPayload.md)[]
27 |
28 | Returns all matches of blacklisted terms in the text.
29 |
30 | If you only need to check for the presence of a match, and do not need
31 | more specific information about the matches, use the `hasMatch()` method,
32 | which is typically more efficient.
33 |
34 | #### Parameters
35 |
36 | | Name | Type | Description |
37 | | :------ | :------ | :------ |
38 | | `input` | `string` | Text to find profanities in. |
39 | | `sorted?` | `boolean` | Whether the resulting list of matches should be sorted using [[compareMatchByPositionAndId]]. Defaults to `false`. |
40 |
41 | #### Returns
42 |
43 | [`MatchPayload`](MatchPayload.md)[]
44 |
45 | A list of matches of the matcher on the text. The matches are
46 | guaranteed to be sorted if and only if the `sorted` parameter is `true`,
47 | otherwise, their order is unspecified.
48 |
49 | #### Defined in
50 |
51 | [src/matcher/Matcher.ts:25](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/Matcher.ts#L25)
52 |
53 | ___
54 |
55 | ### hasMatch
56 |
57 | ▸ **hasMatch**(`input`): `boolean`
58 |
59 | Checks whether there is a match for any blacklisted term in the text.
60 |
61 | This is typically more efficient than calling `getAllMatches` and
62 | checking the result, though it depends on the implementation.
63 |
64 | #### Parameters
65 |
66 | | Name | Type | Description |
67 | | :------ | :------ | :------ |
68 | | `input` | `string` | Text to check. |
69 |
70 | #### Returns
71 |
72 | `boolean`
73 |
74 | #### Defined in
75 |
76 | [src/matcher/Matcher.ts:35](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/Matcher.ts#L35)
77 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/OptionalNode.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / OptionalNode
2 |
3 | # Interface: OptionalNode
4 |
5 | An optional node.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [childNode](OptionalNode.md#childnode)
12 | - [kind](OptionalNode.md#kind)
13 |
14 | ## Properties
15 |
16 | ### childNode
17 |
18 | • **childNode**: [`LiteralNode`](LiteralNode.md) \| [`WildcardNode`](WildcardNode.md)
19 |
20 | The node contained within the optional expression. For `[abc]`, this
21 | would be a literal node with the value `abc`.
22 |
23 | #### Defined in
24 |
25 | [src/pattern/Nodes.ts:44](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L44)
26 |
27 | ___
28 |
29 | ### kind
30 |
31 | • **kind**: [`Optional`](../enums/SyntaxKind.md#optional)
32 |
33 | #### Defined in
34 |
35 | [src/pattern/Nodes.ts:46](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L46)
36 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/ParsedPattern.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / ParsedPattern
2 |
3 | # Interface: ParsedPattern
4 |
5 | A parsed pattern.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [nodes](ParsedPattern.md#nodes)
12 | - [requireWordBoundaryAtEnd](ParsedPattern.md#requirewordboundaryatend)
13 | - [requireWordBoundaryAtStart](ParsedPattern.md#requirewordboundaryatstart)
14 |
15 | ## Properties
16 |
17 | ### nodes
18 |
19 | • **nodes**: [`Node`](../README.md#node)[]
20 |
21 | A list of nodes which make up the pattern.
22 |
23 | #### Defined in
24 |
25 | [src/pattern/Nodes.ts:8](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L8)
26 |
27 | ___
28 |
29 | ### requireWordBoundaryAtEnd
30 |
31 | • **requireWordBoundaryAtEnd**: `boolean`
32 |
33 | Whether the pattern requires a word boundary at the end.
34 |
35 | #### Defined in
36 |
37 | [src/pattern/Nodes.ts:13](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L13)
38 |
39 | ___
40 |
41 | ### requireWordBoundaryAtStart
42 |
43 | • **requireWordBoundaryAtStart**: `boolean`
44 |
45 | Whether the pattern requires a word boundary at the start.
46 |
47 | #### Defined in
48 |
49 | [src/pattern/Nodes.ts:18](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L18)
50 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/PhraseContainer.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / PhraseContainer
2 |
3 | # Interface: PhraseContainer
4 |
5 | Represents a phrase.
6 |
7 | ## Type parameters
8 |
9 | | Name |
10 | | :------ |
11 | | `MetadataType` |
12 |
13 | ## Table of contents
14 |
15 | ### Properties
16 |
17 | - [metadata](PhraseContainer.md#metadata)
18 | - [patterns](PhraseContainer.md#patterns)
19 | - [whitelistedTerms](PhraseContainer.md#whitelistedterms)
20 |
21 | ## Properties
22 |
23 | ### metadata
24 |
25 | • `Optional` **metadata**: `MetadataType`
26 |
27 | Metadata associated with this phrase.
28 |
29 | #### Defined in
30 |
31 | [src/dataset/DataSet.ts:204](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L204)
32 |
33 | ___
34 |
35 | ### patterns
36 |
37 | • **patterns**: [`ParsedPattern`](ParsedPattern.md)[]
38 |
39 | Patterns associated with this phrase.
40 |
41 | #### Defined in
42 |
43 | [src/dataset/DataSet.ts:209](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L209)
44 |
45 | ___
46 |
47 | ### whitelistedTerms
48 |
49 | • **whitelistedTerms**: `string`[]
50 |
51 | Whitelisted terms associated with this phrase.
52 |
53 | #### Defined in
54 |
55 | [src/dataset/DataSet.ts:214](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L214)
56 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/ProcessedCollapseDuplicatesTransformerOptions.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / ProcessedCollapseDuplicatesTransformerOptions
2 |
3 | # Interface: ProcessedCollapseDuplicatesTransformerOptions
4 |
5 | ## Table of contents
6 |
7 | ### Properties
8 |
9 | - [customThresholds](ProcessedCollapseDuplicatesTransformerOptions.md#customthresholds)
10 | - [defaultThreshold](ProcessedCollapseDuplicatesTransformerOptions.md#defaultthreshold)
11 |
12 | ## Properties
13 |
14 | ### customThresholds
15 |
16 | • **customThresholds**: `Map`<`number`, `number`\>
17 |
18 | #### Defined in
19 |
20 | [src/transformer/collapse-duplicates/index.ts:68](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L68)
21 |
22 | ___
23 |
24 | ### defaultThreshold
25 |
26 | • **defaultThreshold**: `number`
27 |
28 | #### Defined in
29 |
30 | [src/transformer/collapse-duplicates/index.ts:69](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L69)
31 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/RegExpMatcherOptions.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / RegExpMatcherOptions
2 |
3 | # Interface: RegExpMatcherOptions
4 |
5 | Options for the [[RegExpMatcher]].
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [blacklistMatcherTransformers](RegExpMatcherOptions.md#blacklistmatchertransformers)
12 | - [blacklistedTerms](RegExpMatcherOptions.md#blacklistedterms)
13 | - [whitelistMatcherTransformers](RegExpMatcherOptions.md#whitelistmatchertransformers)
14 | - [whitelistedTerms](RegExpMatcherOptions.md#whitelistedterms)
15 |
16 | ## Properties
17 |
18 | ### blacklistMatcherTransformers
19 |
20 | • `Optional` **blacklistMatcherTransformers**: `TransformerContainer`[]
21 |
22 | A set of transformers that should be applied to the input text before
23 | blacklisted patterns are matched. This does not affect the matching of
24 | whitelisted terms.
25 |
26 | Transformers will be applied in the order they appear.
27 |
28 | **`Default`**
29 |
30 | ```ts
31 | []
32 | ```
33 |
34 | #### Defined in
35 |
36 | [src/matcher/regexp/RegExpMatcher.ts:229](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L229)
37 |
38 | ___
39 |
40 | ### blacklistedTerms
41 |
42 | • **blacklistedTerms**: [`BlacklistedTerm`](BlacklistedTerm.md)[]
43 |
44 | A list of blacklisted terms.
45 |
46 | #### Defined in
47 |
48 | [src/matcher/regexp/RegExpMatcher.ts:234](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L234)
49 |
50 | ___
51 |
52 | ### whitelistMatcherTransformers
53 |
54 | • `Optional` **whitelistMatcherTransformers**: `TransformerContainer`[]
55 |
56 | A set of transformers that should be applied to the input text before
57 | whitelisted terms are matched. This does not affect the matching of
58 | blacklisted terms.
59 |
60 | Transformers will be applied in the order they appear.
61 |
62 | **`Default`**
63 |
64 | ```ts
65 | []
66 | ```
67 |
68 | #### Defined in
69 |
70 | [src/matcher/regexp/RegExpMatcher.ts:245](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L245)
71 |
72 | ___
73 |
74 | ### whitelistedTerms
75 |
76 | • `Optional` **whitelistedTerms**: `string`[]
77 |
78 | A list of whitelisted terms. If a whitelisted term matches some part of
79 | the text, a match of a blacklisted pattern within that part of the text
80 | will not be emitted.
81 |
82 | For example, if we had a pattern `penis` and a whitelisted term `pen is`,
83 | no matches would be reported for the input text `the pen is mightier
84 | than the sword.`
85 |
86 | **`Default`**
87 |
88 | ```ts
89 | []
90 | ```
91 |
92 | #### Defined in
93 |
94 | [src/matcher/regexp/RegExpMatcher.ts:258](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L258)
95 |
--------------------------------------------------------------------------------
/docs/reference/interfaces/WildcardNode.md:
--------------------------------------------------------------------------------
1 | [obscenity](../README.md) / WildcardNode
2 |
3 | # Interface: WildcardNode
4 |
5 | A wildcard node.
6 |
7 | ## Table of contents
8 |
9 | ### Properties
10 |
11 | - [kind](WildcardNode.md#kind)
12 |
13 | ## Properties
14 |
15 | ### kind
16 |
17 | • **kind**: [`Wildcard`](../enums/SyntaxKind.md#wildcard)
18 |
19 | #### Defined in
20 |
21 | [src/pattern/Nodes.ts:53](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L53)
22 |
--------------------------------------------------------------------------------
/examples/extending-datasets.js:
--------------------------------------------------------------------------------
// Add a new word to the English dataset and remove an existing one.

// Import what we need from Obscenity.
const { RegExpMatcher, DataSet, englishDataset, englishRecommendedTransformers, pattern } = require('../dist');

// Create a new dataset.
const myDataset = new DataSet()
	// Add all the data from the english dataset into our new one.
	.addAll(englishDataset)
	// Remove "fuck" and all its variants.
	.removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck')
	// Add "simp".
	.addPhrase((phrase) =>
		phrase
			.setMetadata({ originalWord: 'simp' })
			.addPattern(pattern`simp`)
			.addWhitelistedTerm('simple'),
	);

// Use our new dataset. Note the call to build(): the RegExpMatcher expects the
// { blacklistedTerms, whitelistedTerms } object that build() produces, not the
// DataSet instance itself.
const matcher = new RegExpMatcher({
	...myDataset.build(),
	...englishRecommendedTransformers,
});

console.log(matcher.hasMatch('simp'));
27 |
--------------------------------------------------------------------------------
/examples/repl.js:
--------------------------------------------------------------------------------
// A REPL where you can enter text and see whether Obscenity matches on it with
// its English preset.

// Node's built-in REPL module.
const repl = require('repl');
// Pull in the matcher and the English preset from Obscenity.
const { RegExpMatcher, englishDataset, englishRecommendedTransformers } = require('../dist');

// Build a matcher from the English preset.
const matcher = new RegExpMatcher({
	...englishDataset.build(),
	...englishRecommendedTransformers,
});

// Greet the user.
console.log(`Welcome to the REPL example for Obscenity.
Type ".help" for more information.`);

// Launch the REPL, evaluating each line of input with the matcher.
const server = repl.start({
	prompt: '> ',
	eval(input, _ctx, _file, cb) {
		// Passing 'true' as the second argument to getAllMatches() sorts the
		// matches, which makes the output easier to read.
		const payloads = matcher.getAllMatches(input, true);
		// Attach phrase metadata (e.g. the original word) to each match.
		const matches = payloads.map((match) => englishDataset.getPayloadWithPhraseMetadata(match));
		// Hand the result back to the REPL server for display.
		cb(undefined, matches);
	},
});

// Replace the default help command with one tailored to this example.
server.defineCommand('help', {
	help: 'View a help message',
	action() {
		console.log(`To try out Obscenity with the English preset, simply type a phrase.
Obscene words found in the input will be displayed when you click enter.

Press ^D to exit the REPL`);
		this.displayPrompt();
	},
});

// Disable special commands that make no sense in this example.
function rejectCommand() {
	console.log('Invalid REPL keyword');
	this.displayPrompt();
}

server.defineCommand('save', { action: rejectCommand });
server.defineCommand('load', { action: rejectCommand });
55 |
--------------------------------------------------------------------------------
/jest.config.ts:
--------------------------------------------------------------------------------
1 | import type { Config } from '@jest/types';
2 |
3 | const config: Config.InitialOptions = {
4 | preset: 'ts-jest',
5 | testEnvironment: 'node',
6 | testRunner: 'jest-circus/runner',
7 | testMatch: ['/test/**/*.test.ts'],
8 | transform: {
9 | // eslint-disable-next-line @typescript-eslint/naming-convention
10 | '^.+\\.ts$': [
11 | 'ts-jest',
12 | {
13 | tsconfig: '/test/tsconfig.json',
14 | },
15 | ],
16 | },
17 | collectCoverage: true,
18 | collectCoverageFrom: ['/src/**/*.ts'],
19 | coverageDirectory: 'coverage',
20 | coverageReporters: ['text', 'lcov', 'clover'],
21 | coveragePathIgnorePatterns: [
22 | '/src/index\\.ts', // library entry point
23 | '/src/preset/.*\\.ts', // presets
24 | ],
25 | setupFilesAfterEnv: ['/test/jest.setup.ts'],
26 | };
27 |
28 | export default config;
29 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "obscenity",
3 | "version": "0.4.3",
4 | "description": "Robust, extensible profanity filter.",
5 | "files": [
6 | "/dist",
7 | "!/dist/*.tsbuildinfo"
8 | ],
9 | "main": "./dist/index.js",
10 | "module": "./dist/index.mjs",
11 | "types": "./dist/index.d.ts",
12 | "exports": {
13 | "import": {
14 | "types": "./dist/index.d.ts",
15 | "default": "./dist/index.mjs"
16 | },
17 | "require": {
18 | "types": "./dist/index.d.ts",
19 | "default": "./dist/index.js"
20 | }
21 | },
22 | "scripts": {
23 | "build": "rimraf dist && tsc -b src && gen-esm-wrapper dist/index.js dist/index.mjs",
24 | "build:benchmarks": "tsc -b benchmarks",
25 |     "build:docs": "rimraf docs/reference && typedoc --plugin typedoc-plugin-markdown",
26 | "test": "jest",
27 | "test:watch": "jest --watch",
28 | "test:ci": "jest --ci --no-stack-trace --verbose",
29 | "style": "prettier --write src/**/*.ts test/**/*.ts",
30 | "lint": "eslint src test",
31 | "lint:fix": "eslint src test --fix",
32 | "release": "git checkout main && git pull origin main && pnpm i && pnpm lint && pnpm test && pnpm build && pnpm build:docs && git add -A && standard-version -a",
33 | "release:tags": "git push --follow-tags origin main",
34 | "release:github": "conventional-github-releaser -p angular",
35 | "release:publish": "pnpm publish --access public"
36 | },
37 | "repository": {
38 | "type": "git",
39 | "url": "https://github.com/jo3-l/obscenity.git"
40 | },
41 | "bugs": {
42 | "url": "https://github.com/jo3-l/obscenity/issues"
43 | },
44 | "keywords": [
45 | "profanity",
46 | "profane",
47 | "obscenities",
48 | "obscenity",
49 | "obscene",
50 | "filter",
51 | "curse",
52 | "swear",
53 | "swearing",
54 | "vulgar",
55 | "vulgarity",
56 | "bad-words",
57 | "badwords",
58 | "cuss",
59 | "cussing"
60 | ],
61 | "homepage": "https://github.com/jo3-l/obscenity#readme",
62 | "author": "Joe L. ",
63 | "license": "MIT",
64 | "devDependencies": {
65 | "@commitlint/cli": "^18.0.0",
66 | "@commitlint/config-angular": "^18.0.0",
67 | "@jest/types": "^29.5.0",
68 | "@types/jest": "^29.5.2",
69 | "@typescript-eslint/eslint-plugin": "^8.0.0",
70 | "@typescript-eslint/parser": "^8.0.0",
71 | "conventional-github-releaser": "^3.1.5",
72 | "eslint": "^8.57.0",
73 | "eslint-config-prettier": "^10.0.0",
74 | "eslint-plugin-jest": "^27.9.0",
75 | "eslint-plugin-prettier": "^4.2.1",
76 | "fast-check": "^2.25.0",
77 | "gen-esm-wrapper": "^1.1.3",
78 | "is-ci": "^4.0.0",
79 | "jest": "^29.7.0",
80 | "jest-circus": "^29.5.0",
81 | "prettier": "^2.8.8",
82 | "rimraf": "^6.0.0",
83 | "standard-version": "^9.5.0",
84 | "ts-jest": "^29.1.1",
85 | "ts-node": "^10.9.1",
86 | "typedoc": "^0.25.0",
87 | "typedoc-plugin-markdown": "^3.15.3",
88 | "typescript": "^5.2.2"
89 | },
90 | "engines": {
91 | "node": ">=14.0.0"
92 | },
93 | "packageManager": "pnpm@9.15.4"
94 | }
95 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": [
3 | "config:base",
4 | ":semanticCommits",
5 | ":semanticPrefixChore",
6 | ":preserveSemverRanges",
7 | ":rebaseStalePrs",
8 | ":label(deps)"
9 | ]
10 | }
11 |
--------------------------------------------------------------------------------
/scripts/search-words.js:
--------------------------------------------------------------------------------
const repl = require('repl');
const fs = require('fs');
const { join } = require('path');

// One entry per line; entries are trimmed lazily as they are scanned.
const words = fs.readFileSync(join(__dirname, 'english-words.txt'), { encoding: 'utf8' }).split('\n');

repl.start({
	prompt: '> ',
	eval: (query, _ctx, _filename, done) => {
		let needle = query.trim();

		// A leading '^' anchors the search to the start of the word; a
		// trailing '$' anchors it to the end. '^' takes priority if both
		// are present.
		const anchorStart = needle.startsWith('^');
		if (anchorStart) needle = needle.slice(1);

		const anchorEnd = needle.endsWith('$');
		if (anchorEnd) needle = needle.slice(0, -1);

		const matches = [];
		for (let i = 0; i < words.length; i++) {
			// Trim in place so repeat queries skip the work.
			const word = (words[i] = words[i].trim());
			const matched = anchorStart
				? word.startsWith(needle)
				: anchorEnd
				? word.endsWith(needle)
				: word.includes(needle);
			if (matched) matches.push(word);
		}

		done(undefined, matches);
	},
	writer: (results) => {
		if (results.length === 0) return 'No words found matching the query given.';
		return `${results.length} words found:\n\n${results.join('\n')}`;
	},
});
35 |
--------------------------------------------------------------------------------
/src/censor/BuiltinStrategies.ts:
--------------------------------------------------------------------------------
1 | import { getAndAssertSingleCodePoint } from '../util/Char';
2 | import type { CensorContext, TextCensorStrategy } from './TextCensor';
3 |
4 | /**
5 | * A text censoring strategy that extends another strategy, adding the first
6 | * character matched at the start of the generated string.
7 | *
8 | * @example
9 | * ```typescript
10 | * const strategy = keepStartCensorStrategy(grawlixCensorStrategy());
11 | * const censor = new TextCensor().setStrategy(strategy);
12 | * // Before: 'fuck you'
13 | * // After: 'f$@* you'
14 | * ```
15 | * @example
16 | * ```typescript
17 | * // Since keepEndCensorStrategy() returns another text censoring strategy, you can use it
18 | * // as the base strategy to pass to keepStartCensorStrategy().
19 | * const strategy = keepStartCensorStrategy(keepEndCensorStrategy(asteriskCensorStrategy()));
20 | * const censor = new TextCensor().setStrategy(strategy);
21 | * // Before: 'fuck you'
22 | * // After: 'f**k you'
23 | * ```
24 | * @param baseStrategy - Strategy to extend. It will be used to produce the end of
25 | * the generated string.
26 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
27 | */
28 | export function keepStartCensorStrategy(baseStrategy: TextCensorStrategy): TextCensorStrategy {
29 | return (ctx: CensorContext) => {
30 | if (ctx.overlapsAtStart) return baseStrategy(ctx);
31 | const firstChar = String.fromCodePoint(ctx.input.codePointAt(ctx.startIndex)!);
32 | return firstChar + baseStrategy({ ...ctx, matchLength: ctx.matchLength - 1 });
33 | };
34 | }
35 |
36 | /**
37 | * A text censoring strategy that extends another strategy, adding the last
38 | * character matched at the end of the generated string.
39 | *
40 | * @example
41 | * ```typescript
42 | * const strategy = keepEndCensorStrategy(asteriskCensorStrategy());
43 | * const censor = new TextCensor().setStrategy(strategy);
44 | * // Before: 'fuck you'
45 | * // After: '***k you'
46 | * ```
47 | * @param baseStrategy - Strategy to extend. It will be used to produce the start
48 | * of the generated string.
49 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
50 | */
51 | export function keepEndCensorStrategy(baseStrategy: TextCensorStrategy): TextCensorStrategy {
52 | return (ctx: CensorContext) => {
53 | if (ctx.overlapsAtEnd) return baseStrategy(ctx);
54 | const lastChar = String.fromCodePoint(ctx.input.codePointAt(ctx.endIndex)!);
55 | return baseStrategy({ ...ctx, matchLength: ctx.matchLength - 1 }) + lastChar;
56 | };
57 | }
58 |
/**
 * A text censoring strategy that generates strings made up of asterisks (`*`).
 *
 * @example
 * ```typescript
 * const strategy = asteriskCensorStrategy();
 * const censor = new TextCensor().setStrategy(strategy);
 * // Before: 'fuck you'
 * // After: '**** you'
 * ```
 * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
 */
export function asteriskCensorStrategy() {
	// Convenience wrapper around fixedCharCensorStrategy with '*'.
	return fixedCharCensorStrategy('*');
}
74 |
/**
 * A text censoring strategy that generates
 * [grawlix](https://www.merriam-webster.com/words-at-play/grawlix-symbols-swearing-comic-strips),
 * i.e. strings that contain the characters `%`, `@`, `$`, `&`, and `*`.
 *
 * @example
 * ```typescript
 * const strategy = grawlixCensorStrategy();
 * const censor = new TextCensor().setStrategy(strategy);
 * // Before: 'fuck you'
 * // After: '%@&* you'
 * ```
 * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
 */
export function grawlixCensorStrategy() {
	// Convenience wrapper: random characters drawn from the grawlix set.
	return randomCharFromSetCensorStrategy('%@$&*');
}
92 |
93 | /**
94 | * A text censoring strategy that returns a fixed string.
95 | *
96 | * @example
97 | * ```typescript
98 | * // The replacement phrase '' effectively removes all matched regions
99 | * // from the string.
100 | * const strategy = fixedPhraseCensorStrategy('');
101 | * const censor = new TextCensor().setStrategy(strategy);
102 | * // Before: 'fuck you'
103 | * // After: ' you'
104 | * ```
105 | * @example
106 | * ```typescript
107 | * const strategy = fixedPhraseCensorStrategy('fudge');
108 | * const censor = new TextCensor().setStrategy(strategy);
109 | * // Before: 'fuck you'
110 | * // After: 'fudge you'
111 | * ```
112 | * @param phrase - Replacement phrase to use.
113 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
114 | */
115 | export function fixedPhraseCensorStrategy(phrase: string): TextCensorStrategy {
116 | return () => phrase;
117 | }
118 |
119 | /**
120 | * A text censoring strategy that generates replacement strings that are made up
121 | * of the character given, repeated as many times as needed.
122 | *
123 | * @example
124 | * ```typescript
125 | * const strategy = fixedCharCensorStrategy('*');
126 | * const censor = new TextCensor().setStrategy(strategy);
127 | * // Before: 'fuck you'
128 | * // After: '**** you'.
129 | * ```
130 | * @param char - String that represents the code point which should be used when
131 | * generating the replacement string. Must be exactly one code point in length.
132 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
133 | */
134 | export function fixedCharCensorStrategy(char: string): TextCensorStrategy {
135 | // Make sure the input character is one code point in length.
136 | getAndAssertSingleCodePoint(char);
137 | return (ctx: CensorContext) => char.repeat(ctx.matchLength);
138 | }
139 |
140 | /**
141 | * A text censoring strategy that generates replacement strings made up of
142 | * random characters from the set of characters provided. The strings never
143 | * contain two of the same character in a row.
144 | *
145 | * @example
146 | * ```typescript
147 | * const strategy = randomCharFromSetCensorStrategy('$#!');
148 | * const censor = new TextCensor().setStrategy(strategy);
149 | * // Before: 'fuck you!'
150 | * // After: '!#$# you!'
151 | * ```
152 | * @param charset - Set of characters from which the replacement string should
153 | * be constructed. Must have at least two characters.
154 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]].
155 | */
156 | export function randomCharFromSetCensorStrategy(charset: string): TextCensorStrategy {
157 | const chars = [...charset];
158 | if (chars.length < 2) throw new Error('The character set passed must have at least 2 characters.');
159 | return (ctx: CensorContext) => {
160 | if (ctx.matchLength === 0) return '';
161 |
162 | let lastIdx = Math.floor(Math.random() * chars.length);
163 | let censored = chars[lastIdx];
164 | for (let i = 1; i < ctx.matchLength; i++) {
165 | let idx = Math.floor(Math.random() * (chars.length - 1));
166 | // Transform the distribution for idx from [0, len-1) to
167 | // [0, lastIdx) ∪ (lastIdx, len) to exclude lastIdx while
168 | // ensuring a uniform distribution of generated characters.
169 | if (idx >= lastIdx) idx++;
170 | lastIdx = idx;
171 | censored += chars[idx];
172 | }
173 | return censored;
174 | };
175 | }
176 |
--------------------------------------------------------------------------------
/src/censor/TextCensor.ts:
--------------------------------------------------------------------------------
1 | import type { MatchPayload } from '../matcher/MatchPayload';
2 | import { compareMatchByPositionAndId } from '../matcher/MatchPayload';
3 | import { grawlixCensorStrategy } from './BuiltinStrategies';
4 |
/**
 * Censors regions of text matched by a [[Matcher]], supporting flexible
 * [[TextCensorStrategy | censoring strategies]].
 */
export class TextCensor {
	// Strategy used to generate replacement text; grawlix by default.
	private strategy: TextCensorStrategy = grawlixCensorStrategy();

	/**
	 * Sets the censoring strategy, which is responsible for generating
	 * replacement text for regions of the text that should be censored.
	 *
	 * The default censoring strategy is the [[grawlixCensorStrategy]],
	 * generating text like `$%@*`. There are several other built-in strategies
	 * available:
	 * - [[keepStartCensorStrategy]] - extends another strategy and keeps the
	 *   first character matched, e.g. `f***`.
	 * - [[keepEndCensorStrategy]] - extends another strategy and keeps the last
	 *   character matched, e.g. `***k`.
	 * - [[asteriskCensorStrategy]] - replaces the text with asterisks, e.g.
	 *   `****`.
	 * - [[grawlixCensorStrategy]] - the default strategy, discussed earlier.
	 *
	 * Note that since censoring strategies are just functions (see the
	 * documentation for [[TextCensorStrategy]]), it is relatively simple to
	 * create your own.
	 *
	 * To ease creation of common censoring strategies, we provide a number of
	 * utility functions:
	 * - [[fixedPhraseCensorStrategy]] - generates a fixed phrase, e.g. `fudge`.
	 * - [[fixedCharCensorStrategy]] - generates replacement strings constructed
	 *   from the character given, repeated as many times as needed.
	 * - [[randomCharFromSetCensorStrategy]] - generates replacement strings
	 *   made up of random characters from the set of characters provided.
	 *
	 * @param strategy - Text censoring strategy to use.
	 */
	public setStrategy(strategy: TextCensorStrategy) {
		this.strategy = strategy;
		// Returns 'this' so calls can be chained off the constructor.
		return this;
	}

	/**
	 * Applies the censoring strategy to the text, returning the censored text.
	 *
	 * **Overlapping regions**
	 *
	 * Overlapping regions are an annoying edge case to deal with when censoring
	 * text. There is no single best way to handle them, but the implementation
	 * of this method guarantees that overlapping regions will always be
	 * replaced, following the rules below:
	 *
	 * - Replacement text for matched regions will be generated in the order
	 *   specified by [[compareMatchByPositionAndId]];
	 * - When generating replacements for regions that overlap at the start with
	 *   some other region, the start index of the censor context passed to the
	 *   censoring strategy will be the end index of the first region, plus one.
	 *
	 * @param input - Input text.
	 * @param matches - A list of matches.
	 * @returns The censored text.
	 */
	public applyTo(input: string, matches: MatchPayload[]) {
		if (matches.length === 0) return input; // nothing to censor
		// Sort a copy so the caller's array is left untouched.
		const sorted = [...matches].sort(compareMatchByPositionAndId);

		let censored = '';
		let lastIndex = 0; // end index of last match, plus one
		for (let i = 0; i < sorted.length; i++) {
			const match = sorted[i];
			if (lastIndex > match.endIndex) continue; // completely contained in the previous span

			const overlapsAtStart = match.startIndex < lastIndex;
			// Add the chunk of text between the end of the last match and the
			// start of the current match.
			if (!overlapsAtStart) censored += input.slice(lastIndex, match.startIndex);

			// If this region overlaps the previous one, begin the replacement
			// at the first position not already censored.
			const actualStartIndex = Math.max(lastIndex, match.startIndex);
			const overlapsAtEnd =
				i < sorted.length - 1 && // not the last match
				match.endIndex >= sorted[i + 1].startIndex && // end index of this match and start index of next one overlap
				match.endIndex < sorted[i + 1].endIndex; // doesn't completely contain next match
			censored += this.strategy({ ...match, startIndex: actualStartIndex, input, overlapsAtStart, overlapsAtEnd });
			// endIndex is inclusive, so the next uncensored position is one past it.
			lastIndex = match.endIndex + 1;
		}

		// Append whatever remains after the final censored region.
		censored += input.slice(lastIndex);
		return censored;
	}
}
94 |
/**
 * A text censoring strategy, which receives a [[CensorContext]] and returns a
 * replacement string. The replacement is substituted for the matched region
 * by [[TextCensor.applyTo]].
 */
export type TextCensorStrategy = (ctx: CensorContext) => string;

/**
 * Context passed to [[TextCensorStrategy | text censoring strategies]].
 * Extends the match payload with the input text and overlap information.
 */
export type CensorContext = MatchPayload & {
	/**
	 * The entire input text, without any censoring applied to it.
	 */
	input: string;

	/**
	 * Whether the current region overlaps at the end with some other region.
	 */
	overlapsAtEnd: boolean;

	/**
	 * Whether the current region overlaps at the start with some other region.
	 */
	overlapsAtStart: boolean;
};
120 |
--------------------------------------------------------------------------------
/src/dataset/DataSet.ts:
--------------------------------------------------------------------------------
1 | import { assignIncrementingIds } from '../matcher/BlacklistedTerm';
2 | import type { MatchPayload } from '../matcher/MatchPayload';
3 | import type { RegExpMatcherOptions } from '../matcher/regexp/RegExpMatcher';
4 | import type { ParsedPattern } from '../pattern/Nodes';
5 |
6 | /**
7 | * Holds phrases (groups of patterns and whitelisted terms), optionally
8 | * associating metadata with them.
9 | *
10 | * @typeParam MetadataType - Metadata type for phrases. Note that the metadata
11 | * type is implicitly nullable.
12 | */
13 | export class DataSet {
14 | private readonly containers: PhraseContainer[] = [];
15 |
16 | private patternCount = 0;
17 |
18 | private readonly patternIdToPhraseContainer = new Map(); // pattern ID => index of its container
19 |
20 | /**
21 | * Adds all the phrases from the dataset provided to this one.
22 | *
23 | * @example
24 | * ```typescript
25 | * const customDataset = new DataSet().addAll(englishDataset);
26 | * ```
27 | * @param other - Other dataset.
28 | */
29 | public addAll(other: DataSet) {
30 | for (const container of other.containers) this.registerContainer(container);
31 | return this;
32 | }
33 |
34 | /**
35 | * Removes phrases that match the predicate given.
36 | *
37 | * @example
38 | * ```typescript
39 | * const customDataset = new DataSet<{ originalWord: string }>()
40 | * .addAll(englishDataset)
41 | * .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck');
42 | * ```
43 | * @param predicate - A predicate that determines whether or not a phrase should be removed.
44 | * Return `true` to remove, `false` to keep.
45 | */
46 | public removePhrasesIf(predicate: (phrase: PhraseContainer) => boolean) {
47 | // Clear the internal state, then gradually rebuild it by adding the
48 | // containers that should be kept.
49 | this.patternCount = 0;
50 | this.patternIdToPhraseContainer.clear();
51 | const containers = this.containers.splice(0);
52 | for (const container of containers) {
53 | const remove = predicate(container);
54 | if (!remove) this.registerContainer(container);
55 | }
56 |
57 | return this;
58 | }
59 |
60 | /**
61 | * Adds a phrase to this dataset.
62 | *
63 | * @example
64 | * ```typescript
65 | * const data = new DataSet<{ originalWord: string }>()
66 | * .addPhrase((phrase) => phrase.setMetadata({ originalWord: 'fuck' })
67 | * .addPattern(pattern`fuck`)
68 | * .addPattern(pattern`f[?]ck`)
69 | * .addWhitelistedTerm('Afck'))
70 | * .build();
71 | * ```
72 | * @param fn - A function that takes a [[PhraseBuilder]], adds
73 | * patterns/whitelisted terms/metadata to it, and returns it.
74 | */
75 | public addPhrase(fn: (builder: PhraseBuilder) => PhraseBuilder) {
76 | const container = fn(new PhraseBuilder()).build();
77 | this.registerContainer(container);
78 | return this;
79 | }
80 |
81 | /**
82 | * Retrieves the phrase metadata associated with a pattern and returns a
83 | * copy of the match payload with said metadata attached to it.
84 | *
85 | * @example
86 | * ```typescript
87 | * const matches = matcher.getAllMatches(input);
88 | * const matchesWithPhraseMetadata = matches.map((match) => dataset.getPayloadWithPhraseMetadata(match));
89 | * // Now we can access the 'phraseMetadata' property:
90 | * const phraseMetadata = matchesWithPhraseMetadata[0].phraseMetadata;
91 | * ```
92 | * @param payload - Original match payload.
93 | */
94 | public getPayloadWithPhraseMetadata(payload: MatchPayload): MatchPayloadWithPhraseMetadata {
95 | const offset = this.patternIdToPhraseContainer.get(payload.termId);
96 | if (offset === undefined) {
97 | throw new Error(`The pattern with ID ${payload.termId} does not exist in this dataset.`);
98 | }
99 |
100 | return {
101 | ...payload,
102 | phraseMetadata: this.containers[offset].metadata,
103 | };
104 | }
105 |
106 | /**
107 | * Returns the dataset in a format suitable for usage with the [[RegExpMatcher]].
108 | *
109 | * @example
110 | * ```typescript
111 | * // With the RegExpMatcher:
112 | * const matcher = new RegExpMatcher({
113 | * ...dataset.build(),
114 | * // additional options here
115 | * });
116 | * ```
117 | */
118 | public build(): Pick {
119 | return {
120 | blacklistedTerms: assignIncrementingIds(this.containers.flatMap((p) => p.patterns)),
121 | whitelistedTerms: this.containers.flatMap((p) => p.whitelistedTerms),
122 | };
123 | }
124 |
125 | private registerContainer(container: PhraseContainer) {
126 | const offset = this.containers.push(container) - 1;
127 | for (let i = 0, phraseId = this.patternCount; i < container.patterns.length; i++, phraseId++) {
128 | this.patternIdToPhraseContainer.set(phraseId, offset);
129 | this.patternCount++;
130 | }
131 | }
132 | }
133 |
134 | /**
135 | * Builder for phrases.
136 | */
137 | export class PhraseBuilder {
138 | private readonly patterns: ParsedPattern[] = [];
139 |
140 | private readonly whitelistedTerms: string[] = [];
141 |
142 | private metadata?: MetadataType;
143 |
144 | /**
145 | * Associates a pattern with this phrase.
146 | *
147 | * @param pattern - Pattern to add.
148 | */
149 | public addPattern(pattern: ParsedPattern) {
150 | this.patterns.push(pattern);
151 | return this;
152 | }
153 |
154 | /**
155 | * Associates a whitelisted pattern with this phrase.
156 | *
157 | * @param term - Whitelisted term to add.
158 | */
159 | public addWhitelistedTerm(term: string) {
160 | this.whitelistedTerms.push(term);
161 | return this;
162 | }
163 |
164 | /**
165 | * Associates some metadata with this phrase.
166 | *
167 | * @param metadata - Metadata to use.
168 | */
169 | public setMetadata(metadata?: MetadataType) {
170 | this.metadata = metadata;
171 | return this;
172 | }
173 |
174 | /**
175 | * Builds the phrase, returning a [[PhraseContainer]] for use with the
176 | * [[DataSet]].
177 | */
178 | public build(): PhraseContainer {
179 | return {
180 | patterns: this.patterns,
181 | whitelistedTerms: this.whitelistedTerms,
182 | metadata: this.metadata,
183 | };
184 | }
185 | }
186 |
187 | /**
188 | * Extends the default match payload by adding phrase metadata.
189 | */
190 | export type MatchPayloadWithPhraseMetadata = MatchPayload & {
191 | /**
192 | * Phrase metadata associated with the pattern that matched.
193 | */
194 | phraseMetadata?: MetadataType;
195 | };
196 |
/**
 * Represents a phrase.
 *
 * Produced by [[PhraseBuilder.build]] and consumed by the [[DataSet]].
 */
export interface PhraseContainer {
	/**
	 * Metadata associated with this phrase, if any.
	 */
	metadata?: MetadataType;

	/**
	 * Patterns associated with this phrase.
	 */
	patterns: ParsedPattern[];

	/**
	 * Whitelisted terms associated with this phrase.
	 */
	whitelistedTerms: string[];
}
216 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
// Censoring utilities.
export * from './censor/BuiltinStrategies';
export * from './censor/TextCensor';

// Dataset support (phrases with associated metadata).
export * from './dataset/DataSet';

// Matchers.
export * from './matcher/regexp/RegExpMatcher';
export * from './matcher/BlacklistedTerm';
export * from './matcher/MatchPayload';
export * from './matcher/Matcher';

// Pattern parsing.
export * from './pattern/Nodes';
export * from './pattern/ParserError';
export * from './pattern/Pattern';

// Built-in English preset.
export * from './preset/english';

// Character transformers.
export * from './transformer/collapse-duplicates';
export * from './transformer/remap-characters';
export * from './transformer/resolve-confusables';
export * from './transformer/resolve-leetspeak';
export * from './transformer/skip-non-alphabetic';
export * from './transformer/to-ascii-lowercase';
23 |
--------------------------------------------------------------------------------
/src/matcher/BlacklistedTerm.ts:
--------------------------------------------------------------------------------
1 | import type { ParsedPattern } from '../pattern/Nodes';
2 |
/**
 * Represents a blacklisted term.
 */
export interface BlacklistedTerm {
	/**
	 * The identifier of the pattern; should be unique across all patterns.
	 * Matches report this value as [[MatchPayload.termId]].
	 */
	id: number;

	/**
	 * The parsed pattern.
	 */
	pattern: ParsedPattern;
}
17 |
18 | /**
19 | * Assigns incrementing IDs to the patterns provided, starting with 0. It is
20 | * useful if you have a list of patterns to match against but don't care about
21 | * identifying which pattern matched.
22 | *
23 | * @example
24 | * ```typescript
25 | * const matcher = new RegExpMatcher({
26 | * ...,
27 | * blacklistedTerms: assignIncrementingIds([
28 | * pattern`f?uck`,
29 | * pattern`|shit|`,
30 | * ]),
31 | * });
32 | * ```
33 | * @param patterns - List of parsed patterns.
34 | * @returns A list of blacklisted terms with valid IDs which can then be passed
35 | * to the [[RegExpMatcher]].
36 | */
37 | export function assignIncrementingIds(patterns: ParsedPattern[]) {
38 | let currentId = 0;
39 | return patterns.map((pattern) => ({ id: currentId++, pattern }));
40 | }
41 |
--------------------------------------------------------------------------------
/src/matcher/IntervalCollection.ts:
--------------------------------------------------------------------------------
1 | import type { Interval } from '../util/Interval';
2 |
3 | export class IntervalCollection implements Iterable {
4 | private dirty = false;
5 |
6 | private readonly intervals: Interval[] = [];
7 |
8 | public insert(lowerBound: number, upperBound: number) {
9 | this.intervals.push([lowerBound, upperBound]);
10 | this.dirty = true;
11 | }
12 |
13 | public query(lowerBound: number, upperBound: number) {
14 | if (this.intervals.length === 0) return false;
15 | if (this.dirty) {
16 | this.dirty = false;
17 | // Sort by lower bound.
18 | this.intervals.sort(
19 | /* istanbul ignore next: not possible to write a robust test for this */
20 | (a, b) => (a[0] < b[0] ? -1 : b[0] < a[0] ? 1 : 0),
21 | );
22 | }
23 |
24 | for (const interval of this.intervals) {
25 | // Since the intervals are sorted by lower bound, if we see an
26 | // interval with a lower bound greater than the target, we can skip
27 | // checking all the ones after it as it's impossible that they fully
28 | // contain the target interval.
29 | if (interval[0] > lowerBound) break;
30 | if (interval[0] <= lowerBound && upperBound <= interval[1]) return true;
31 | }
32 |
33 | return false;
34 | }
35 |
36 | public values() {
37 | return this.intervals.values();
38 | }
39 |
40 | public [Symbol.iterator]() {
41 | return this.values();
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/matcher/MatchPayload.ts:
--------------------------------------------------------------------------------
1 | import { compareIntervals } from '../util/Interval';
2 |
/**
 * Information emitted on a successful match.
 *
 * If you require more information about matches than what is provided here, see
 * the [[DataSet]] class, which supports associating metadata with patterns.
 */
export interface MatchPayload {
	/**
	 * End index of the match, inclusive.
	 *
	 * If the last character of the pattern is a surrogate pair,
	 * then this points to the index of the low surrogate.
	 */
	endIndex: number;

	/**
	 * Total number of code points that matched.
	 */
	matchLength: number;

	/**
	 * Start index of the match, inclusive.
	 */
	startIndex: number;

	/**
	 * ID of the blacklisted term that matched; see [[BlacklistedTerm.id]].
	 */
	termId: number;
}
33 |
34 | /**
35 | * Compares two match payloads.
36 | *
37 | * If the first match payload's start index is less than the second's, `-1` is
38 | * returned;
39 | * If the second match payload's start index is less than the first's, `1` is
40 | * returned;
41 | * If the first match payload's end index is less than the second's, `-1` is
42 | * returned;
43 | * If the second match payload's end index is less than the first's, `1` is
44 | * returned;
45 | * If the first match payload's term ID is less than the second's, `-1` is
46 | * returned;
47 | * If the first match payload's term ID is equal to the second's, `0` is
48 | * returned;
49 | * Otherwise, `1` is returned.
50 | *
51 | * @param a - First match payload.
52 | * @param b - Second match payload.
53 | * @returns The result of the comparison: -1 if the first should sort lower than
54 | * the second, 0 if they are the same, and 1 if the second should sort lower
55 | * than the first.
56 | */
57 | export function compareMatchByPositionAndId(a: MatchPayload, b: MatchPayload) {
58 | const result = compareIntervals(a.startIndex, a.endIndex, b.startIndex, b.endIndex);
59 | if (result !== 0) return result;
60 | return a.termId === b.termId ? 0 : a.termId < b.termId ? -1 : 1;
61 | }
62 |
--------------------------------------------------------------------------------
/src/matcher/Matcher.ts:
--------------------------------------------------------------------------------
1 | import type { MatchPayload } from './MatchPayload';
2 |
/**
 * Searches for blacklisted terms in text, ignoring parts matched by whitelisted
 * terms.
 *
 * See:
 * - [[RegExpMatcher]] for an implementation using regular expressions.
 */
export interface Matcher {
	/**
	 * Returns all matches of blacklisted terms in the text.
	 *
	 * If you only need to check for the presence of a match, and do not need
	 * more specific information about the matches, use the `hasMatch()` method,
	 * which is typically more efficient.
	 *
	 * @param input - Text to find profanities in.
	 * @param sorted - Whether the resulting list of matches should be sorted
	 * using [[compareMatchByPositionAndId]]. Defaults to `false`.
	 * @returns A list of matches of the matcher on the text. The matches are
	 * guaranteed to be sorted if and only if the `sorted` parameter is `true`,
	 * otherwise, their order is unspecified.
	 */
	getAllMatches(input: string, sorted?: boolean): MatchPayload[];

	/**
	 * Checks whether there is a match for any blacklisted term in the text.
	 *
	 * This is typically more efficient than calling `getAllMatches` and
	 * checking the result, though it depends on the implementation.
	 *
	 * @param input - Text to check.
	 * @returns Whether any blacklisted term matched the text.
	 */
	hasMatch(input: string): boolean;
}
37 |
--------------------------------------------------------------------------------
/src/pattern/Nodes.ts:
--------------------------------------------------------------------------------
/**
 * A parsed pattern.
 */
export interface ParsedPattern {
	/**
	 * A list of nodes which make up the pattern.
	 */
	nodes: Node[];

	/**
	 * Whether the pattern requires a word boundary at the end (a trailing `|`
	 * in the pattern source).
	 */
	requireWordBoundaryAtEnd: boolean;

	/**
	 * Whether the pattern requires a word boundary at the start (a leading `|`
	 * in the pattern source).
	 */
	requireWordBoundaryAtStart: boolean;
}
20 |
/**
 * All the possible kinds of nodes.
 *
 * Note that [[BoundaryAssertionNode]] is not part of this union: word boundary
 * assertions are represented by the `requireWordBoundaryAt*` flags on
 * [[ParsedPattern]] instead.
 */
export type Node = LiteralNode | OptionalNode | WildcardNode;
25 |
/**
 * An enumeration of the kinds of nodes there are.
 */
export enum SyntaxKind {
	/** An optional expression, e.g. `[abc]`; see [[OptionalNode]]. */
	Optional,

	/** A wildcard, `?`; see [[WildcardNode]]. */
	Wildcard,

	/** A run of literal characters; see [[LiteralNode]]. */
	Literal,

	/** A word boundary assertion, `|`; see [[BoundaryAssertionNode]]. */
	BoundaryAssertion,
}
35 |
/**
 * An optional node.
 */
export interface OptionalNode {
	/**
	 * The node contained within the optional expression. For `[abc]`, this
	 * would be a literal node with the value `abc`.
	 */
	childNode: LiteralNode | WildcardNode;

	/** Discriminant for narrowing [[Node]] unions. */
	kind: SyntaxKind.Optional;
}
48 |
/**
 * A wildcard node, matching any single character.
 */
export interface WildcardNode {
	/** Discriminant for narrowing [[Node]] unions. */
	kind: SyntaxKind.Wildcard;
}
55 |
/**
 * A literal node.
 */
export interface LiteralNode {
	/**
	 * The code points that this literal matches, in order.
	 */
	chars: number[];

	/** Discriminant for narrowing [[Node]] unions. */
	kind: SyntaxKind.Literal;
}
67 |
/**
 * A boundary assertion node.
 */
export interface BoundaryAssertionNode {
	/** Discriminant identifying this node as a boundary assertion. */
	kind: SyntaxKind.BoundaryAssertion;
}
74 |
--------------------------------------------------------------------------------
/src/pattern/ParserError.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Custom error thrown by the parser when syntactical errors are detected.
3 | */
4 | export class ParserError extends Error {
5 | public readonly name = 'ParserError';
6 |
7 | /**
8 | * The line on which the error occurred (one-based).
9 | */
10 | public readonly line: number;
11 |
12 | /**
13 | * The column on which the error occurred (one-based).
14 | * Note that surrogate pairs are counted as 1 column wide, not 2.
15 | */
16 | public readonly column: number;
17 |
18 | public constructor(message: string, line: number, column: number) {
19 | super(`${line}:${column}: ${message}`);
20 | this.line = line;
21 | this.column = column;
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/pattern/Pattern.ts:
--------------------------------------------------------------------------------
1 | import { Parser } from './Parser';
2 |
// Shared Parser instance reused by both `pattern` and `parseRawPattern`.
const parser = new Parser();
4 |
5 | /**
6 | * Parses a pattern, which matches a set of strings; see the `Syntax` section
7 | * for details. This function is intended to be called as a [template
8 | * tag](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#tagged_templates).
9 | *
10 | * **Syntax**
11 | *
12 | * Generally speaking, in patterns, characters are interpreted literally. That
13 | * is, they match exactly what they are: `a` matches an `a`, `b` matches a `b`,
14 | * `;` matches a `;`, and so on.
15 | *
16 | * However, there are several constructs that have special meaning:
17 | *
18 | * - `[expr]` matches either the empty string or `expr` (an **optional
19 | * expression**). `expr` may be a sequence of literal characters or a wildcard
20 | * (see below).
21 | * - `?` matches any character (a **wildcard**).
22 | * - A `|` at the start or end of the pattern asserts position at a word
23 | * boundary (a **word boundary assertion**). If `|` is at the start, it
24 | * ensures that the match either starts at the start of the string or a non-
25 | * word character preceding it; if it is at the end, it ensures that the match
26 | * either ends at the end of the string or a non-word character follows it.
27 | *
28 | * A word character is an lower-case or upper-case ASCII alphabet character or
29 | * an ASCII digit.
30 | * - In a literal, a backslash may be used to **escape** one of the
31 | * meta-characters mentioned above so that it does match literally: `\\[`
32 | * matches `[`, and does not mark the start of an optional expression.
33 | *
34 | * **Note about escapes**
35 | *
36 | * As this function operates on raw strings, double-escaping backslashes is
37 | * not necessary:
38 | *
39 | * ```typescript
40 | * // Use this:
41 | * const parsed = pattern`hello \[`;
42 | * // Don't use this:
43 | * const parsed = pattern`hello \\[`;
44 | * ```
45 | *
46 | * **Examples**
47 | *
48 | * - `baz` matches `baz` exactly.
49 | *
50 | * - `b\[ar` matches `b[ar` exactly.
51 | *
52 | * - `d?ude` matches `d`, then any character, then `ude`. All of the following
53 | * strings are matched by this pattern:
54 | * - `dyude`
55 | * - `d;ude`
56 | * - `d!ude`
57 | *
58 | * - `h[?]ello` matches either `h`, any character, then `ello` or the literal
59 | * string `hello`. The set of strings it matches is equal to the union of the
60 | * set of strings that the two patterns `hello` and `h?ello` match. All of the
61 | * following strings are matched by this pattern:
62 | * - `hello`
63 | * - `h!ello`
64 | * - `h;ello`
65 | *
66 | * - `|foobar|` asserts position at a word boundary, matches the literal string
67 | * `foobar`, and asserts position at a word boundary:
68 | * - `foobar` matches, as the start and end of string count as word
69 | * boundaries;
70 | * - `yofoobar` does _not_ match, as `f` is immediately preceded by a word
71 | * character;
72 | * - `hello foobar bye` matches, as `f` is immediately preceded by a non-word
73 | * character, and `r` is immediately followed by a non-word character.
74 | *
75 | * **Grammar**
76 | *
77 | * ```
78 | * Pattern ::= '['? Atom* ']'?
79 | * Atom ::= Literal | Wildcard | Optional
80 | * Optional ::= '[' Literal | Wildcard ']'
81 | * Literal ::= (NON_SPECIAL | '\' SUPPORTS_ESCAPING)+
82 | *
83 | * NON_SPECIAL ::= _any character other than '\', '?', '[', ']', or '|'_
84 | * SUPPORTS_ESCAPING ::= '\' | '[' | ']' | '?' | '|'
85 | * ```
86 | *
87 | * @example
88 | * ```typescript
89 | * const parsed = pattern`hello?`; // match "hello", then any character
90 | * ```
91 | * @example
92 | * ```typescript
93 | * const parsed = pattern`w[o]rld`; // match "wrld" or "world"
94 | * ```
95 | * @example
96 | * ```typescript
97 | * const parsed = pattern`my initials are \[??\]`; // match "my initials are [", then any two characters, then a "]"
98 | * ```
99 | * @returns The parsed pattern, which can then be used with the
100 | * [[RegExpMatcher]].
101 | * @throws [[ParserError]] if a syntactical error was detected while parsing the
102 | * pattern.
103 | * @see [[parseRawPattern]] if you want to parse a string into a pattern without
104 | * using a template tag.
105 | */
106 | export function pattern(strings: TemplateStringsArray, ...expressions: unknown[]) {
107 | let result = strings.raw[0];
108 | for (const [i, expression] of expressions.entries()) {
109 | result += expression;
110 | result += strings.raw[i + 1];
111 | }
112 |
113 | return parser.parse(result);
114 | }
115 |
/**
 * Parses a string as a pattern directly.
 *
 * **Note**
 *
 * It is recommended to use the [[pattern | pattern template tag]] instead of
 * this function for literal patterns (i.e. ones without dynamic content).
 *
 * @param pattern - The string to parse.
 * @throws [[ParserError]] if a syntactical error was detected while parsing the
 * pattern.
 * @returns The parsed pattern, which can then be used with the
 * [[RegExpMatcher]].
 * @see [[pattern]] for the supported pattern syntax.
 */
export function parseRawPattern(pattern: string) {
	// Delegate directly to the shared parser instance.
	return parser.parse(pattern);
}
133 |
--------------------------------------------------------------------------------
/src/pattern/Util.ts:
--------------------------------------------------------------------------------
1 | import type { Node, ParsedPattern } from './Nodes';
2 | import { SyntaxKind } from './Nodes';
3 |
4 | export function potentiallyMatchesEmptyString(pattern: ParsedPattern) {
5 | return pattern.nodes.every((node) => node.kind === SyntaxKind.Optional);
6 | }
7 |
8 | export function compilePatternToRegExp(pattern: ParsedPattern) {
9 | let regExpStr = '';
10 | if (pattern.requireWordBoundaryAtStart) regExpStr += '\\b';
11 | for (const node of pattern.nodes) regExpStr += getRegExpStringForNode(node);
12 | if (pattern.requireWordBoundaryAtEnd) regExpStr += `\\b`;
13 | return new RegExp(regExpStr, 'gs');
14 | }
15 |
16 | const regExpSpecialChars = ['[', '.', '*', '+', '?', '^', '$', '{', '}', '(', ')', '|', '[', '\\', ']'].map((str) =>
17 | str.charCodeAt(0),
18 | );
19 |
20 | export function getRegExpStringForNode(node: Node): string {
21 | switch (node.kind) {
22 | case SyntaxKind.Literal: {
23 | let str = '';
24 | for (const char of node.chars) {
25 | if (regExpSpecialChars.includes(char)) str += '\\';
26 | str += String.fromCodePoint(char);
27 | }
28 |
29 | return str;
30 | }
31 |
32 | case SyntaxKind.Optional:
33 | return `(?:${getRegExpStringForNode(node.childNode)})?`;
34 | case SyntaxKind.Wildcard:
35 | return `.`;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/transformer/TransformerSet.ts:
--------------------------------------------------------------------------------
1 | import type { StatefulTransformer, TransformerContainer } from './Transformers';
2 | import { TransformerType } from './Transformers';
3 |
4 | export class TransformerSet {
5 | private readonly transformers: TransformerContainer[];
6 |
7 | private readonly statefulTransformers: (StatefulTransformer | undefined)[];
8 |
9 | public constructor(transformers: TransformerContainer[]) {
10 | this.transformers = transformers;
11 | this.statefulTransformers = Array.from({ length: this.transformers.length });
12 | for (let i = 0; i < this.transformers.length; i++) {
13 | const transformer = this.transformers[i];
14 | if (transformer.type === TransformerType.Stateful) {
15 | this.statefulTransformers[i] = transformer.factory();
16 | }
17 | }
18 | }
19 |
20 | public applyTo(char: number) {
21 | let transformed: number | undefined = char;
22 | for (let i = 0; i < this.transformers.length && transformed !== undefined; i++) {
23 | const transformer = this.transformers[i];
24 | if (transformer.type === TransformerType.Simple) transformed = transformer.transform(transformed);
25 | else transformed = this.statefulTransformers[i]!.transform(transformed);
26 | }
27 |
28 | return transformed;
29 | }
30 |
31 | public resetAll() {
32 | for (let i = 0; i < this.transformers.length; i++) {
33 | if (this.transformers[i].type === TransformerType.Stateful) {
34 | this.statefulTransformers[i]!.reset();
35 | }
36 | }
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/src/transformer/Transformers.ts:
--------------------------------------------------------------------------------
/**
 * All the possible transformer types.
 */
// NOTE(review): `const enum` members are inlined at compile time and are
// incompatible with `isolatedModules`-style builds — confirm the build setup
// supports this before changing compiler options.
export const enum TransformerType {
	/** Stateless transformer function; see [[createSimpleTransformer]]. */
	Simple,

	/** Transformer that keeps state across characters; see [[createStatefulTransformer]]. */
	Stateful,
}
8 |
/**
 * All the possible transformer container types, discriminated on the `type`
 * field.
 */
export type TransformerContainer = SimpleTransformerContainer | StatefulTransformerContainer;
13 |
14 | /**
15 | * Creates a container holding the transformer function provided. Simple
16 | * transformers are suitable for stateless transformations, e.g., a
17 | * transformation that maps certain characters to others. For transformations
18 | * that need to keep around state, see `createStatefulTransformer`.
19 | *
20 | * @example
21 | * ```typescript
22 | * function lowercaseToUppercase(char) {
23 | * return isLowercase(char) ? char - 32 : char;
24 | * }
25 | *
26 | * const transformer = createSimpleTransformer(lowercaseToUppercase);
27 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
28 | * ```
29 | * @example
30 | * ```typescript
31 | * function ignoreAllNonDigitChars(char) {
32 | * return isDigit(char) ? char : undefined;
33 | * }
34 | *
35 | * const transformer = createSimpleTransformer(ignoreAllNonDigitChars);
36 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
37 | * ```
38 | * @param transformer - Function that applies the transformation. It should
39 | * accept one argument, the input character, and return the transformed
40 | * character. A return value of `undefined` indicates that the character should
41 | * be ignored.
42 | * @returns A container holding the transformer, which can then be passed to the
43 | * [[RegExpMatcher]].
44 | */
45 | export function createSimpleTransformer(transformer: TransformerFn): SimpleTransformerContainer {
46 | return { type: TransformerType.Simple, transform: transformer };
47 | }
48 |
/**
 * Transforms input characters.
 *
 * @param char - Input character, as a code point.
 * @returns The transformed character. A return value of `undefined` indicates
 * that the character should be ignored.
 */
export type TransformerFn = (char: number) => number | undefined;
57 |
/**
 * Container for simple transformers.
 */
export interface SimpleTransformerContainer {
	/**
	 * The transformer function.
	 */
	transform: TransformerFn;

	/** Discriminant; always [[TransformerType.Simple]] for this container. */
	type: TransformerType.Simple;
}
69 |
70 | /**
71 | * Creates a container holding the stateful transformer. Stateful transformers
72 | * are objects which satisfy the `StatefulTransformer` interface. They are
73 | * suitable for transformations that require keeping around some state regarding
74 | * the characters previously transformed in the text.
75 | *
76 | * @example
77 | * ```typescript
78 | * class IgnoreDuplicateCharactersTransformer implements StatefulTransformer {
79 | * private lastChar = -1;
80 | *
81 | * public transform(char: number) {
82 | * if (char === this.lastChar) return undefined;
83 | * this.lastChar = char;
84 | * return char;
85 | * }
86 | *
87 | * public reset() {
88 | * this.lastChar = -1;
89 | * }
90 | * }
91 | *
92 | * const transformer = createStatefulTransformer(() => new IgnoreDuplicateCharactersTransformer());
93 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
94 | * ```
95 | * @param factory A function that returns an instance of the stateful
96 | * transformer.
97 | * @returns A container holding the stateful transformer, which can then be
98 | * passed to the [[RegExpMatcher]].
99 | */
100 | export function createStatefulTransformer(factory: StatefulTransformerFactory): StatefulTransformerContainer {
101 | return { type: TransformerType.Stateful, factory };
102 | }
103 |
/**
 * A function that returns an instance of a stateful transformer. Called once
 * per consumer so state is never shared between them.
 */
export type StatefulTransformerFactory = () => StatefulTransformer;
108 |
/**
 * An interface that stateful transformers should implement.
 */
export interface StatefulTransformer {
	/**
	 * Resets the state of the transformer. Called between separate runs over
	 * different texts (see [[TransformerSet.resetAll]]).
	 */
	reset(): void;

	/**
	 * Transforms input characters.
	 *
	 * @param char - Input character.
	 * @returns The transformed character. A return value of `undefined` indicates
	 * that the character should be ignored.
	 */
	transform: TransformerFn;
}
127 |
/**
 * Container for stateful transformers.
 */
export interface StatefulTransformerContainer {
	/** Factory producing instances of the stateful transformer. */
	factory: StatefulTransformerFactory;

	/** Discriminant; always [[TransformerType.Stateful]] for this container. */
	type: TransformerType.Stateful;
}
135 |
--------------------------------------------------------------------------------
/src/transformer/collapse-duplicates/index.ts:
--------------------------------------------------------------------------------
1 | import { getAndAssertSingleCodePoint } from '../../util/Char';
2 | import { createStatefulTransformer } from '../Transformers';
3 | import { CollapseDuplicatesTransformer } from './transformer';
4 |
5 | /**
6 | * Creates a transformer that collapses duplicate characters. This is useful for
7 | * detecting variants of patterns in which a character is repeated to bypass
8 | * detection.
9 | *
10 | * As an example, the pattern `hi` does not match `hhiii` by default, as the
11 | * frequency of the characters does not match. With this transformer, `hhiii`
12 | * would become `hi`, and would therefore match the pattern.
13 | *
14 | * **Application order**
15 | *
16 | * It is recommended that this transformer be applied after all other
17 | * transformers. Using it before other transformers may have the effect of not
18 | * catching duplicates of certain characters that were originally different but
19 | * became the same after a series of transformations.
20 | *
21 | * **Warning**
22 | *
23 | * This transformer should be used with caution, as while it can make certain
24 | * patterns match text that wouldn't have been matched before, it can also go
25 | * the other way. For example, the pattern `hello` clearly matches `hello`, but
26 | * with this transformer, by default, `hello` would become `helo` which does
27 | * _not_ match. In this cases, the `customThresholds` option can be used to
28 | * allow two `l`s in a row, making it leave `hello` unchanged.
29 | *
30 | * @example
31 | * ```typescript
32 | * // Collapse runs of the same character.
33 | * const transformer = collapseDuplicatesTransformer();
34 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
35 | * ```
36 | * @example
37 | * ```typescript
38 | * // Collapse runs of characters other than 'a'.
39 | * const transformer = collapseDuplicatesTransformer({ customThresholds: new Map([['a', Infinity]]) });
40 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
41 | * ```
42 | * @param options - Options for the transformer.
43 | * @returns A container holding the transformer, which can then be passed to the
44 | * [[RegExpMatcher]].
45 | */
46 | export function collapseDuplicatesTransformer({
47 | defaultThreshold = 1,
48 | customThresholds = new Map(),
49 | }: CollapseDuplicatesTransformerOptions = {}) {
50 | const map = createCharacterToThresholdMap(customThresholds);
51 | return createStatefulTransformer(
52 | () => new CollapseDuplicatesTransformer({ defaultThreshold, customThresholds: map }),
53 | );
54 | }
55 |
56 | function createCharacterToThresholdMap(customThresholds: Map) {
57 | const map = new Map();
58 | for (const [str, threshold] of customThresholds) {
59 | if (threshold < 0) throw new RangeError('Expected all thresholds to be non-negative.');
60 | const char = getAndAssertSingleCodePoint(str);
61 | map.set(char, threshold);
62 | }
63 |
64 | return map;
65 | }
66 |
/**
 * Normalized options for the collapse-duplicates transformer: thresholds are
 * keyed by code point (rather than by string) and defaults have been applied.
 */
export interface ProcessedCollapseDuplicatesTransformerOptions {
	/** Maximum accepted run length, per code point. */
	customThresholds: Map;

	/** Maximum accepted run length for characters with no custom threshold. */
	defaultThreshold: number;
}
71 |
/**
 * Options for the collapse duplicates transformer.
 */
export interface CollapseDuplicatesTransformerOptions {
	/**
	 * Custom thresholds for characters. If a character has an entry
	 * corresponding to it, the value of the entry will be used as the maximum
	 * length of character runs comprised of said character before they are
	 * collapsed.
	 *
	 * The intended use-case for this option is for characters which appear
	 * more than once in a row in patterns. For example, the word `book` has
	 * two `o`s in a row, and matches `book`. With this transformer, though,
	 * the text `book` would become `bok`, meaning the pattern `book` would no
	 * longer match it. The fix would be to add an entry corresponding to `o`
	 * that overrides its threshold to be `2`, with the effect of leaving
	 * `book` unchanged.
	 *
	 * @default new Map()
	 */
	customThresholds?: Map;

	/**
	 * The maximum number of characters in a run that will be accepted before
	 * they will be collapsed.
	 *
	 * For example, if this value was `2`, `aa` would stay the same but `aaa`
	 * would be transformed to `aa`.
	 *
	 * @default 1
	 */
	defaultThreshold?: number;
}
104 |
--------------------------------------------------------------------------------
/src/transformer/collapse-duplicates/transformer.ts:
--------------------------------------------------------------------------------
1 | import type { StatefulTransformer } from '../Transformers';
2 | import type { ProcessedCollapseDuplicatesTransformerOptions } from '.';
3 |
4 | export class CollapseDuplicatesTransformer implements StatefulTransformer {
5 | private readonly defaultThreshold: number;
6 |
7 | private readonly customThresholds: Map;
8 |
9 | private remaining = -1;
10 |
11 | private lastChar = -1;
12 |
13 | public constructor({ defaultThreshold, customThresholds }: ProcessedCollapseDuplicatesTransformerOptions) {
14 | this.defaultThreshold = defaultThreshold;
15 | this.customThresholds = customThresholds;
16 | }
17 |
18 | public transform(char: number) {
19 | if (char === this.lastChar) {
20 | return this.remaining-- > 0 ? char : undefined;
21 | }
22 |
23 | const threshold = this.customThresholds.get(char) ?? this.defaultThreshold;
24 | this.remaining = threshold - 1;
25 | this.lastChar = char;
26 | return threshold > 0 ? char : undefined;
27 | }
28 |
29 | public reset() {
30 | this.remaining = -1;
31 | this.lastChar = -1;
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/transformer/remap-characters/index.ts:
--------------------------------------------------------------------------------
1 | import { getAndAssertSingleCodePoint } from '../../util/Char';
2 | import { CharacterIterator } from '../../util/CharacterIterator';
3 | import { createSimpleTransformer } from '../Transformers';
4 |
5 | /**
6 | * Maps certain characters to other characters, leaving other characters
7 | * unchanged.
8 | *
9 | * **Application order**
10 | *
11 | * It is recommended that this transformer be applied near the start of the
12 | * transformer chain.
13 | *
14 | * @example
15 | * ```typescript
16 | * // Transform 'a' to 'b'.
17 | * const transformer = remapCharactersTransformer({ 'b': 'a' });
18 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
19 | * ```
20 | * @example
21 | * ```typescript
22 | * // Transform '🅱️' to 'b', and use a map instead of an object as the argument.
23 | * const transformer = remapCharactersTransformer(new Map([['b', '🅱️']]));
24 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
25 | * ```
26 | * @example
27 | * ```typescript
28 | * // Transform '🇴' and '0' to 'o'.
29 | * const transformer = remapCharactersTransformer({ o: '🇴0' });
30 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
31 | * ```
32 | * @param mapping - A map/object mapping certain characters to others.
33 | * @returns A container holding the transformer, which can then be passed to the
34 | * [[RegExpMatcher]].
35 | * @see [[resolveConfusablesTransformer| Transformer that handles confusable Unicode characters]]
36 | * @see [[resolveLeetSpeakTransformer | Transformer that handles leet-speak]]
37 | */
38 | export function remapCharactersTransformer(mapping: CharacterMapping) {
39 | const map = createOneToOneMap(mapping);
40 | return createSimpleTransformer((c) => map.get(c) ?? c);
41 | }
42 |
43 | function createOneToOneMap(mapping: CharacterMapping) {
44 | const map = new Map();
45 | const iterable = mapping instanceof Map ? mapping.entries() : Object.entries(mapping);
46 | for (const [original, equivalents] of iterable) {
47 | const originalChar = getAndAssertSingleCodePoint(original);
48 | const iter = new CharacterIterator(equivalents);
49 | for (const equivalent of iter) map.set(equivalent, originalChar);
50 | }
51 |
52 | return map;
53 | }
54 |
55 | /**
56 | * Maps characters to other characters.
57 | * The key of the map/object should be the transformed character, while the value
58 | * should be a set of characters that map to the transformed character.
59 | */
60 | export type CharacterMapping = Map | Record;
61 |
--------------------------------------------------------------------------------
/src/transformer/resolve-confusables/confusables.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * Maps confusable Unicode characters to their normalized equivalents.
3 | *
4 | * @copyright
5 | * The data here is taken from the
6 | * [confusables](https://github.com/gc/confusables) library.
7 | *
8 | * ```text
9 | * # The MIT License (MIT)
10 | *
11 | * Copyright © 2019 https://github.com/gc/
12 | *
13 | * Permission is hereby granted, free of charge, to any person
14 | * obtaining a copy of this software and associated documentation
15 | * files (the “Software”), to deal in the Software without
16 | * restriction, including without limitation the rights to use,
17 | * copy, modify, merge, publish, distribute, sublicense, and/or sell
18 | * copies of the Software, and to permit persons to whom the
19 | * Software is furnished to do so, subject to the following
20 | * conditions:
21 | *
22 | * The above copyright notice and this permission notice shall be
23 | * included in all copies or substantial portions of the Software.
24 | *
25 | * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
26 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
27 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
29 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
30 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
32 | * OTHER DEALINGS IN THE SOFTWARE.
33 | * ```
34 | */
// Keys are canonical characters; each value is a string whose characters are
// all remapped to the key by remapCharactersTransformer (see
// resolve-confusables/index.ts). Values may contain astral (multi-code-unit)
// characters; they are iterated per code point, not per code unit.
export const confusables = new Map([
	[' ', ' '],
	['0', '⓿'],
	['1', '⓵➊⑴¹𝟏𝟙1𝟷𝟣⒈𝟭1➀₁①❶⥠'],
	['2', '⓶⒉⑵➋ƻ²ᒿ𝟚2𝟮𝟤ᒾ𝟸Ƨ𝟐②ᴤ₂➁❷ᘝƨ'],
	['3', '³ⳌꞫ𝟑ℨ𝟛𝟯𝟥Ꝫ➌ЗȜ⓷ӠƷ3𝟹⑶⒊ʒʓǯǮƺ𝕴ᶾзᦡ➂③₃ᶚᴣᴟ❸ҘҙӬӡӭӟӞ'],
	['4', '➍ҶᏎ𝟜ҷ⓸ҸҹӴӵᶣ4чㄩ⁴➃₄④❹Ӌ⑷⒋'],
	['5', '𝟱⓹➎Ƽ𝟓𝟻𝟝𝟧5➄₅⑤⁵❺ƽ⑸⒌'],
	['6', 'ⳒᏮ𝟞𝟨𝟔➏⓺Ϭϭ⁶б6ᧈ⑥➅₆❻⑹⒍'],
	['7', '⓻𐓒➐7⁷⑦₇❼➆⑺⒎'],
	['8', '𐌚➑⓼8𝟠𝟪৪⁸₈𝟴➇⑧❽𝟾𝟖⑻⒏'],
	['9', 'ꝮⳊ⓽➒੧৭୨9𝟫𝟿𝟗⁹₉Գ➈⑨❾⑼⒐'],
	['A', '🄰Ꭿ𐊠𝕬𝜜𝐴ꓮᎪ𝚨ꭺ𝝖🅐Å∀🇦₳🅰𝒜𝘈𝐀𝔸дǺᗅⒶAΑᾋᗩĂÃÅǍȀȂĀȺĄʌΛλƛᴀᴬДАልÄₐᕱªǞӒΆẠẢẦẨẬẮẰẲẴẶᾸᾹᾺΆᾼᾈᾉᾊᾌᾍᾎᾏἈἉἊἋἌἍἎἏḀȦǠӐÀÁÂẤẪ𝛢𝓐𝙰𝘼'],
	['a', '∂⍺ⓐձǟᵃᶏ⒜аɒaαȃȁคǎმäɑāɐąᾄẚạảǡầẵḁȧӑӓãåάὰάăẩằẳặᾀᾁᾂᾃᾅᾆᾰᾱᾲᾳᾴᶐᾶᾷἀἁἂἃἄἅἆἇᾇậắàáâấẫǻⱥ𝐚𝑎𝒂𝒶𝓪𝔞𝕒𝖆𝖺𝗮𝘢𝙖𝚊𝛂𝛼𝜶𝝰𝞪⍶'],
	['B', '𐌁𝑩𝕭🄱𐊡𝖡𝘽ꓐ𝗕𝘉𝜝𐊂𝚩𝐁𝛣𝝗𝐵𝙱𝔹Ᏼᏼ𝞑Ꞵ𝔅🅑฿𝓑ᗿᗾᗽ🅱ⒷBвϐᗷƁ乃ßცჩ๖βɮБՅ๒ᙖʙᴮᵇጌḄℬΒВẞḂḆɃദᗹᗸᵝᙞᙟᙝᛒᙗᙘᴃ🇧'],
	['b', 'Ꮟ𝐛𝘣𝒷𝔟𝓫𝖇𝖻𝑏𝙗𝕓𝒃𝗯𝚋♭ᑳᒈbᖚᕹᕺⓑḃḅҍъḇƃɓƅᖯƄЬᑲþƂ⒝ЪᶀᑿᒀᒂᒁᑾьƀҌѢѣᔎ'],
	['C', 'ᏟⲤ🄲ꓚ𐊢𐌂🅲𐐕🅒☾ČÇⒸCↃƇᑕㄈ¢८↻ĈϾՇȻᙅᶜ⒞ĆҀĊ©टƆℂℭϹС匚ḈҪʗᑖᑡᑢᑣᑤᑥⅭ𝐂𝐶𝑪𝒞𝓒𝕮𝖢𝗖𝘊𝘾ᔍ'],
	['c', 'ⲥ𐐽ꮯĉcⓒćčċçҁƈḉȼↄсርᴄϲҫ꒝ςɽϛ𝙲ᑦ᧚𝐜𝑐𝒄𝒸𝓬𝔠𝕔𝖈𝖼𝗰𝘤𝙘𝚌₵🇨ᥴᒼⅽ'],
	['D', 'Ꭰ🄳𝔡𝖉𝔻𝗗𝘋𝙳𝐷𝓓𝐃𝑫𝕯𝖣𝔇𝘿ꭰⅅ𝒟ꓓ🅳🅓ⒹDƉᗪƊÐԺᴅᴰↁḊĐÞⅮᗞᑯĎḌḐḒḎᗫᗬᗟᗠᶛᴆ🇩'],
	['d', 'Ꮷꓒ𝓭ᵭ₫ԃⓓdḋďḍḑḓḏđƌɖɗᵈ⒟ԁⅾᶁԀᑺᑻᑼᑽᒄᑰᑱᶑ𝕕𝖽𝑑𝘥𝒅𝙙𝐝𝗱𝚍ⅆ𝒹ʠժ'],
	['E', 'ꭼ🄴𝙀𝔼𐊆𝚬ꓰ𝝚𝞔𝓔𝑬𝗘🅴🅔ⒺΈEƎἝᕮƐモЄᴇᴱᵉÉ乇ЁɆꂅ€ÈℰΕЕⴹᎬĒĔĖĘĚÊËԐỀẾỄỂẼḔḖẺȄȆẸỆȨḜḘḚἘἙἚἛἜῈΈӖὲέЀϵ🇪'],
	['e', '𝑒𝓮𝕖𝖊𝘦𝗲𝚎𝙚𝒆𝔢𝖾𝐞Ҿҿⓔe⒠èᧉéᶒêɘἔềếễ૯ǝєεēҽɛểẽḕḗĕėëẻěȅȇẹệȩɇₑęḝḙḛ℮еԑѐӗᥱёἐἑἒἓἕℯ'],
	['F', '🄵𐊇𝔉𝘍𐊥ꓝꞘ🅵🅕𝓕ⒻFғҒᖴƑԲϝቻḞℱϜ₣🇫Ⅎ'],
	['f', '𝐟𝖋ⓕfƒḟʃբᶠ⒡ſꊰʄ∱ᶂ𝘧'],
	['G', 'ꓖᏳ🄶Ꮐᏻ𝔾𝓖𝑮𝕲ꮐ𝒢𝙂𝖦𝙶𝔊𝐺𝐆🅶🅖ⒼGɢƓʛĢᘜᴳǴĠԌĜḠĞǦǤԍ₲🇬⅁'],
	['g', 'ⓖgǵĝḡğġǧģց૭ǥɠﻭﻮᵍ⒢ℊɡᧁ𝐠𝑔𝒈𝓰𝔤𝕘𝖌𝗀𝗴𝘨𝙜𝚐'],
	['H', '🄷𝜢ꓧ𝘏𝐻𝝜𝖧𐋏𝗛ꮋℍᎻℌⲎ𝑯𝞖🅷🅗ዞǶԋⒽHĤᚺḢḦȞḤḨḪĦⱧҢңҤῊΉῌἨἩἪἫἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟӉӈҥΉн卄♓𝓗ℋН𝐇𝙃𝙷ʜ𝛨Η𝚮ᕼӇᴴᵸ🇭'],
	['h', 'Һ⒣ђⓗhĥḣḧȟḥḩḫẖħⱨհһከኩኪካɦℎ𝐡𝒉𝒽𝓱𝔥𝕙𝖍𝗁𝗵𝘩𝙝𝚑իʰᑋᗁɧんɥ'],
	['I', '🄸ЇꀤᏆ🅸🅘إﺇٳأﺃٲٵⒾI៸ÌÍÎĨĪĬİÏḮỈǏȈȊỊĮḬƗェエῘῙῚΊἸἹἺἻἼἽἾⅠΪΊɪᶦᑊᥣ𝛪𝐈𝙄𝙸𝓵𝙡𝐼ᴵ𝚰𝑰🇮'],
	['i', 'ⓘiìíîĩīĭïḯỉǐȉȋịḭῐῑῒΐῖῗἰἱἲⅰⅼ∣ⵏ│׀ا١۱ߊᛁἳἴἵɨіὶίᶖ𝔦𝚒𝝸𝗂𝐢𝕚𝖎𝗶𝘪𝙞ίⁱᵢ𝓲⒤'],
	['J', '🄹🅹🅙ⒿJЈʝᒍנフĴʆวلյʖᴊᴶﻝጋɈⱼՂๅႱįᎫȷ丿ℐℑᒘᒙᒚᒛᒴᒵᒎᒏ🇯'],
	['j', 'ⓙjϳʲ⒥ɉĵǰјڶᶨ𝒿𝘫𝗷𝑗𝙟𝔧𝒋𝗃𝓳𝕛𝚓𝖏𝐣'],
	['K', '𝗞🄺𝜥𝘒ꓗ𝙆𝕂Ⲕ𝔎𝛫Ꮶ𝞙𝒦🅺🅚₭ⓀKĸḰќƘкҠκқҟӄʞҚКҡᴋᴷᵏ⒦ᛕЌጕḲΚKҜҝҞĶḴǨⱩϗӃ🇰'],
	['k', 'ⓚkḱǩḳķḵƙⱪᶄ𝐤𝘬𝗄𝕜𝜅𝜘𝜿𝝒𝝹𝞌𝞳𝙠𝚔𝑘𝒌ϰ𝛋𝛞𝟆𝗸𝓴𝓀'],
	['L', '🄻𐐛Ⳑ𝑳𝙻𐑃𝓛ⳑꮮᏞꓡ🅻🅛ﺈ└ⓁւLĿᒪ乚ՆʟꓶιԼᴸˡĹረḶₗΓլĻᄂⅬℒⱢᥧᥨᒻᒶᒷᶫﺎᒺᒹᒸᒫ⎳ㄥŁⱠﺄȽ🇱'],
	['l', 'ⓛlŀĺľḷḹļӀℓḽḻłレɭƚɫⱡ|Ɩ⒧ʅǀוןΙІ|ᶩӏ𝓘𝕀𝖨𝗜𝘐𝐥𝑙𝒍𝓁𝔩𝕝𝖑𝗅𝗹𝘭𝚕𝜤𝝞ı𝚤ɩι𝛊𝜄𝜾𝞲'],
	['M', '🄼𐌑𐊰ꓟⲘᎷ🅼🅜ⓂMмṂ൱ᗰ州ᘻო๓♏ʍᙏᴍᴹᵐ⒨ḾМṀ௱ⅯℳΜϺᛖӍӎ𝐌𝑀𝑴𝓜𝔐𝕄𝕸𝖬𝗠𝘔𝙈𝙼𝚳𝛭𝜧𝝡𝞛🇲'],
	['m', '₥ᵯ𝖒𝐦𝗆𝔪𝕞𝓂ⓜmനᙢ൩ḿṁⅿϻṃጠɱ៳ᶆ𝙢𝓶𝚖𝑚𝗺᧕᧗'],
	['N', '🄽ℕꓠ𝛮𝝢𝙽𝚴𝑵𝑁Ⲛ𝐍𝒩𝞜𝗡𝘕𝜨𝓝𝖭🅽₦🅝ЙЍⓃҋ៷NᴎɴƝᑎ几иՈռИהЛπᴺᶰŃ刀ክṄⁿÑПΝᴨոϖǸŇṆŅṊṈทŊӢӣӤӥћѝйᥢҊᴻ🇳'],
	['n', 'ח𝒏𝓷𝙣𝑛𝖓𝔫𝗇𝚗𝗻ᥒⓝήnǹᴒńñᾗηṅňṇɲņṋṉղຖՌƞŋ⒩ภกɳпʼnлԉȠἠἡῃդᾐᾑᾒᾓᾔᾕᾖῄῆῇῂἢἣἤἥἦἧὴήበቡቢባቤብቦȵ𝛈𝜂𝜼𝝶𝞰𝕟𝘯𝐧𝓃ᶇᵰᥥ∩'],
	[
		'O',
		'ꄲ🄾𐊒𝟬ꓳⲞ𐐄𐊫𐓂𝞞🅞⍥◯ⵁ⊖0⊝𝝤Ѳϴ𝚶𝜪ѺӦӨӪΌʘ𝐎ǑÒŎÓÔÕȌȎㇿ❍ⓄOὋロ❤૦⊕ØФԾΘƠᴼᵒ⒪ŐÖₒ¤◊Φ〇ΟОՕଠഠ௦סỒỐỖỔṌȬṎŌṐṒȮȰȪỎỜỚỠỞỢỌỘǪǬǾƟⵔ߀៰⍜⎔⎕⦰⦱⦲⦳⦴⦵⦶⦷⦸⦹⦺⦻⦼⦽⦾⦿⧀⧁⧂⧃ὈὉὊὌὍ',
	],
	[
		'o',
		'𝚘𝛐𝗈𝞼ဝⲟ𝙤၀𐐬𝔬𐓪𝓸🇴⍤○ϙ🅾𝒪𝖮𝟢𝟶𝙾𝘰𝗼𝕠𝜊𝐨𝝾𝞸ᐤⓞѳ᧐ᥲðoఠᦞՓòөӧóºōôǒȏŏồốȍỗổõσṍȭṏὄṑṓȯȫ๏ᴏőöѻоዐǭȱ০୦٥౦೦൦๐໐οօᴑ०੦ỏơờớỡởợọộǫøǿɵծὀὁόὸόὂὃὅ',
	],
	['P', '🄿ꓑ𝚸𝙿𝞠𝙋ꮲⲢ𝒫𝝦𝑃𝑷𝗣𝐏𐊕𝜬𝘗𝓟𝖯𝛲Ꮲ🅟Ҏ🅿ⓅPƤᑭ尸Ṗրφքᴘᴾᵖ⒫ṔアקРየᴩⱣℙΡῬᑸᑶᑷᑹᑬᑮ🇵₱'],
	['p', 'ҏ℗ⓟpṕṗƥᵽῥρрƿǷῤ⍴𝓹𝓅𝐩𝑝𝒑𝔭𝕡𝖕𝗉𝗽𝘱𝙥𝚙𝛒𝝆𝞺𝜌𝞀'],
	['Q', '🅀🆀🅠ⓆQℚⵕԚ𝐐𝑄𝑸𝒬𝓠𝚀𝘘𝙌𝖰𝕼𝔔𝗤🇶'],
	['q', 'ⓠqգ⒬۹զᑫɋɊԛ𝗊𝑞𝘲𝕢𝚚𝒒𝖖𝐪𝔮𝓺𝙦'],
	['R', '℞℟ꭱᏒ𐒴ꮢᎡꓣ🆁🅡ⓇRᴙȒʀᖇя尺ŔЯરƦᴿዪṚɌʁℛℜℝṘŘȐṜŖṞⱤ𝐑𝑅𝑹𝓡𝕽𝖱𝗥𝘙𝙍𝚁ᚱ🇷ᴚ'],
	['r', 'ⓡrŕṙřȑȓṛṝŗгՐɾᥬṟɍʳ⒭ɼѓᴦᶉ𝐫𝑟𝒓𝓇𝓻𝔯𝕣𝖗𝗋𝗿𝘳𝙧ᵲґᵣ'],
	['S', '🅂ꇙ𝓢𝗦Ꮪ𝒮Ꮥ𝚂𝐒ꓢ𝖲𝔖𝙎𐊖𝕾𐐠𝘚𝕊𝑆𝑺🆂🅢ⓈSṨŞֆՏȘˢ⒮ЅṠŠŚṤŜṦṢടᔕᔖᔢᔡᔣᔤ'],
	['s', 'ⓢꜱ𐑈ꮪsśṥŝṡšṧʂṣṩѕşșȿᶊక𝐬𝑠𝒔𝓈𝓼𝔰𝕤𝖘𝗌𝘀𝘴𝙨𝚜ގ🇸'],
	['T', '🅃🆃𐌕𝚻𝛵𝕋𝕿𝑻𐊱𐊗𝖳𝙏🝨𝝩𝞣𝚃𝘛𝑇ꓔ⟙𝐓Ⲧ𝗧⊤𝔗Ꭲꭲ𝒯🅣⏇⏉ⓉTтҬҭƬイŦԵτᴛᵀイፕϮŤ⊥ƮΤТ下ṪṬȚŢṰṮ丅丁ᐪ𝛕𝜏𝝉𝞃𝞽𝓣ㄒ🇹ጥ'],
	['t', 'ⓣtṫẗťṭțȶ੮էʇ†ţṱṯƭŧᵗ⒯ʈեƫ𝐭𝑡𝒕𝓉𝓽𝔱𝕥𝖙𝗍𝘁𝘵𝙩𝚝ナ'],
	['U', '🅄ꓴ𐓎꒤🆄🅤ŨŬŮᑗᑘǓǕǗǙⓊUȖᑌ凵ƱմԱꓵЦŪՄƲᙀᵁᵘ⒰ŰપÜՍÙÚÛṸṺǛỦȔƯỪỨỮỬỰỤṲŲṶṴɄᥩᑧ∪ᘮ⋃𝐔𝑈𝑼𝒰𝓤𝔘𝕌𝖀𝖴𝗨𝘜𝙐𝚄🇺'],
	['u', 'ὺύⓤuùũūừṷṹŭǖữᥙǚǜὗυΰนսʊǘǔúůᴜűųยûṻцሁüᵾᵤµʋủȕȗưứửựụṳṵʉῠῡῢΰῦῧὐὑϋύὒὓὔὕὖᥔ𝐮𝑢𝒖𝓊𝓾𝔲𝕦𝖚𝗎ᶙ'],
	['V', '🅅ꓦ𝑽𝖵𝘝Ꮩ𝚅𝙑𝐕🆅🅥ⓋVᐯѴᵛ⒱۷ṾⅴⅤṼ٧ⴸѶᐺᐻ🇻𝓥'],
	['v', 'ሀⓥv𝜐𝝊ṽṿ౮งѵעᴠνטᵥѷ៴ᘁ𝙫𝚟𝛎𝜈𝝂𝝼𝞶𝘷𝘃𝓿'],
	['W', '🅆ᏔᎳ𝑾ꓪ𝒲𝘞🆆Ⓦ🅦wWẂᾧᗯᥕ山ѠຟచաЩШώщฬшᙎᵂʷ⒲ฝሠẄԜẀŴẆẈധᘺѿᙡƜ₩🇼'],
	['w', 'ẁꮃẃⓦ⍵ŵẇẅẘẉⱳὼὠὡὢὣωὤὥὦὧῲῳῴῶῷⱲѡԝᴡώᾠᾡᾢᾣᾤᾥᾦɯ𝝕𝟉𝞏'],
	['X', '🞨🞩🞪🅇🞫🞬𐌗Ⲭꓫ𝖃𝞦𝘟𐊐𝚾𝝬𝜲Ꭓ𐌢𝖷𝑋𝕏𝔛𐊴𝗫🆇🅧❌Ⓧ𝓧XẊ᙭χㄨ𝒳ӾჯӼҳЖΧҲᵡˣ⒳אሸẌꊼⅩХ╳᙮ᕁᕽⅹᚷⵝ𝙓𝚇乂𝐗🇽'],
	['x', 'ⓧxхẋ×ₓ⤫⤬⨯ẍᶍ𝙭ӽ𝘹𝐱𝚡⨰メ𝔁'],
	['Y', 'Ⲩ𝚈𝑌𝗬𝐘ꓬ𝒀𝜰𐊲🆈🅨ⓎYὛƳㄚʏ⅄ϔ¥¥ՎϓγץӲЧЎሃŸɎϤΥϒҮỲÝŶỸȲẎỶỴῨῩῪΎὙὝὟΫΎӮӰҰұ𝕐🇾'],
	['y', '🅈ᎽᎩⓨyỳýŷỹȳẏÿỷуყẙỵƴɏᵞɣʸᶌү⒴ӳӱӯўУʎ'],
	['Z', '🅉ꓜ𝗭𝐙☡Ꮓ𝘡🆉🅩ⓏZẔƵ乙ẐȤᶻ⒵ŹℤΖŻŽẒⱫ🇿'],
	['z', 'ꮓⓩzźẑżžẓẕƶȥɀᴢጊʐⱬᶎʑᙆ'],
]);
106 |
--------------------------------------------------------------------------------
/src/transformer/resolve-confusables/index.ts:
--------------------------------------------------------------------------------
1 | import { remapCharactersTransformer } from '../remap-characters';
2 | import { confusables } from './confusables';
3 |
4 | /**
5 | * Creates a transformer that maps confusable Unicode characters to their
6 | * normalized equivalent. For example, `⓵`, `➊`, and `⑴` become `1` when using
7 | * this transformer.
8 | *
9 | * **Application order**
10 | *
11 | * It is recommended that this transformer be applied near the start of the
12 | * transformer chain.
13 | *
14 | * @example
15 | * ```typescript
16 | * const transformer = resolveConfusablesTransformer();
17 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
18 | * ```
19 | * @returns A container holding the transformer, which can then be passed to the
20 | * [[RegExpMatcher]].
21 | */
export function resolveConfusablesTransformer() {
	// Delegates to the generic character remapper using the bundled confusables table.
	return remapCharactersTransformer(confusables);
}
25 |
--------------------------------------------------------------------------------
/src/transformer/resolve-leetspeak/dictionary.ts:
--------------------------------------------------------------------------------
1 | export const dictionary = new Map([
2 | ['a', '@4'],
3 | ['c', '('],
4 | ['e', '3'],
5 | ['i', '1|!'],
6 | ['g', '6'],
7 | ['o', '0'],
8 | ['s', '$5'],
9 | ['t', '7'],
10 | ['z', '2'],
11 | ]);
12 |
--------------------------------------------------------------------------------
/src/transformer/resolve-leetspeak/index.ts:
--------------------------------------------------------------------------------
1 | import { remapCharactersTransformer } from '../remap-characters';
2 | import { dictionary } from './dictionary';
3 |
4 | /**
5 | * Creates a transformer that maps leet-speak characters to their normalized
6 | * equivalent. For example, `$` becomes `s` when using this transformer.
7 | *
8 | * **Application order**
9 | *
10 | * It is recommended that this transformer be applied near the start of the
11 | * transformer chain, but after similar transformers that map characters to
12 | * other characters, such as the [[resolveConfusablesTransformer | transformer
13 | * that resolves confusable Unicode characters]].
14 | *
15 | * @example
16 | * ```typescript
17 | * const transformer = resolveLeetSpeakTransformer();
18 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
19 | * ```
20 | * @returns A container holding the transformer, which can then be passed to the
21 | * [[RegExpMatcher]].
22 | */
export function resolveLeetSpeakTransformer() {
	// Delegates to the generic character remapper using the bundled leet-speak dictionary.
	return remapCharactersTransformer(dictionary);
}
26 |
--------------------------------------------------------------------------------
/src/transformer/skip-non-alphabetic/index.ts:
--------------------------------------------------------------------------------
1 | import { isAlphabetic } from '../../util/Char';
2 | import { createSimpleTransformer } from '../Transformers';
3 |
4 | /**
5 | * Creates a transformer that skips non-alphabetic characters (`a`-`z`,
6 | * `A`-`Z`). This is useful when matching text on patterns that are solely
7 | * comprised of alphabetic characters (the pattern `hello` does not match
8 | * `h.e.l.l.o` by default, but does with this transformer).
9 | *
10 | * **Warning**
11 | *
12 | * This transformation is not part of the default set of transformations, as
13 | * there are some known rough edges with false negatives; see
14 | * [#23](https://github.com/jo3-l/obscenity/issues/23) and
15 | * [#46](https://github.com/jo3-l/obscenity/issues/46) on the GitHub issue
16 | * tracker.
17 | *
18 | * **Application order**
19 | *
20 | * It is recommended that this transformer be applied near the end of the
21 | * transformer chain, if at all.
22 | *
23 | * @example
24 | * ```typescript
25 | * const transformer = skipNonAlphabeticTransformer();
26 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] });
27 | * ```
28 | * @returns A container holding the transformer, which can then be passed to the
29 | * [[RegExpMatcher]].
30 | */
31 | export function skipNonAlphabeticTransformer() {
32 | return createSimpleTransformer((c) => (isAlphabetic(c) ? c : undefined));
33 | }
34 |
--------------------------------------------------------------------------------
/src/transformer/to-ascii-lowercase/index.ts:
--------------------------------------------------------------------------------
1 | import { invertCaseOfAlphabeticChar, isUpperCase } from '../../util/Char';
2 | import { createSimpleTransformer } from '../Transformers';
3 |
4 | /**
5 | * Creates a transformer that changes all ASCII alphabet characters to
6 | * lower-case, leaving other characters unchanged.
7 | *
8 | * **Application order**
9 | *
10 | * It is recommended that this transformer be applied near the end of the
11 | * transformer chain. Using it before other transformers may have the effect of
12 | * making its changes useless as transformers applied after produce characters
13 | * of varying cases.
14 | *
15 | * @returns A container holding the transformer, which can then be passed to the
16 | * [[RegExpMatcher]].
17 | */
18 | export function toAsciiLowerCaseTransformer() {
19 | return createSimpleTransformer((c) => (isUpperCase(c) ? invertCaseOfAlphabeticChar(c) : c));
20 | }
21 |
--------------------------------------------------------------------------------
/src/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../tsconfig.base.json",
3 | "include": ["."],
4 | "compilerOptions": {
5 | "outDir": "../dist",
6 | "rootDir": ".",
7 | "baseUrl": ".",
8 | "composite": true,
9 | "declaration": true
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/util/Char.ts:
--------------------------------------------------------------------------------
// Named character codes used throughout the library. For BMP characters these
// are both the UTF-16 code unit and the code point.
export const enum CharacterCode {
	// ASCII letter and digit range boundaries.
	LowerA = 97,
	LowerZ = 122,
	UpperA = 65,
	UpperZ = 90,

	Zero = 48,
	Nine = 57,

	// Special characters; presumably significant to the pattern parser — confirm
	// against src/pattern/Parser.ts.
	LeftSquareBracket = 91,
	RightSquareBracket = 93,
	QuestionMark = 63,
	Backslash = 92,
	Newline = 10,
	VerticalBar = 124,

	// UTF-16 surrogate code unit ranges; see
	// https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs.
	HighSurrogateStart = 0xd800,
	HighSurrogateEnd = 0xdbff,
	LowSurrogateStart = 0xdc00,
	LowSurrogateEnd = 0xdfff,
}
22 |
23 | export function isHighSurrogate(char: number) {
24 | return CharacterCode.HighSurrogateStart <= char && char <= CharacterCode.HighSurrogateEnd;
25 | }
26 |
27 | export function isLowSurrogate(char: number) {
28 | return CharacterCode.LowSurrogateStart <= char && char <= CharacterCode.LowSurrogateEnd;
29 | }
30 |
31 | // See https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs.
32 | export function convertSurrogatePairToCodePoint(highSurrogate: number, lowSurrogate: number) {
33 | return (
34 | (highSurrogate - CharacterCode.HighSurrogateStart) * 0x400 +
35 | lowSurrogate -
36 | CharacterCode.LowSurrogateStart +
37 | 0x10000
38 | );
39 | }
40 |
41 | export function isWordChar(char: number) {
42 | return isDigit(char) || isAlphabetic(char);
43 | }
44 |
45 | export function isDigit(char: number) {
46 | return CharacterCode.Zero <= char && char <= CharacterCode.Nine;
47 | }
48 |
49 | export function isAlphabetic(char: number) {
50 | return isLowerCase(char) || isUpperCase(char);
51 | }
52 |
53 | export function isLowerCase(char: number) {
54 | return CharacterCode.LowerA <= char && char <= CharacterCode.LowerZ;
55 | }
56 |
57 | export function isUpperCase(char: number) {
58 | return CharacterCode.UpperA <= char && char <= CharacterCode.UpperZ;
59 | }
60 |
61 | // Input must be a lower-case or upper-case ASCII alphabet character.
62 | export function invertCaseOfAlphabeticChar(char: number) {
63 | return char ^ 0x20;
64 | }
65 |
66 | // Asserts that the string is comprised of one and only one code point,
67 | // then returns said code point.
68 | export function getAndAssertSingleCodePoint(str: string) {
69 | if ([...str].length !== 1) throw new RangeError(`Expected the input string to be one code point in length.`);
70 | return str.codePointAt(0)!;
71 | }
72 |
--------------------------------------------------------------------------------
/src/util/CharacterIterator.ts:
--------------------------------------------------------------------------------
1 | import { convertSurrogatePairToCodePoint, isHighSurrogate, isLowSurrogate } from './Char';
2 |
3 | export class CharacterIterator implements IterableIterator {
4 | private _input: string;
5 |
6 | private lastPosition = -1;
7 |
8 | private currentPosition = 0;
9 |
10 | private _lastWidth = 0;
11 |
12 | public constructor(input?: string) {
13 | this._input = input ?? '';
14 | }
15 |
16 | public get input() {
17 | return this._input;
18 | }
19 |
20 | public setInput(input: string) {
21 | this._input = input;
22 | this.reset();
23 | return this;
24 | }
25 |
26 | public reset() {
27 | this.lastPosition = -1;
28 | this.currentPosition = 0;
29 | this._lastWidth = 0;
30 | }
31 |
32 | public next(): IteratorResult {
33 | if (this.done) return { done: true, value: undefined };
34 | this.lastPosition = this.currentPosition;
35 |
36 | const char = this._input.charCodeAt(this.currentPosition++);
37 | this._lastWidth = 1;
38 | // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
39 | if (this.done || !isHighSurrogate(char)) return { done: false, value: char };
40 |
41 | // Do we have a surrogate pair?
42 | const next = this._input.charCodeAt(this.currentPosition);
43 | if (isLowSurrogate(next)) {
44 | this._lastWidth++;
45 | this.currentPosition++;
46 | return { done: false, value: convertSurrogatePairToCodePoint(char, next) };
47 | }
48 |
49 | return { done: false, value: char };
50 | }
51 |
52 | // Position of the iterator; equals the start index of the last character consumed.
53 | // -1 if no characters were consumed yet.
54 | public get position() {
55 | return this.lastPosition;
56 | }
57 |
58 | // Width of the last character consumed; 2 if it was a surrogate pair and 1 otherwise.
59 | // 0 if no characters were consumed yet.
60 | public get lastWidth() {
61 | return this._lastWidth;
62 | }
63 |
64 | public get done() {
65 | return this.currentPosition >= this._input.length;
66 | }
67 |
68 | public [Symbol.iterator]() {
69 | return this;
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/util/Interval.ts:
--------------------------------------------------------------------------------
1 | export function compareIntervals(lowerBound0: number, upperBound0: number, lowerBound1: number, upperBound1: number) {
2 | if (lowerBound0 < lowerBound1) return -1;
3 | if (lowerBound1 < lowerBound0) return 1;
4 | if (upperBound0 < upperBound1) return -1;
5 | if (upperBound1 < upperBound0) return 1;
6 | return 0;
7 | }
8 |
9 | export type Interval = [lowerBound: number, upperBound: number];
10 |
--------------------------------------------------------------------------------
/test/censor/BuiltinStrategies.test.ts:
--------------------------------------------------------------------------------
1 | import {
2 | asteriskCensorStrategy,
3 | fixedCharCensorStrategy,
4 | fixedPhraseCensorStrategy,
5 | grawlixCensorStrategy,
6 | keepEndCensorStrategy,
7 | keepStartCensorStrategy,
8 | randomCharFromSetCensorStrategy,
9 | } from '../../src/censor/BuiltinStrategies';
10 | import type { CensorContext } from '../../src/censor/TextCensor';
11 |
// Shared CensorContext fields; each test spreads this and supplies matchLength
// (plus any overrides) to build a full context.
const partialCtx = {
	input: '',
	overlapsAtStart: false,
	overlapsAtEnd: false,
	termId: -1,
	startIndex: 0,
	endIndex: 0,
};
20 |
describe('keepStartCensorStrategy()', () => {
	// Stub base strategy: replaces a match with dots of the same length.
	const baseStrategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength));

	afterEach(() => {
		baseStrategy.mockClear();
	});

	it('should call the base strategy with the same arguments if overlapsAtStart is true', () => {
		const strategy = keepStartCensorStrategy(baseStrategy);
		const res = strategy({ ...partialCtx, matchLength: 5, overlapsAtStart: true });
		expect(res).toBe('.....');
		expect(baseStrategy).toHaveBeenCalledTimes(1);
		expect(baseStrategy).toHaveBeenLastCalledWith({ ...partialCtx, matchLength: 5, overlapsAtStart: true });
	});

	it('should call the base strategy with matchLength-1 and add the first character of the matched region', () => {
		const strategy = keepStartCensorStrategy(baseStrategy);
		const ctx = {
			input: 'hello world!',
			overlapsAtStart: false,
			overlapsAtEnd: false,
			termId: -1,
			startIndex: 6,
			endIndex: 10,
			matchLength: 5,
		};
		const res = strategy(ctx);
		// 'w' is input[6], the first character of the matched region.
		expect(res).toBe('w....');
		expect(baseStrategy).toHaveBeenCalledTimes(1);
		expect(baseStrategy).toHaveBeenLastCalledWith({ ...ctx, matchLength: 4 });
	});
});
53 |
describe('keepEndCensorStrategy()', () => {
	// Stub base strategy: replaces a match with dots of the same length.
	const baseStrategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength));

	afterEach(() => {
		baseStrategy.mockClear();
	});

	it('should call the base strategy with the same arguments if overlapsAtEnd is true', () => {
		const strategy = keepEndCensorStrategy(baseStrategy);
		const res = strategy({ ...partialCtx, matchLength: 5, overlapsAtEnd: true });
		expect(res).toBe('.....');
		expect(baseStrategy).toHaveBeenCalledTimes(1);
		expect(baseStrategy).toHaveBeenLastCalledWith({ ...partialCtx, matchLength: 5, overlapsAtEnd: true });
	});

	it('should call the base strategy with matchLength-1 and add the last character of the matched region', () => {
		const strategy = keepEndCensorStrategy(baseStrategy);
		const ctx = {
			input: 'hello world!',
			overlapsAtStart: false,
			overlapsAtEnd: false,
			termId: -1,
			startIndex: 6,
			endIndex: 10,
			matchLength: 5,
		};
		const res = strategy(ctx);
		// 'd' is input[10], the last character of the matched region.
		expect(res).toBe('....d');
		expect(baseStrategy).toHaveBeenCalledTimes(1);
		expect(baseStrategy).toHaveBeenLastCalledWith({ ...ctx, matchLength: 4 });
	});
});
86 |
87 | describe('asteriskCensorStrategy()', () => {
88 | it('should return strings that are made up of asterisks', () => {
89 | const strategy = asteriskCensorStrategy();
90 | expect(strategy({ ...partialCtx, matchLength: 8 })).toBe('********');
91 | });
92 | });
93 |
describe('grawlixCensorStrategy()', () => {
	it('should return strings that have characters taken from the charset %@$&*', () => {
		const charset = '%@$&*';
		const strategy = grawlixCensorStrategy();
		// Only membership in the charset is asserted, not the exact output.
		expect([...strategy({ ...partialCtx, matchLength: 20 })].every((c) => charset.includes(c))).toBeTruthy();
	});
});
101 |
describe('fixedPhraseCensorStrategy()', () => {
	it('should simply return the phrase given', () => {
		const strategy = fixedPhraseCensorStrategy('fixed phrase');
		// The phrase is returned verbatim regardless of the match length.
		expect(strategy({ ...partialCtx, matchLength: 30 })).toBe('fixed phrase');
	});
});
108 |
describe('fixedCharCensorStrategy()', () => {
	// The strategy requires its argument to be exactly one code point.
	it('should throw if the input string was empty', () => {
		expect(() => fixedCharCensorStrategy('')).toThrow(
			new RangeError(`Expected the input string to be one code point in length.`),
		);
	});

	it('should throw if the input string was comprised of more than one code point', () => {
		expect(() => fixedCharCensorStrategy('ab')).toThrow(
			new RangeError(`Expected the input string to be one code point in length.`),
		);
	});

	it('should not throw if the input string was a surrogate pair', () => {
		// A surrogate pair is two code units but one code point, so it is accepted.
		expect(() => fixedCharCensorStrategy('🌉')).not.toThrow();
	});

	it('should return the input string repeated N times (where N is the match length)', () => {
		const strategy = fixedCharCensorStrategy('x');
		expect(strategy({ ...partialCtx, matchLength: 7 })).toBe('xxxxxxx');
	});
});
131 |
describe('randomCharFromSetCensorStrategy()', () => {
	it('should throw if the charset has less than 2 characters', () => {
		expect(() => randomCharFromSetCensorStrategy('')).toThrow(
			new Error('The character set passed must have at least 2 characters.'),
		);
		expect(() => randomCharFromSetCensorStrategy('a')).toThrow(
			new Error('The character set passed must have at least 2 characters.'),
		);
	});

	it('should work for matchLength 0', () => {
		const strategy = randomCharFromSetCensorStrategy('abcdefghijk');
		expect(strategy({ ...partialCtx, matchLength: 0 })).toBe('');
	});

	it('should return N characters (where N is the match length) from the set of characters given', () => {
		const charset = 'abcdefghijk';
		const strategy = randomCharFromSetCensorStrategy(charset);
		expect([...strategy({ ...partialCtx, matchLength: 5 })].every((c) => charset.includes(c))).toBeTruthy();
	});

	it('should not repeat the same character twice in a row', () => {
		// With a 2-character set, the only length-3 outputs without immediate
		// repeats are 'aba' and 'bab'; repeat to cover the randomness.
		const strategy = randomCharFromSetCensorStrategy('ab');
		for (let i = 0; i < 100; i++) {
			expect(['aba', 'bab']).toContain(strategy({ ...partialCtx, matchLength: 3 }));
		}
	});
});
160 |
--------------------------------------------------------------------------------
/test/censor/TextCensor.test.ts:
--------------------------------------------------------------------------------
1 | import { grawlixCensorStrategy } from '../../src/censor/BuiltinStrategies';
2 | import type { CensorContext } from '../../src/censor/TextCensor';
3 | import { TextCensor } from '../../src/censor/TextCensor';
4 |
describe('TextCensor#setStrategy()', () => {
	it('should return the text censor', () => {
		// setStrategy() is chainable: it returns the same censor instance.
		const censor = new TextCensor();
		expect(censor.setStrategy(grawlixCensorStrategy())).toStrictEqual(censor);
	});
});
11 |
describe('TextCensor#applyTo()', () => {
	// Stub strategy: replaces each censored region with dots of the same length.
	const strategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength));

	afterEach(() => {
		strategy.mockClear();
	});

	it('should return the input unmodified if there are no matches', () => {
		const censor = new TextCensor().setStrategy(strategy);
		expect(censor.applyTo('text', [])).toBe('text');
		expect(strategy).not.toHaveBeenCalled();
	});

	it('should call the strategy for each non-overlapping match interval (no overlaps, 1 match)', () => {
		const censor = new TextCensor().setStrategy(strategy);
		const firstMatch = { termId: 0, matchLength: 11, startIndex: 3, endIndex: 13 };
		expect(censor.applyTo('my interesting input', [firstMatch])).toBe('my ........... input');
		expect(strategy).toHaveBeenCalledTimes(1);
		expect(strategy).toHaveBeenLastCalledWith({
			...firstMatch,
			input: 'my interesting input',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
	});

	it('should call the strategy for each non-overlapping match interval (no overlaps, 3 matches)', () => {
		const censor = new TextCensor().setStrategy(strategy);
		const firstMatch = { termId: 0, matchLength: 4, startIndex: 0, endIndex: 3 };
		const secondMatch = { termId: 0, matchLength: 2, startIndex: 8, endIndex: 9 };
		const thirdMatch = { termId: 0, matchLength: 5, startIndex: 22, endIndex: 26 };
		expect(censor.applyTo('this is my intriguing input', [firstMatch, secondMatch, thirdMatch])).toBe(
			'.... is .. intriguing .....',
		);
		expect(strategy).toHaveBeenCalledTimes(3);
		expect(strategy).toHaveBeenNthCalledWith(1, {
			...firstMatch,
			input: 'this is my intriguing input',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
		expect(strategy).toHaveBeenNthCalledWith(2, {
			...secondMatch,
			input: 'this is my intriguing input',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
		expect(strategy).toHaveBeenNthCalledWith(3, {
			...thirdMatch,
			input: 'this is my intriguing input',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
	});

	it('should call the strategy for each non-overlapping match interval (some overlaps, 2 matches)', () => {
		const censor = new TextCensor().setStrategy(strategy);
		const firstMatch = { termId: 0, matchLength: 5, startIndex: 0, endIndex: 4 };
		const secondMatch = { termId: 0, matchLength: 8, startIndex: 0, endIndex: 7 };
		expect(censor.applyTo('thinking of good test data is hard', [firstMatch, secondMatch])).toBe(
			'............. of good test data is hard',
		);
		expect(strategy).toHaveBeenCalledTimes(2);
		expect(strategy).toHaveBeenNthCalledWith(1, {
			...firstMatch,
			input: 'thinking of good test data is hard',
			overlapsAtStart: false,
			overlapsAtEnd: true,
		});
		// The overlapping portion is trimmed: the second match's startIndex is
		// adjusted past the region covered by the first match.
		expect(strategy).toHaveBeenNthCalledWith(2, {
			...secondMatch,
			input: 'thinking of good test data is hard',
			startIndex: 5,
			overlapsAtStart: true,
			overlapsAtEnd: false,
		});
	});

	it('should not call the strategy for matched intervals which are completely contained in another one', () => {
		const censor = new TextCensor().setStrategy(strategy);
		const firstMatch = { termId: 0, matchLength: 2, startIndex: 1, endIndex: 2 };
		const secondMatch = { termId: 0, matchLength: 1, startIndex: 2, endIndex: 2 };
		expect(censor.applyTo('tests', [firstMatch, secondMatch])).toBe('t..ts');
		expect(strategy).toHaveBeenCalledTimes(1);
		expect(strategy).toHaveBeenLastCalledWith({
			...firstMatch,
			input: 'tests',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
	});

	it('should not call the strategy for matched intervals which are equal to some other one', () => {
		const censor = new TextCensor().setStrategy(strategy);
		const firstMatch = { termId: 0, matchLength: 3, startIndex: 1, endIndex: 3 };
		const secondMatch = { termId: 1, matchLength: 3, startIndex: 1, endIndex: 3 };
		expect(censor.applyTo('heretical', [firstMatch, secondMatch])).toBe('h...tical');
		expect(strategy).toHaveBeenCalledTimes(1);
		expect(strategy).toHaveBeenLastCalledWith({
			...firstMatch,
			input: 'heretical',
			overlapsAtStart: false,
			overlapsAtEnd: false,
		});
	});
});
118 |
--------------------------------------------------------------------------------
/test/jest.setup.ts:
--------------------------------------------------------------------------------
1 | expect.extend({
2 | toBePermutationOf(this: jest.MatcherContext, received: T[], expected: T[]) {
3 | const options = {
4 | isNot: this.isNot,
5 | promise: this.promise,
6 | };
7 |
8 | if (received.length !== expected.length) {
9 | return {
10 | message: () => `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)}
11 |
12 | Expected: array of length ${expected.length} (${this.utils.printExpected(expected)})
13 | Received: array of length ${received.length} (${this.utils.printReceived(received)})`,
14 | pass: false,
15 | };
16 | }
17 |
18 | const copy = [...expected];
19 | let maxIndex = expected.length - 1;
20 |
21 | for (const element of received) {
22 | // See if there's an element in expected that hasn't been used yet and is
23 | // deeply equal to the current value.
24 | let pass = false;
25 | for (let i = maxIndex; i >= 0; i--) {
26 | pass = this.equals(element, copy[i]);
27 | if (pass) {
28 | // Swap the current element with the one at the maximum index,
29 | // then mark the maximum index as unusable.
30 | // This ensures that we don't mark two values in received as equal
31 | // to the same value in expected.
32 | copy[i] = copy[maxIndex--];
33 | break;
34 | }
35 | }
36 |
37 | if (!pass) {
38 | // No value in expected is deeply equal to the current value in received.
39 | const message = () => {
40 | return `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)}
41 |
42 | Expected: a permutation of ${this.utils.printExpected(expected)}
43 | Received: ${this.utils.printReceived(received)}
44 | }
45 | `;
46 | };
47 |
48 | return { message, pass: false };
49 | }
50 | }
51 |
52 | return {
53 | message: () => `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)}
54 |
55 | Expected: not a permutation of ${this.utils.printExpected(expected)}
56 | Received: ${this.utils.printReceived(received)}`,
57 | pass: true,
58 | };
59 | },
60 | });
61 |
62 | declare global {
63 | // eslint-disable-next-line @typescript-eslint/no-namespace
64 | namespace jest {
65 | interface Matchers {
66 | toBePermutationOf(expected: readonly any[]): R;
67 | }
68 | }
69 | }
70 |
71 | export {};
72 |
--------------------------------------------------------------------------------
/test/matcher/BlacklistedTerm.test.ts:
--------------------------------------------------------------------------------
1 | import { assignIncrementingIds } from '../../src/matcher/BlacklistedTerm';
2 | import { pattern } from '../../src/pattern/Pattern';
3 |
describe('assignIncrementingIds()', () => {
	it('should assign incrementing, unique IDs to the input patterns', () => {
		const firstPattern = pattern`|world|`;
		const secondPattern = pattern`:D`;
		const thirdPattern = pattern`??`;
		const fourthPattern = pattern`hmm interesting`;
		// IDs are assigned in input order, starting from 0.
		expect(assignIncrementingIds([firstPattern, secondPattern, thirdPattern, fourthPattern])).toStrictEqual([
			{ id: 0, pattern: firstPattern },
			{ id: 1, pattern: secondPattern },
			{ id: 2, pattern: thirdPattern },
			{ id: 3, pattern: fourthPattern },
		]);
	});
});
18 |
--------------------------------------------------------------------------------
/test/matcher/IntervalCollection.test.ts:
--------------------------------------------------------------------------------
1 | import { IntervalCollection } from '../../src/matcher/IntervalCollection';
2 |
3 | let coll: IntervalCollection;
4 |
5 | beforeEach(() => {
6 | coll = new IntervalCollection();
7 | });
8 |
9 | describe('IntervalCollection#insert()', () => {
10 | it('should add the interval to the collection', () => {
11 | coll.insert(5, 10);
12 | expect([...coll]).toBePermutationOf([[5, 10]]);
13 | coll.insert(12, 13);
14 | expect([...coll]).toBePermutationOf([
15 | [12, 13],
16 | [5, 10],
17 | ]);
18 | });
19 | });
20 |
21 | describe('IntervalCollection#query()', () => {
22 | it('should return false if the input interval does not intersect any of the stored intervals', () => {
23 | coll.insert(5, 10);
24 | coll.insert(13, 14);
25 | coll.insert(17, 19);
26 | expect(coll.query(3, 4)).toBeFalsy();
27 | });
28 |
29 | it('should return false if the interval collection is empty', () => {
30 | expect(coll.query(0, 0)).toBeFalsy();
31 | });
32 |
33 | it('should return true if there is some interval stored that such that the input interval is a subset of it', () => {
34 | coll.insert(8, 9);
35 | coll.insert(10, 12);
36 | coll.insert(13, 17);
37 | expect(coll.query(14, 15)).toBeTruthy();
38 | });
39 |
40 | it('should return false if the input interval simply overlaps with some of the stored intervals', () => {
41 | coll.insert(17, 19);
42 | coll.insert(20, 24);
43 | coll.insert(25, 44);
44 | expect(coll.query(34, 45)).toBeFalsy();
45 | });
46 | });
47 |
48 | it('should be iterable', () => {
49 | coll.insert(30, 35);
50 | coll.insert(47, 49);
51 | coll.insert(98, 99);
52 | expect([...coll]).toBePermutationOf([
53 | [30, 35],
54 | [47, 49],
55 | [98, 99],
56 | ]);
57 | });
58 |
--------------------------------------------------------------------------------
/test/matcher/MatchPayload.test.ts:
--------------------------------------------------------------------------------
1 | import { compareMatchByPositionAndId } from '../../src/matcher/MatchPayload';
2 | import { compareIntervals as _compareIntervals } from '../../src/util/Interval';
3 |
4 | jest.mock('../../src/util/Interval', () => ({ compareIntervals: jest.fn().mockReturnValue(0) }));
5 |
6 | const compareIntervals = _compareIntervals as jest.MockedFunction;
7 |
8 | afterEach(() => {
9 | compareIntervals.mockClear();
10 | });
11 |
12 | describe('compareMatchByPositionAndId()', () => {
13 | const termIdAndMatchLen = { termId: -1, matchLength: 0 };
14 |
15 | it('should call compareIntervals() and return its result if not zero', () => {
16 | compareIntervals.mockImplementationOnce(() => -1);
17 | expect(
18 | compareMatchByPositionAndId(
19 | { ...termIdAndMatchLen, startIndex: 5, endIndex: 7 },
20 | { ...termIdAndMatchLen, startIndex: 6, endIndex: 8 },
21 | ),
22 | ).toBe(-1);
23 | expect(compareIntervals).toHaveBeenCalledTimes(1);
24 | expect(compareIntervals).toHaveBeenLastCalledWith(5, 7, 6, 8);
25 | });
26 |
27 | const startAndEndIdxAndMatchLen = { startIndex: 0, endIndex: 0, matchLength: 0 };
28 |
29 | it("should return -1 if the first match payload's term ID is less than the second's and their positions are identical", () => {
30 | expect(
31 | compareMatchByPositionAndId(
32 | { ...startAndEndIdxAndMatchLen, termId: 0 },
33 | { ...startAndEndIdxAndMatchLen, termId: 3 },
34 | ),
35 | ).toBe(-1);
36 | });
37 |
38 | it("should return 1 if the first match payload's term ID is less than the second's and their positions are identical", () => {
39 | expect(
40 | compareMatchByPositionAndId(
41 | { ...startAndEndIdxAndMatchLen, termId: 50 },
42 | { ...startAndEndIdxAndMatchLen, termId: 30 },
43 | ),
44 | ).toBe(1);
45 | });
46 |
47 | it("should return 0 if the first match payload's term ID is equal to the first's and their positions are identical", () => {
48 | expect(
49 | compareMatchByPositionAndId(
50 | { ...startAndEndIdxAndMatchLen, termId: 34 },
51 | { ...startAndEndIdxAndMatchLen, termId: 34 },
52 | ),
53 | ).toBe(0);
54 | });
55 | });
56 |
--------------------------------------------------------------------------------
/test/pattern/ParserError.test.ts:
--------------------------------------------------------------------------------
1 | import { ParserError } from '../../src/pattern/ParserError';
2 |
3 | describe('ParserError#name', () => {
4 | it("should be equal to 'ParserError'", () => {
5 | const err = new ParserError('', 0, 0);
6 | expect(err.name).toBe('ParserError');
7 | });
8 | });
9 |
10 | describe('ParserError#line', () => {
11 | it('should be equal to the value passed to the constructor', () => {
12 | const err = new ParserError('', 1, 0);
13 | expect(err.line).toBe(1);
14 | });
15 | });
16 |
17 | describe('ParserError#column', () => {
18 | it('should be equal to the value passed to the constructor', () => {
19 | const err = new ParserError('', 0, 500);
20 | expect(err.column).toBe(500);
21 | });
22 | });
23 |
24 | describe('ParserError#message', () => {
25 | it("should be in the format 'line:column: message'", () => {
26 | const err = new ParserError('hi', 1, 10);
27 | expect(err.message).toBe('1:10: hi');
28 | });
29 | });
30 |
--------------------------------------------------------------------------------
/test/pattern/Pattern.test.ts:
--------------------------------------------------------------------------------
1 | import { Parser } from '../../src/pattern/Parser';
2 | import { parseRawPattern, pattern } from '../../src/pattern/Pattern';
3 |
4 | const parser = new Parser();
5 |
6 | describe('pattern template tag', () => {
7 | it('should parse the pattern given', () => {
8 | expect(pattern`hello world?`).toStrictEqual(parser.parse('hello world?'));
9 | });
10 |
11 | it('should not require double-escaping backslashes', () => {
12 | expect(pattern`hello escaped \[ :D`).toStrictEqual(parser.parse('hello escaped \\[ :D'));
13 | });
14 |
15 | it('should interpolate one expression appropriately', () => {
16 | const value = 123;
17 | expect(pattern`value=${value}`).toStrictEqual(parser.parse('value=123'));
18 | });
19 |
20 | it('should interpolate many expressions appropriately', () => {
21 | const value0 = 123;
22 | const value1 = 234;
23 | expect(pattern`value0=${value0} value1=${value1} something after :)`).toStrictEqual(
24 | parser.parse('value0=123 value1=234 something after :)'),
25 | );
26 | });
27 |
28 | it('should work with empty strings', () => {
29 | expect(pattern``).toStrictEqual(parser.parse(''));
30 | });
31 | });
32 |
33 | describe('parseRawPattern()', () => {
34 | it('should parse the string given', () => {
35 | expect(parseRawPattern('h[i] ?')).toStrictEqual(parser.parse('h[i] ?'));
36 | });
37 | });
38 |
--------------------------------------------------------------------------------
/test/pattern/Util.test.ts:
--------------------------------------------------------------------------------
1 | import type { LiteralNode, OptionalNode } from '../../src/pattern/Nodes';
2 | import { SyntaxKind } from '../../src/pattern/Nodes';
3 | import { compilePatternToRegExp, getRegExpStringForNode, potentiallyMatchesEmptyString } from '../../src/pattern/Util';
4 | import { CharacterIterator } from '../../src/util/CharacterIterator';
5 |
6 | function toLiteralNode(str: string): LiteralNode {
7 | return { kind: SyntaxKind.Literal, chars: [...new CharacterIterator(str)] };
8 | }
9 |
10 | describe('potentiallyMatchesEmptyString()', () => {
11 | it('should return false for patterns with wildcards', () => {
12 | expect(
13 | potentiallyMatchesEmptyString({
14 | requireWordBoundaryAtStart: false,
15 | requireWordBoundaryAtEnd: false,
16 | nodes: [{ kind: SyntaxKind.Wildcard }],
17 | }),
18 | ).toBeFalsy();
19 | });
20 |
21 | it('should return false for literal patterns', () => {
22 | expect(
23 | potentiallyMatchesEmptyString({
24 | requireWordBoundaryAtStart: false,
25 | requireWordBoundaryAtEnd: false,
26 | nodes: [toLiteralNode('foo')],
27 | }),
28 | ).toBeFalsy();
29 | });
30 |
31 | it('should return false for patterns composed of combo of literals and optionals', () => {
32 | expect(
33 | potentiallyMatchesEmptyString({
34 | requireWordBoundaryAtStart: false,
35 | requireWordBoundaryAtEnd: false,
36 | nodes: [toLiteralNode('foo'), { kind: SyntaxKind.Optional, childNode: toLiteralNode('bar') }],
37 | }),
38 | ).toBeFalsy();
39 | });
40 |
41 | it('should return true for patterns solely composed of optionals', () => {
42 | expect(
43 | potentiallyMatchesEmptyString({
44 | requireWordBoundaryAtStart: false,
45 | requireWordBoundaryAtEnd: false,
46 | nodes: [
47 | { kind: SyntaxKind.Optional, childNode: { kind: SyntaxKind.Wildcard } },
48 | { kind: SyntaxKind.Optional, childNode: toLiteralNode('bar') },
49 | ],
50 | }),
51 | ).toBeTruthy();
52 | });
53 |
54 | it('should return true for empty patterns', () => {
55 | expect(
56 | potentiallyMatchesEmptyString({ requireWordBoundaryAtStart: false, requireWordBoundaryAtEnd: false, nodes: [] }),
57 | ).toBeTruthy();
58 | });
59 | });
60 |
61 | describe('compilePatternToRegExp()', () => {
62 | it('should add \\b at the begin if requireWordBoundaryAtStart is true', () => {
63 | const regExp = compilePatternToRegExp({
64 | requireWordBoundaryAtStart: true,
65 | requireWordBoundaryAtEnd: false,
66 | nodes: [toLiteralNode('bye')],
67 | });
68 | expect(regExp.source).toBe('\\bbye');
69 | });
70 |
71 | it('should add a \\b at the end if requireWordBoundaryAtEnd is true', () => {
72 | const regExp = compilePatternToRegExp({
73 | requireWordBoundaryAtStart: false,
74 | requireWordBoundaryAtEnd: true,
75 | nodes: [toLiteralNode('hi')],
76 | });
77 | expect(regExp.source).toBe('hi\\b');
78 | });
79 |
80 | it('should return the regexp with dotall and global flags on', () => {
81 | const regExp = compilePatternToRegExp({
82 | requireWordBoundaryAtStart: false,
83 | requireWordBoundaryAtEnd: true,
84 | nodes: [toLiteralNode('yo'), { kind: SyntaxKind.Wildcard }],
85 | });
86 | expect(regExp.dotAll).toBeTruthy();
87 | expect(regExp.global).toBeTruthy();
88 | });
89 | });
90 |
91 | describe('getRegExpStringForNode()', () => {
92 | describe('literals', () => {
93 | it('should return the text of the string directly if it contains no special chars', () => {
94 | expect(getRegExpStringForNode(toLiteralNode('hi'))).toBe('hi');
95 | expect(getRegExpStringForNode(toLiteralNode(':D'))).toBe(':D');
96 | expect(getRegExpStringForNode(toLiteralNode('🌉'))).toBe('🌉');
97 | });
98 |
99 | it('should escape special characters with a backslash', () => {
100 | expect(getRegExpStringForNode(toLiteralNode('['))).toBe('\\[');
101 | expect(getRegExpStringForNode(toLiteralNode('.'))).toBe('\\.');
102 | expect(getRegExpStringForNode(toLiteralNode('hi?'))).toBe('hi\\?');
103 | });
104 | });
105 |
106 | describe('optionals', () => {
107 | it('should return (?:inner)?', () => {
108 | const optional: OptionalNode = { kind: SyntaxKind.Optional, childNode: toLiteralNode('hello') };
109 | expect(getRegExpStringForNode(optional)).toBe('(?:hello)?');
110 | });
111 | });
112 |
113 | describe('wildcards', () => {
114 | it('should return a dot', () => {
115 | expect(getRegExpStringForNode({ kind: SyntaxKind.Wildcard })).toBe('.');
116 | });
117 | });
118 | });
119 |
--------------------------------------------------------------------------------
/test/transformer/TransformerSet.test.ts:
--------------------------------------------------------------------------------
import { TransformerSet } from '../../src/transformer/TransformerSet';
import type { StatefulTransformer } from '../../src/transformer/Transformers';
import { createSimpleTransformer, createStatefulTransformer } from '../../src/transformer/Transformers';

it('should create multiple instances of stateful transformers', () => {
	// Spy on construction so we can count how many instances are created.
	const spy = jest.fn();
	class MyTransformer implements StatefulTransformer {
		public constructor() {
			spy();
		}

		public transform() {
			return 0;
		}

		public reset() {
			// do nothing
		}
	}

	const transformer = createStatefulTransformer(() => new MyTransformer());
	// Each TransformerSet should invoke the factory itself rather than share
	// a single instance between sets.
	new TransformerSet([transformer]);
	expect(spy).toHaveBeenCalledTimes(1);
	new TransformerSet([transformer]);
	expect(spy).toHaveBeenCalledTimes(2);
});

describe('TransformerSet#applyTo()', () => {
	it('should be a noop if no transformers were provided', () => {
		expect(new TransformerSet([]).applyTo(32)).toBe(32);
	});

	it('should work with simple transformers', () => {
		const fn = jest.fn((c: number) => c + 1);
		expect(new TransformerSet([createSimpleTransformer(fn)]).applyTo(5)).toBe(6);
		expect(fn).toHaveBeenCalledTimes(1);
		expect(fn).toHaveBeenLastCalledWith(5);
	});

	it('should work with stateful transformers', () => {
		const instance = {
			transform: jest.fn((c) => c + 1),
			reset: jest.fn(),
		};
		expect(new TransformerSet([createStatefulTransformer(() => instance)]).applyTo(7)).toBe(8);
		expect(instance.transform).toHaveBeenCalledTimes(1);
		expect(instance.transform).toHaveBeenLastCalledWith(7);
		// applyTo() alone should not reset the transformer's state.
		expect(instance.reset).not.toHaveBeenCalled();
	});

	it('should pass the transformed value to the next transformer', () => {
		const fn0 = jest.fn((c: number) => c + 1);
		const fn1 = jest.fn((c: number) => c + 2);
		expect(new TransformerSet([createSimpleTransformer(fn0), createSimpleTransformer(fn1)]).applyTo(5)).toBe(8);
		expect(fn0).toHaveBeenCalledTimes(1);
		expect(fn0).toHaveBeenLastCalledWith(5);
		expect(fn1).toHaveBeenCalledTimes(1);
		// fn1 receives fn0's output (5 + 1), not the original input.
		expect(fn1).toHaveBeenLastCalledWith(6);
	});

	it('should short circuit if a transformer returns undefined', () => {
		const fn0 = jest.fn((c: number) => c + 1);
		const fn1 = jest.fn(() => undefined);
		const fn2 = jest.fn((c: number) => c + 3);
		expect(
			new TransformerSet([
				createSimpleTransformer(fn0),
				createSimpleTransformer(fn1),
				createSimpleTransformer(fn2),
			]).applyTo(6),
		).toBeUndefined();
		expect(fn0).toHaveBeenCalledTimes(1);
		expect(fn0).toHaveBeenLastCalledWith(6);
		expect(fn1).toHaveBeenCalledTimes(1);
		expect(fn1).toHaveBeenLastCalledWith(7);
		// Transformers after the one that returned undefined must not run.
		expect(fn2).not.toHaveBeenCalled();
	});

	it('should work with a mix of different types of transformers', () => {
		const instance = {
			transform: jest.fn((c) => c + 1),
			reset: jest.fn(),
		};
		const fn0 = jest.fn((c: number) => c + 2);
		const fn1 = jest.fn((c: number) => c + 3);
		expect(
			new TransformerSet([
				createStatefulTransformer(() => instance),
				createSimpleTransformer(fn0),
				createSimpleTransformer(fn1),
			]).applyTo(5),
		).toBe(11);
		expect(instance.transform).toHaveBeenCalledTimes(1);
		expect(instance.transform).toHaveBeenLastCalledWith(5);
		expect(fn0).toHaveBeenCalledTimes(1);
		expect(fn0).toHaveBeenLastCalledWith(6);
		expect(fn1).toHaveBeenCalledTimes(1);
		expect(fn1).toHaveBeenLastCalledWith(8);
	});

	it('should apply transformers in order', () => {
		// Record the order in which each transformer runs.
		const calls: number[] = [];
		const fn0 = (c: number) => {
			calls.push(0);
			return c + 1;
		};

		const fn1 = (c: number) => {
			calls.push(1);
			return c + 2;
		};

		expect(new TransformerSet([createSimpleTransformer(fn0), createSimpleTransformer(fn1)]).applyTo(5)).toBe(8);
		expect(calls).toStrictEqual([0, 1]);
	});
});

describe('TransformerSet#resetAll()', () => {
	it('should call the reset() method of all stateful transformers once', () => {
		const instance0 = {
			transform: (c: number) => c + 1,
			reset: jest.fn(),
		};
		// Simple transformers have no reset(); they should be skipped silently.
		const fn = (c: number) => c + 1;
		const instance1 = {
			transform: (c: number) => c + 2,
			reset: jest.fn(),
		};
		const transformers = new TransformerSet([
			createStatefulTransformer(() => instance0),
			createSimpleTransformer(fn),
			createStatefulTransformer(() => instance1),
		]);
		transformers.resetAll();
		expect(instance0.reset).toHaveBeenCalledTimes(1);
		expect(instance1.reset).toHaveBeenCalledTimes(1);
	});
});
139 |
--------------------------------------------------------------------------------
/test/transformer/Transformers.test.ts:
--------------------------------------------------------------------------------
1 | import type { StatefulTransformer } from '../../src/transformer/Transformers';
2 | import {
3 | createSimpleTransformer,
4 | createStatefulTransformer,
5 | TransformerType,
6 | } from '../../src/transformer/Transformers';
7 |
8 | describe('TransformerType', () => {
9 | describe('TransformerType.Simple', () => {
10 | it('should equal 0', () => {
11 | expect(TransformerType.Simple).toBe(0);
12 | });
13 | });
14 |
15 | describe('TransformerType.Stateful', () => {
16 | it('should equal 1', () => {
17 | expect(TransformerType.Stateful).toBe(1);
18 | });
19 | });
20 | });
21 |
22 | describe('createSimpleTransformer', () => {
23 | it('should return a container holding the function given', () => {
24 | const transformer = (c: number) => c + 1;
25 | expect(createSimpleTransformer(transformer)).toStrictEqual({
26 | type: TransformerType.Simple,
27 | transform: transformer,
28 | });
29 | });
30 | });
31 |
32 | describe('createStatefulTransformer', () => {
33 | it('should return a container holding an instance produced by the factory given', () => {
34 | const statefulTransformer: StatefulTransformer = {
35 | transform: () => undefined,
36 | reset: () => {
37 | /* do nothing */
38 | },
39 | };
40 | const factory = () => statefulTransformer;
41 | expect(createStatefulTransformer(factory)).toStrictEqual({
42 | type: TransformerType.Stateful,
43 | factory,
44 | });
45 | });
46 | });
47 |
--------------------------------------------------------------------------------
/test/transformer/collapse-duplicates/index.test.ts:
--------------------------------------------------------------------------------
import { TransformerType } from '../../../src/transformer/Transformers';
import type { CollapseDuplicatesTransformerOptions } from '../../../src/transformer/collapse-duplicates/index';
import { collapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/index';
import { CollapseDuplicatesTransformer as _CollapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/transformer';
import { CharacterCode } from '../../../src/util/Char';

// Mock the underlying transformer class so these tests only verify how the
// factory function processes its options before forwarding them.
jest.mock('../../../src/transformer/collapse-duplicates/transformer');

// eslint-disable-next-line @typescript-eslint/naming-convention
const CollapseDuplicatesTransformer = _CollapseDuplicatesTransformer as jest.MockedClass<
	typeof _CollapseDuplicatesTransformer
>;

beforeEach(() => {
	CollapseDuplicatesTransformer.mockClear();
});

describe('collapseDuplicatesTransformer()', () => {
	describe('customThresholds processing', () => {
		it('should throw if any threshold was < 0', () => {
			expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['a', -1]]) })).toThrow(RangeError);
		});

		it('should not throw for threshold=0', () => {
			// Zero is a valid (if degenerate) threshold; only negatives are rejected.
			expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['a', 0]]) })).not.toThrow(RangeError);
		});

		it('should throw if the string corresponding to a threshold had length 0', () => {
			expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['', 1]]) })).toThrow(RangeError);
		});

		it('should throw if the string corresponding to a threshold was comprised of more than 1 code point', () => {
			expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['ab', 1]]) })).toThrow(RangeError);
		});

		it("should create a map of character code => threshold and pass that to CollapseDuplicateTransformer's constructor", () => {
			// Calling factory() instantiates the (mocked) transformer class.
			collapseDuplicatesTransformer({
				customThresholds: new Map([
					['a', 2],
					['z', 3],
				]),
			}).factory();
			expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1);
			expect(CollapseDuplicatesTransformer.mock.calls[0][0]).toMatchObject({
				customThresholds: new Map([
					[CharacterCode.LowerA, 2],
					[CharacterCode.LowerZ, 3],
				]),
			});
		});
	});

	it("should pass the options given to CollapseDuplicatesTransformer's constructor", () => {
		const options: CollapseDuplicatesTransformerOptions = {
			defaultThreshold: 5,
			customThresholds: new Map([
				['a', 2],
				['z', 3],
			]),
		};
		collapseDuplicatesTransformer(options).factory();
		expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1);
		expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({
			defaultThreshold: 5,
			customThresholds: new Map([
				[CharacterCode.LowerA, 2],
				[CharacterCode.LowerZ, 3],
			]),
		});
	});

	it('should use 1 as the value for defaultThreshold if not provided', () => {
		const options: CollapseDuplicatesTransformerOptions = {
			customThresholds: new Map([
				['a', 2],
				['z', 3],
			]),
		};
		collapseDuplicatesTransformer(options).factory();
		expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1);
		expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({
			defaultThreshold: 1,
			customThresholds: new Map([
				[CharacterCode.LowerA, 2],
				[CharacterCode.LowerZ, 3],
			]),
		});
	});

	it('should use an empty map as the value for customThresholds if not provided', () => {
		const options: CollapseDuplicatesTransformerOptions = {
			defaultThreshold: 1,
		};
		collapseDuplicatesTransformer(options).factory();
		expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1);
		expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({ ...options, customThresholds: new Map() });
	});

	it('should return a stateful transformer container', () => {
		const container = collapseDuplicatesTransformer();
		expect(container.type).toBe(TransformerType.Stateful);
		expect(container.factory).toStrictEqual(expect.any(Function));
	});
});
105 |
--------------------------------------------------------------------------------
/test/transformer/collapse-duplicates/transformer.test.ts:
--------------------------------------------------------------------------------
1 | import { CollapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/transformer';
2 |
3 | describe('CollapseDuplicatesTransformer#transform()', () => {
4 | describe('threshold selection', () => {
5 | it('should use the default threshold if there is no corresponding custom threshold', () => {
6 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 1, customThresholds: new Map() });
7 | expect(transformer.transform(1)).toBe(1);
8 | expect(transformer.transform(1)).toBeUndefined();
9 | });
10 |
11 | it('should use the custom threshold if one is provided', () => {
12 | const transformer = new CollapseDuplicatesTransformer({
13 | defaultThreshold: 1,
14 | customThresholds: new Map([[1, 2]]),
15 | });
16 | expect(transformer.transform(1)).toBe(1);
17 | expect(transformer.transform(1)).toBe(1);
18 | expect(transformer.transform(1)).toBeUndefined();
19 | });
20 | });
21 |
22 | it('should return undefined for characters with a threshold <= 0', () => {
23 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 0, customThresholds: new Map() });
24 | expect(transformer.transform(1)).toBeUndefined();
25 | expect(transformer.transform(2)).toBeUndefined();
26 | });
27 |
28 | it('should be a noop until the threshold is hit', () => {
29 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 5, customThresholds: new Map() });
30 | expect(transformer.transform(1)).toBe(1);
31 | expect(transformer.transform(1)).toBe(1);
32 | expect(transformer.transform(1)).toBe(1);
33 | expect(transformer.transform(1)).toBe(1);
34 | expect(transformer.transform(1)).toBe(1);
35 | expect(transformer.transform(1)).toBeUndefined();
36 | });
37 |
38 | it('should reset the threshold once a different character is seen', () => {
39 | const transformer = new CollapseDuplicatesTransformer({
40 | defaultThreshold: 1,
41 | customThresholds: new Map([
42 | [1, 2],
43 | [2, 3],
44 | ]),
45 | });
46 | expect(transformer.transform(1)).toBe(1);
47 | expect(transformer.transform(1)).toBe(1);
48 | expect(transformer.transform(1)).toBeUndefined();
49 | expect(transformer.transform(2)).toBe(2);
50 | expect(transformer.transform(2)).toBe(2);
51 | expect(transformer.transform(2)).toBe(2);
52 | expect(transformer.transform(2)).toBeUndefined();
53 | });
54 | });
55 |
56 | describe('CollapseDuplicatesTransformer#reset()', () => {
57 | it('should reset the threshold and current character', () => {
58 | const transformer = new CollapseDuplicatesTransformer({
59 | defaultThreshold: 2,
60 | customThresholds: new Map(),
61 | });
62 | expect(transformer.transform(1)).toBe(1);
63 | expect(transformer.transform(1)).toBe(1);
64 | expect(transformer.transform(1)).toBeUndefined();
65 | transformer.reset();
66 | expect(transformer.transform(1)).toBe(1);
67 | expect(transformer.transform(1)).toBe(1);
68 | expect(transformer.transform(1)).toBeUndefined();
69 | });
70 | });
71 |
--------------------------------------------------------------------------------
/test/transformer/remap-characters/index.test.ts:
--------------------------------------------------------------------------------
1 | import { TransformerType } from '../../../src/transformer/Transformers';
2 | import { remapCharactersTransformer } from '../../../src/transformer/remap-characters';
3 | import { CharacterCode } from '../../../src/util/Char';
4 |
5 | describe('remapCharactersTransformer()', () => {
6 | it('should return a simple transformer container', () => {
7 | const container = remapCharactersTransformer({ a: 'b' });
8 | expect(container.type).toBe(TransformerType.Simple);
9 | expect(typeof container.transform).toBe('function');
10 | });
11 |
12 | describe('options', () => {
13 | it('should throw if given an object where keys are comprised of more than one codepoint', () => {
14 | expect(() => remapCharactersTransformer({ ab: 'cd' })).toThrow(RangeError);
15 | });
16 |
17 | it('should throw if given an object where keys are empty strings', () => {
18 | // eslint-disable-next-line @typescript-eslint/naming-convention
19 | expect(() => remapCharactersTransformer({ '': 'cd' })).toThrow(RangeError);
20 | });
21 |
22 | it('should throw if given an map where keys are comprised of more than one codepoint', () => {
23 | expect(() => remapCharactersTransformer(new Map([['ab', 'cd']]))).toThrow(RangeError);
24 | });
25 |
26 | it('should throw if given an map where keys are empty strings', () => {
27 | expect(() => remapCharactersTransformer(new Map([['', 'cd']]))).toThrow(RangeError);
28 | });
29 | });
30 |
31 | describe('character remapping', () => {
32 | it('should map any of the equivalent characters to the transformed character (object version)', () => {
33 | const transformer = remapCharactersTransformer({ a: 'bc' });
34 | expect(transformer.transform('b'.charCodeAt(0))).toBe(CharacterCode.LowerA);
35 | expect(transformer.transform('c'.charCodeAt(0))).toBe(CharacterCode.LowerA);
36 | });
37 |
38 | it('should map any of the equivalent characters to the transformed character (map version)', () => {
39 | const transformer = remapCharactersTransformer(new Map([['a', 'bc']]));
40 | expect(transformer.transform('b'.charCodeAt(0))).toBe(CharacterCode.LowerA);
41 | expect(transformer.transform('c'.charCodeAt(0))).toBe(CharacterCode.LowerA);
42 | });
43 |
44 | it('should leave other characters unchanged', () => {
45 | const transformer = remapCharactersTransformer({ a: 'bc' });
46 | expect(transformer.transform('e'.charCodeAt(0))).toBe('e'.charCodeAt(0));
47 | expect(transformer.transform('z'.charCodeAt(0))).toBe('z'.charCodeAt(0));
48 | });
49 | });
50 | });
51 |
--------------------------------------------------------------------------------
/test/transformer/resolve-confusables/index.test.ts:
--------------------------------------------------------------------------------
1 | import { TransformerType } from '../../../src/transformer/Transformers';
2 | import { resolveConfusablesTransformer } from '../../../src/transformer/resolve-confusables';
3 | import { CharacterCode } from '../../../src/util/Char';
4 |
5 | describe('resolveConfusablesTransformer()', () => {
6 | it('should return a simple transformer container', () => {
7 | const container = resolveConfusablesTransformer();
8 | expect(container.type).toBe(TransformerType.Simple);
9 | expect(typeof container.transform).toBe('function');
10 | });
11 |
12 | describe('character remapping', () => {
13 | it('should remap relevant characters to their normalized equivalent', () => {
14 | const transformer = resolveConfusablesTransformer();
15 | expect(transformer.transform('⓵'.codePointAt(0)!)).toBe('1'.charCodeAt(0));
16 | expect(transformer.transform('❌'.codePointAt(0)!)).toBe('X'.codePointAt(0));
17 | });
18 |
19 | it('should leave other characters unchanged', () => {
20 | const transformer = resolveConfusablesTransformer();
21 | expect(transformer.transform(CharacterCode.LowerA)).toBe(CharacterCode.LowerA);
22 | });
23 | });
24 | });
25 |
--------------------------------------------------------------------------------
/test/transformer/resolve-leetspeak/index.test.ts:
--------------------------------------------------------------------------------
1 | import { TransformerType } from '../../../src/transformer/Transformers';
2 | import { resolveLeetSpeakTransformer } from '../../../src/transformer/resolve-leetspeak';
3 | import { CharacterCode } from '../../../src/util/Char';
4 |
5 | describe('resolveLeetSpeakTransformer()', () => {
6 | it('should return a simple transformer container', () => {
7 | const container = resolveLeetSpeakTransformer();
8 | expect(container.type).toBe(TransformerType.Simple);
9 | expect(typeof container.transform).toBe('function');
10 | });
11 |
12 | describe('character remapping', () => {
13 | it('should remap relevant characters to their normalized equivalent', () => {
14 | const transformer = resolveLeetSpeakTransformer();
15 | expect(transformer.transform('@'.charCodeAt(0))).toBe(CharacterCode.LowerA);
16 | expect(transformer.transform('4'.charCodeAt(0))).toBe(CharacterCode.LowerA);
17 | expect(transformer.transform('('.charCodeAt(0))).toBe('c'.charCodeAt(0));
18 | expect(transformer.transform('3'.charCodeAt(0))).toBe('e'.charCodeAt(0));
19 | expect(transformer.transform('1'.charCodeAt(0))).toBe('i'.charCodeAt(0));
20 | expect(transformer.transform('!'.charCodeAt(0))).toBe('i'.charCodeAt(0));
21 | expect(transformer.transform('|'.charCodeAt(0))).toBe('i'.charCodeAt(0));
22 | expect(transformer.transform('6'.charCodeAt(0))).toBe('g'.charCodeAt(0));
23 | expect(transformer.transform('0'.charCodeAt(0))).toBe('o'.charCodeAt(0));
24 | expect(transformer.transform('$'.charCodeAt(0))).toBe('s'.charCodeAt(0));
25 | expect(transformer.transform('5'.charCodeAt(0))).toBe('s'.charCodeAt(0));
26 | expect(transformer.transform('7'.charCodeAt(0))).toBe('t'.charCodeAt(0));
27 | expect(transformer.transform('2'.charCodeAt(0))).toBe(CharacterCode.LowerZ);
28 | });
29 |
30 | it('should leave other characters as is', () => {
31 | const transformer = resolveLeetSpeakTransformer();
32 | expect(transformer.transform('f'.charCodeAt(0))).toBe('f'.charCodeAt(0));
33 | expect(transformer.transform(CharacterCode.Backslash)).toBe(CharacterCode.Backslash);
34 | });
35 | });
36 | });
37 |
--------------------------------------------------------------------------------
/test/transformer/skip-non-alphabetic/index.test.ts:
--------------------------------------------------------------------------------
1 | import { TransformerType } from '../../../src/transformer/Transformers';
2 | import { skipNonAlphabeticTransformer } from '../../../src/transformer/skip-non-alphabetic';
3 | import { CharacterCode } from '../../../src/util/Char';
4 |
5 | describe('skipNonAlphabeticTransformer()', () => {
6 | it('should return a simple transformer container', () => {
7 | const container = skipNonAlphabeticTransformer();
8 | expect(container.type).toBe(TransformerType.Simple);
9 | expect(typeof container.transform).toBe('function');
10 | });
11 |
12 | describe('character skipping', () => {
13 | it('should leave lowercase alphabet characters as is', () => {
14 | const transformer = skipNonAlphabeticTransformer();
15 | expect(transformer.transform('c'.charCodeAt(0))).toBe('c'.charCodeAt(0));
16 | expect(transformer.transform(CharacterCode.LowerZ)).toBe(CharacterCode.LowerZ);
17 | });
18 |
19 | it('should skip uppercase alphabet characters', () => {
20 | const transformer = skipNonAlphabeticTransformer();
21 | expect(transformer.transform('D'.charCodeAt(0))).toBe('D'.charCodeAt(0));
22 | expect(transformer.transform(CharacterCode.UpperA)).toBe(CharacterCode.UpperA);
23 | });
24 |
25 | it('should return undefined (skip) for all other characters', () => {
26 | const transformer = skipNonAlphabeticTransformer();
27 | expect(transformer.transform(CharacterCode.Backslash)).toBeUndefined();
28 | expect(transformer.transform(32)).toBeUndefined();
29 | expect(transformer.transform(CharacterCode.QuestionMark)).toBeUndefined();
30 | expect(transformer.transform(CharacterCode.Zero)).toBeUndefined();
31 | });
32 | });
33 | });
34 |
--------------------------------------------------------------------------------
/test/transformer/to-ascii-lowercase/index.test.ts:
--------------------------------------------------------------------------------
1 | import { TransformerType } from '../../../src/transformer/Transformers';
2 | import { toAsciiLowerCaseTransformer } from '../../../src/transformer/to-ascii-lowercase';
3 | import { CharacterCode } from '../../../src/util/Char';
4 |
5 | describe('toAsciiLowerCaseTransformer()', () => {
6 | it('should return a simple transformer container', () => {
7 | const container = toAsciiLowerCaseTransformer();
8 | expect(container.type).toBe(TransformerType.Simple);
9 | expect(typeof container.transform).toBe('function');
10 | });
11 |
12 | describe('case folding', () => {
13 | it('should change uppercase ascii characters to lowercase', () => {
14 | const container = toAsciiLowerCaseTransformer();
15 | expect(container.transform(CharacterCode.UpperA)).toBe(CharacterCode.LowerA);
16 | });
17 |
18 | it('should leave lowercase chars unchanged', () => {
19 | const container = toAsciiLowerCaseTransformer();
20 | expect(container.transform(CharacterCode.LowerA)).toBe(CharacterCode.LowerA);
21 | });
22 |
23 | it('should leave all other characters unchanged', () => {
24 | const container = toAsciiLowerCaseTransformer();
25 | expect(container.transform(3)).toBe(3);
26 | expect(container.transform(CharacterCode.Backslash)).toBe(CharacterCode.Backslash);
27 | });
28 | });
29 | });
30 |
--------------------------------------------------------------------------------
/test/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "../tsconfig.base.json",
3 | "include": ["."],
4 | "compilerOptions": { "noEmit": true },
5 | "references": [{ "path": "../src" }]
6 | }
7 |
--------------------------------------------------------------------------------
/test/util/CharacterIterator.fuzz.test.ts:
--------------------------------------------------------------------------------
1 | import * as fc from 'fast-check';
2 | import { CharacterIterator } from '../../src/util/CharacterIterator';
3 |
4 | test('the result of the character iterator over a string s should be equal to spreading s and mapping each value into its codepoint', () => {
5 | fc.assert(
6 | fc.property(fc.string16bits(), (str) => {
7 | const expected = [...str].map((s) => s.codePointAt(0)!);
8 | expect([...new CharacterIterator(str)]).toStrictEqual(expected);
9 | }),
10 | );
11 | });
12 |
--------------------------------------------------------------------------------
/test/util/CharacterIterator.test.ts:
--------------------------------------------------------------------------------
1 | import { CharacterCode } from '../../src/util/Char';
2 | import { CharacterIterator } from '../../src/util/CharacterIterator';
3 |
4 | describe('constructor', () => {
5 | it('should default the input to an empty string if not provided', () => {
6 | expect(new CharacterIterator(undefined).input).toBe('');
7 | });
8 |
9 | it('should set the input to that provided if not undefined', () => {
10 | expect(new CharacterIterator('hello').input).toBe('hello');
11 | });
12 |
13 | it('should default the position to -1', () => {
14 | expect(new CharacterIterator('world').position).toBe(-1);
15 | });
16 | });
17 |
18 | describe('CharacterIterator#setInput()', () => {
19 | it('should reset the position', () => {
20 | const iter = new CharacterIterator('world');
21 | iter.next();
22 | expect(iter.setInput('hello').position).toBe(-1);
23 | });
24 |
25 | it('should set the input', () => {
26 | const iter = new CharacterIterator('world');
27 | iter.next();
28 | expect(iter.setInput('hello').input).toBe('hello');
29 | });
30 | });
31 |
32 | describe('CharacterIterator#reset()', () => {
33 | it('should reset the position', () => {
34 | const iter = new CharacterIterator('world');
35 | iter.next();
36 | iter.reset();
37 | expect(iter.position).toBe(-1);
38 | expect(iter.next()).toStrictEqual({ done: false, value: 'w'.charCodeAt(0) });
39 | });
40 |
41 | it('should not reset the input', () => {
42 | const iter = new CharacterIterator('hello');
43 | iter.next();
44 | iter.reset();
45 | expect(iter.input).toBe('hello');
46 | });
47 | });
48 |
49 | describe('CharacterIterator#next()', () => {
50 | it('should return done: true when done', () => {
51 | const iter = new CharacterIterator();
52 | expect(iter.next()).toStrictEqual({ done: true, value: undefined });
53 | });
54 |
55 | it('should return the next character code unmodified if it does not form a surrogate pair', () => {
56 | const iter = new CharacterIterator('h');
57 | expect(iter.next()).toStrictEqual({ done: false, value: 'h'.charCodeAt(0) });
58 | });
59 |
60 | it('should return the next character despite it being a high surrogate if it is the last character', () => {
61 | const highSurrogate = '🌉'.charCodeAt(0);
62 | const iter = new CharacterIterator(String.fromCharCode(highSurrogate));
63 | expect(iter.next()).toStrictEqual({ done: false, value: highSurrogate });
64 | });
65 |
66 | it('should return the next character despite it being a high surrogate if the character after it is not a low surrogate', () => {
67 | const highSurrogate = '🌉'.charCodeAt(0);
68 | const iter = new CharacterIterator(String.fromCharCode(highSurrogate, CharacterCode.LowerA));
69 | expect(iter.next()).toStrictEqual({ done: false, value: highSurrogate });
70 | expect(iter.next()).toStrictEqual({ done: false, value: CharacterCode.LowerA });
71 | });
72 |
73 | it('should combine valid surrogate pairs into its corresponding code point', () => {
74 | const iter = new CharacterIterator('🌉abc');
75 | expect(iter.next()).toStrictEqual({ done: false, value: '🌉abc'.codePointAt(0) });
76 | expect(iter.next()).toStrictEqual({ done: false, value: CharacterCode.LowerA });
77 | });
78 | });
79 |
80 | describe('CharacterIterator#position', () => {
81 | it('should start as -1', () => {
82 | expect(new CharacterIterator().position).toBe(-1);
83 | });
84 |
85 | it('should be the start position of the last character read (no surrogate pairs)', () => {
86 | const iter = new CharacterIterator('test');
87 | iter.next();
88 | expect(iter.position).toBe(0);
89 | });
90 |
91 | it('should be the start position of the last character read (with surrogate pairs)', () => {
92 | const iter = new CharacterIterator('🌉abc');
93 | iter.next();
94 | expect(iter.position).toBe(0);
95 | iter.next();
96 | expect(iter.position).toBe(2); // surrogate pair takes up 2 chars
97 | });
98 |
99 | it('should revert to -1 after resetting', () => {
100 | const iter = new CharacterIterator('hello');
101 | iter.next();
102 | iter.reset();
103 | expect(iter.position).toBe(-1);
104 | });
105 | });
106 |
107 | describe('CharacterIterator#lastWidth', () => {
108 | it('should start as 0', () => {
109 | expect(new CharacterIterator().lastWidth).toBe(0);
110 | });
111 |
112 | it('should be 2 if the last character consumed was a surrogate pair', () => {
113 | const iter = new CharacterIterator('🌉abc');
114 | iter.next();
115 | expect(iter.lastWidth).toBe(2);
116 | });
117 |
118 | it('should be 1 if the last character consumed was not a surrogate pair', () => {
119 | const iter = new CharacterIterator('hello');
120 | iter.next();
121 | expect(iter.lastWidth).toBe(1);
122 | });
123 |
124 | it('should revert to 0 after resetting', () => {
125 | const iter = new CharacterIterator('asdf');
126 | iter.next();
127 | iter.reset();
128 | expect(iter.lastWidth).toBe(0);
129 | });
130 | });
131 |
132 | describe('CharacterIterator#done', () => {
133 | it('should be true for empty strings', () => {
134 | expect(new CharacterIterator().done).toBeTruthy();
135 | });
136 |
137 | it('should be false if the input has not been completely consumed', () => {
138 | expect(new CharacterIterator('hh').done).toBeFalsy();
139 | });
140 |
141 | it('should be true if all input has been consumed', () => {
142 | const iter = new CharacterIterator('hello');
143 | for (let i = 0; i < 5; i++) iter.next();
144 | expect(iter.done).toBeTruthy();
145 | });
146 |
147 | it('should be false after resetting', () => {
148 | const iter = new CharacterIterator('hello');
149 | for (let i = 0; i < 5; i++) iter.next();
150 | expect(iter.done).toBeTruthy();
151 | iter.reset();
152 | expect(iter.done).toBeFalsy();
153 | });
154 | });
155 |
156 | describe('iterating over it', () => {
157 | it('should be iterable', () => {
158 | const iter = new CharacterIterator('hello!');
159 | const chars: number[] = [];
160 | for (const char of iter) {
161 | chars.push(char);
162 | }
163 |
164 | expect(String.fromCodePoint(...chars)).toBe('hello!');
165 | });
166 | });
167 |
--------------------------------------------------------------------------------
/test/util/Interval.test.ts:
--------------------------------------------------------------------------------
1 | import { compareIntervals } from '../../src/util/Interval';
2 |
3 | describe('compareIntervals()', () => {
4 | it("should return -1 if the first interval's lower bound is less than the second's", () => {
5 | expect(compareIntervals(1, 5, 2, 3)).toBe(-1);
6 | });
7 |
8 | it("should return 1 if the second interval's lower bound is less than the first's", () => {
9 | expect(compareIntervals(2, 3, 1, 5)).toBe(1);
10 | });
11 |
12 | it("should return -1 if the first interval's upper bound is less than the second's", () => {
13 | expect(compareIntervals(2, 3, 2, 5)).toBe(-1);
14 | });
15 |
16 | it("should return 1 if the second interval's upper bound is less than the first's", () => {
17 | expect(compareIntervals(2, 5, 2, 3)).toBe(1);
18 | });
19 |
20 | it('should return 0 if the first and second intervals are equal', () => {
21 | expect(compareIntervals(1, 5, 1, 5)).toBe(0);
22 | });
23 | });
24 |
--------------------------------------------------------------------------------
/tsconfig.base.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "strict": true,
4 | "noUnusedLocals": true,
5 | "noImplicitAny": true,
6 | "alwaysStrict": true,
7 | "pretty": true,
8 | "module": "CommonJS",
9 | "moduleResolution": "Node",
10 | "target": "ES2020",
11 | "lib": ["ES2020"]
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/tsconfig.eslint.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "./tsconfig.base.json",
3 | "include": ["src", "test", "examples", "jest.config.ts"]
4 | }
5 |
--------------------------------------------------------------------------------
/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "readme": "none",
3 | "entryPoints": ["src/index.ts"],
4 | "out": "docs/reference",
5 | "tsconfig": "src/tsconfig.json",
6 | "excludePrivate": true
7 | }
8 |
--------------------------------------------------------------------------------