├── .commitlintrc.json ├── .eslintignore ├── .eslintrc.yml ├── .gitattributes ├── .github ├── CODE_OF_CONDUCT.md ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ └── feature_request.yml ├── PULL_REQUEST_TEMPLATE.md ├── problem-matchers │ └── tsc.json └── workflows │ ├── codeql-analysis.yml │ └── continuous-integration.yml ├── .gitignore ├── .prettierignore ├── .prettierrc.json ├── .vscode └── settings.json ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── docs ├── README.md ├── guide │ ├── README.md │ ├── censoring-text.md │ ├── datasets.md │ ├── matchers.md │ ├── patterns.md │ └── transformers.md └── reference │ ├── .nojekyll │ ├── README.md │ ├── classes │ ├── DataSet.md │ ├── ParserError.md │ ├── PhraseBuilder.md │ ├── RegExpMatcher.md │ └── TextCensor.md │ ├── enums │ └── SyntaxKind.md │ └── interfaces │ ├── BlacklistedTerm.md │ ├── BoundaryAssertionNode.md │ ├── CollapseDuplicatesTransformerOptions.md │ ├── LiteralNode.md │ ├── MatchPayload.md │ ├── Matcher.md │ ├── OptionalNode.md │ ├── ParsedPattern.md │ ├── PhraseContainer.md │ ├── ProcessedCollapseDuplicatesTransformerOptions.md │ ├── RegExpMatcherOptions.md │ └── WildcardNode.md ├── examples ├── extending-datasets.js └── repl.js ├── jest.config.ts ├── package.json ├── pnpm-lock.yaml ├── renovate.json ├── scripts ├── english-words.txt └── search-words.js ├── src ├── censor │ ├── BuiltinStrategies.ts │ └── TextCensor.ts ├── dataset │ └── DataSet.ts ├── index.ts ├── matcher │ ├── BlacklistedTerm.ts │ ├── IntervalCollection.ts │ ├── MatchPayload.ts │ ├── Matcher.ts │ └── regexp │ │ └── RegExpMatcher.ts ├── pattern │ ├── Nodes.ts │ ├── Parser.ts │ ├── ParserError.ts │ ├── Pattern.ts │ └── Util.ts ├── preset │ └── english.ts ├── transformer │ ├── TransformerSet.ts │ ├── Transformers.ts │ ├── collapse-duplicates │ │ ├── index.ts │ │ └── transformer.ts │ ├── remap-characters │ │ └── index.ts │ ├── resolve-confusables │ │ ├── confusables.ts │ │ └── index.ts │ ├── resolve-leetspeak │ │ ├── 
dictionary.ts │ │ └── index.ts │ ├── skip-non-alphabetic │ │ └── index.ts │ └── to-ascii-lowercase │ │ └── index.ts ├── tsconfig.json └── util │ ├── Char.ts │ ├── CharacterIterator.ts │ └── Interval.ts ├── test ├── censor │ ├── BuiltinStrategies.test.ts │ └── TextCensor.test.ts ├── dataset │ └── DataSet.test.ts ├── jest.setup.ts ├── matcher │ ├── BlacklistedTerm.test.ts │ ├── IntervalCollection.test.ts │ ├── MatchPayload.test.ts │ └── regexp │ │ └── RegExpMatcher.test.ts ├── pattern │ ├── Parser.test.ts │ ├── ParserError.test.ts │ ├── Pattern.test.ts │ └── Util.test.ts ├── transformer │ ├── TransformerSet.test.ts │ ├── Transformers.test.ts │ ├── collapse-duplicates │ │ ├── index.test.ts │ │ └── transformer.test.ts │ ├── remap-characters │ │ └── index.test.ts │ ├── resolve-confusables │ │ └── index.test.ts │ ├── resolve-leetspeak │ │ └── index.test.ts │ ├── skip-non-alphabetic │ │ └── index.test.ts │ └── to-ascii-lowercase │ │ └── index.test.ts ├── tsconfig.json └── util │ ├── Char.test.ts │ ├── CharacterIterator.fuzz.test.ts │ ├── CharacterIterator.test.ts │ └── Interval.test.ts ├── tsconfig.base.json ├── tsconfig.eslint.json └── typedoc.json /.commitlintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["@commitlint/config-angular"], 3 | "rules": { 4 | "scope-case": [2, "always", "lowerCase"], 5 | "type-enum": [ 6 | 2, 7 | "always", 8 | ["chore", "build", "ci", "docs", "feat", "fix", "perf", "refactor", "revert", "style", "test"] 9 | ] 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | coverage 4 | scripts 5 | examples 6 | -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | --- 2 | extends: 3 | - 
plugin:@typescript-eslint/recommended-type-checked 4 | - plugin:jest/recommended 5 | - plugin:jest/style 6 | 7 | plugins: 8 | - jest 9 | - prettier 10 | 11 | parserOptions: 12 | project: 13 | - tsconfig.eslint.json 14 | - src/tsconfig.json 15 | - test/tsconfig.json 16 | 17 | rules: 18 | prettier/prettier: 19 | - error 20 | - endOfLine: auto 21 | no-duplicate-imports: off 22 | curly: 23 | - error 24 | - multi-line 25 | import/extensions: off 26 | id-length: off 27 | tsdoc/syntax: off 28 | '@typescript-eslint/restrict-plus-operands': off 29 | '@typescript-eslint/no-explicit-any': off 30 | '@typescript-eslint/no-unsafe-enum-comparison': off 31 | '@typescript-eslint/consistent-type-definitions': 32 | - error 33 | - interface 34 | '@typescript-eslint/prefer-literal-enum-member': 35 | - error 36 | - allowBitwiseExpressions: true 37 | '@typescript-eslint/no-use-before-define': off 38 | # The following rule conflicts with Prettier in certain cases. 39 | # Also see https://github.com/typescript-eslint/typescript-eslint/issues/1824. 
40 | '@typescript-eslint/indent': off 41 | '@typescript-eslint/no-misused-promises': 42 | - error 43 | - checksVoidReturn: false 44 | '@typescript-eslint/no-unnecessary-condition': 45 | - error 46 | - allowConstantLoopConditions: true 47 | '@typescript-eslint/no-throw-literal': off 48 | '@typescript-eslint/naming-convention': 49 | - error 50 | - selector: 51 | - enumMember 52 | - typeAlias 53 | - interface 54 | - enum 55 | - class 56 | format: 57 | - PascalCase 58 | leadingUnderscore: forbid 59 | trailingUnderscore: forbid 60 | 61 | - selector: 62 | - method 63 | - accessor 64 | - parameterProperty 65 | format: 66 | - strictCamelCase 67 | 68 | - selector: 69 | - property 70 | format: 71 | - strictCamelCase 72 | - PascalCase 73 | filter: 74 | regex: '\d+' 75 | match: false 76 | leadingUnderscore: allow 77 | trailingUnderscore: forbid 78 | 79 | - selector: typeParameter 80 | format: 81 | - PascalCase 82 | 83 | - selector: variable 84 | format: 85 | - strictCamelCase 86 | - UPPER_CASE 87 | leadingUnderscore: allow 88 | trailingUnderscore: forbid 89 | 90 | - selector: function 91 | format: 92 | - strictCamelCase 93 | leadingUnderscore: forbid 94 | trailingUnderscore: forbid 95 | '@typescript-eslint/member-ordering': 96 | - error 97 | - default: 98 | - public-static-field 99 | - protected-static-field 100 | - private-static-field 101 | - static-field 102 | - public-static-method 103 | - protected-static-method 104 | - private-static-method 105 | - static-method 106 | - signature 107 | - public-instance-field 108 | - protected-instance-field 109 | - private-instance-field 110 | - instance-field 111 | - public-constructor 112 | - protected-constructor 113 | - private-constructor 114 | - constructor 115 | - public-instance-method 116 | - protected-instance-method 117 | - private-instance-method 118 | - instance-method 119 | '@typescript-eslint/consistent-type-imports': error 120 | -------------------------------------------------------------------------------- 
/.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language 12 | - Being respectful of differing viewpoints and experiences 13 | - Gracefully accepting constructive criticism 14 | - Focusing on what is best for the community 15 | - Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | - Trolling, insulting/derogatory comments, and personal or political attacks 21 | - Public or private harassment 22 | - Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [jo3.l.dev@outlook.com](mailto:jo3.l.dev@outlook.com). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [https://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: https://contributor-covenant.org 46 | [version]: https://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug report 2 | description: Create an issue about a possible bug 3 | title: 'bug: ' 4 | labels: [bug] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Please first [check through existing issues](https://github.com/jo3-l/obscenity/issues) in case your problem 10 | has already been raised. 11 | - type: textarea 12 | attributes: 13 | label: Expected behavior 14 | description: What were you expecting to happen? 15 | placeholder: Using foo, I expected bar to happen. 16 | validations: 17 | required: true 18 | - type: textarea 19 | attributes: 20 | label: Actual behavior 21 | description: What happened instead? 22 | placeholder: Instead, baz happened. 23 | validations: 24 | required: true 25 | - type: markdown 26 | attributes: 27 | value: | 28 | Including more detail in your bug report will expedite the review 29 | process. A minimal reproducible example is preferred. 30 | - type: textarea 31 | attributes: 32 | label: Minimal reproducible example 33 | description: A [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) that demonstrates the problem. 34 | placeholder: | 35 | import { RegExpMatcher } from 'obscenity'; 36 | // ... 37 | render: typescript 38 | - type: textarea 39 | attributes: 40 | label: Steps to reproduce 41 | description: Provide steps to reproduce the problem. 42 | placeholder: | 43 | 1. Run foo 44 | 2. ... 45 | 3. 
See error 46 | - type: textarea 47 | attributes: 48 | label: Additional context 49 | description: | 50 | Links? References? Anything that will give us more context about the issue you are encountering? 51 | 52 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. 53 | - type: textarea 54 | attributes: 55 | label: Node.js version 56 | description: What version of Node are you using? 57 | placeholder: v16.7.0 58 | validations: 59 | required: true 60 | - type: textarea 61 | attributes: 62 | label: Obscenity version 63 | description: | 64 | What version of the library are you using? 65 | 66 | Tip: You can get this using 'npm ls obscenity', 'yarn list obscenity' or 'pnpm ls obscenity' (depending on which package manager you're using). 67 | placeholder: v0.1.0 68 | validations: 69 | required: true 70 | - type: checkboxes 71 | attributes: 72 | label: Priority 73 | description: What should the priority of this issue be? 74 | options: 75 | - label: Low 76 | - label: Medium 77 | - label: High 78 | validations: 79 | required: true 80 | - type: checkboxes 81 | attributes: 82 | label: Terms 83 | description: 'By submitting this issue, you confirm the following:' 84 | options: 85 | - label: I agree to follow the project's Code of Conduct. 86 | required: true 87 | - label: I have searched existing issues for similar reports. 88 | required: true 89 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for the repository 3 | title: 'request: ' 4 | labels: [enhancement] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | Please first [check through existing issues](https://github.com/jo3-l/obscenity/issues) in case your request 10 | has already been raised. 
11 | - type: textarea 12 | attributes: 13 | label: Description 14 | description: Is your feature request related to a problem? Please describe. 15 | validations: 16 | required: true 17 | - type: textarea 18 | attributes: 19 | label: Solution 20 | description: Explain the solution you would like to see. Please also provide alternatives to your solution. 21 | validations: 22 | required: true 23 | - type: checkboxes 24 | attributes: 25 | label: Code of Conduct 26 | description: By submitting this issue, you agree to follow our [Code of Conduct](../CODE_OF_CONDUCT.md) 27 | options: 28 | - label: I agree to follow this project's Code of Conduct. 29 | required: true 30 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | **Type of change:** 2 | 3 | - [ ] Refactor 4 | - [ ] Performance improvement 5 | - [ ] New feature 6 | - [ ] Bug fix 7 | - [ ] Other (please describe): 8 | 9 | **Please describe the changes this PR makes and why it should be merged:** 10 | 11 | **Status:** 12 | 13 | - [ ] I've added/modified unit tests relevant to my change / not needed 14 | - [ ] This PR contains breaking changes 15 | - [ ] This PR doesn't include changes to the code 16 | -------------------------------------------------------------------------------- /.github/problem-matchers/tsc.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "tsc", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\s+\\d+\\>)?([^\\s].*)\\((\\d+),(\\d+)\\)\\s*:\\s+(error|warning|info)\\s+(\\w{1,2}\\d+)\\s*:\\s*(.*)$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "severity": 4, 12 | "code": 5, 13 | "message": 6 14 | } 15 | ] 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: 
-------------------------------------------------------------------------------- 1 | name: CodeQL Analysis 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: '0 0 * * 1' 8 | 9 | jobs: 10 | codeql-build: 11 | name: CodeQL analysis 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | 18 | - name: Initialize CodeQL 19 | uses: github/codeql-action/init@v3 20 | 21 | - name: Auto-build 22 | uses: github/codeql-action/autobuild@v3 23 | 24 | - name: Perform CodeQL analysis 25 | uses: github/codeql-action/analyze@v3 26 | -------------------------------------------------------------------------------- /.github/workflows/continuous-integration.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | run-eslint: 9 | name: Run ESLint 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v4 15 | 16 | - name: Install pnpm 17 | uses: pnpm/action-setup@v4 18 | 19 | - name: Install Node.js v22 20 | uses: actions/setup-node@v4 21 | with: 22 | node-version: 22 23 | cache: pnpm 24 | 25 | - name: Install dependencies 26 | run: pnpm install --frozen-lockfile 27 | 28 | - name: Run ESLint 29 | run: pnpm lint 30 | 31 | unit-tests: 32 | name: Run unit tests 33 | runs-on: ubuntu-latest 34 | strategy: 35 | matrix: 36 | node-version: [18.x, 20.x, 22.x] 37 | 38 | steps: 39 | - name: Checkout repository 40 | uses: actions/checkout@v4 41 | 42 | - name: Install pnpm 43 | uses: pnpm/action-setup@v4 44 | 45 | - name: Install Node.js 46 | uses: actions/setup-node@v4 47 | with: 48 | node-version: ${{ matrix.node-version }} 49 | cache: pnpm 50 | 51 | - name: Install dependencies 52 | run: pnpm install --frozen-lockfile 53 | 54 | - name: Run unit tests 55 | run: pnpm test:ci 56 | 57 | - name: Upload coverage 58 | uses: codecov/codecov-action@v5 59 | 60 | build-project: 61 | name: 
Compile source code 62 | runs-on: ubuntu-latest 63 | 64 | steps: 65 | - name: Checkout repository 66 | uses: actions/checkout@v4 67 | 68 | - name: Install pnpm 69 | uses: pnpm/action-setup@v4 70 | 71 | - name: Install Node.js v22 72 | uses: actions/setup-node@v4 73 | with: 74 | node-version: 22 75 | cache: pnpm 76 | 77 | - name: Install dependencies 78 | run: pnpm install --frozen-lockfile 79 | 80 | - name: Register problem matcher 81 | run: echo "##[add-matcher].github/problem-matchers/tsc.json" 82 | 83 | - name: Compile TypeScript 84 | run: pnpm build 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Deps 2 | node_modules 3 | 4 | # Build artifacts 5 | dist 6 | 7 | # Coverage 8 | coverage 9 | *.lcov 10 | .nyc_output 11 | 12 | # Logs 13 | pnpm-debug.log* 14 | *.log 15 | 16 | # IDE 17 | .idea -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | -------------------------------------------------------------------------------- /.prettierrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 120, 3 | "useTabs": true, 4 | "singleQuote": true, 5 | "quoteProps": "as-needed", 6 | "trailingComma": "all", 7 | "endOfLine": "auto", 8 | "overrides": [ 9 | { 10 | "files": ["README.md"], 11 | "options": { 12 | "printWidth": 80 13 | } 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescript.tsdk": "node_modules/typescript/lib" 3 | } -------------------------------------------------------------------------------- /CHANGELOG.md: 
-------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. 4 | 5 | ### [0.4.3](https://github.com/jo3-l/obscenity/compare/v0.4.2...v0.4.3) (2025-01-26) 6 | 7 | 8 | ### Bug Fixes 9 | 10 | * **preset/english:** match 'shit' at end of word ([0299b49](https://github.com/jo3-l/obscenity/commit/0299b4978dec6d218a4e004fe20962a79500fe7c)), closes [#47](https://github.com/jo3-l/obscenity/issues/47) 11 | 12 | ### [0.4.2](https://github.com/jo3-l/obscenity/compare/v0.4.1...v0.4.2) (2025-01-18) 13 | 14 | 15 | ### Features 16 | 17 | * add more characters to leet transformer ([#78](https://github.com/jo3-l/obscenity/issues/78)) ([fa673e6](https://github.com/jo3-l/obscenity/commit/fa673e66226e13388401274610e7d1bd0801ade0)) 18 | 19 | 20 | ### Bug Fixes 21 | 22 | * **censor:** don't generate the same character twice in a row ([#85](https://github.com/jo3-l/obscenity/issues/85)) ([58f2715](https://github.com/jo3-l/obscenity/commit/58f271556aa878e619457054f8a2f423e8b574ca)), closes [#82](https://github.com/jo3-l/obscenity/issues/82) 23 | * **preset/english:** add word boundary to 'shit' pattern ([9554e7c](https://github.com/jo3-l/obscenity/commit/9554e7cc7b796f64a80baa272ed3e49ad03466a3)), closes [#93](https://github.com/jo3-l/obscenity/issues/93) 24 | * **preset/english:** whitelist "fick" ([#88](https://github.com/jo3-l/obscenity/issues/88)) ([40f66fb](https://github.com/jo3-l/obscenity/commit/40f66fb17524f49b1e4be6a2fe1037f3e1b468c2)) 25 | 26 | ### [0.4.1](https://github.com/jo3-l/obscenity/compare/v0.4.0...v0.4.1) (2024-12-03) 27 | 28 | 29 | ### Bug Fixes 30 | 31 | * **preset/english:** add "fickle" to whitelist ([#87](https://github.com/jo3-l/obscenity/issues/87)) ([da754da](https://github.com/jo3-l/obscenity/commit/da754da8d42cf4b36534141b2ceafaa4810b99b5)) 32 
| * **preset/english:** remove erroneous patterns for `dick` ([e43d502](https://github.com/jo3-l/obscenity/commit/e43d50260d8f3c55374bd1da65be0dff33a1fd6d)), closes [#86](https://github.com/jo3-l/obscenity/issues/86) 33 | 34 | ## [0.4.0](https://github.com/jo3-l/obscenity/compare/v0.3.1...v0.4.0) (2024-08-02) 35 | 36 | 37 | ### ⚠ BREAKING CHANGES 38 | 39 | * **regexp-matcher:** Passing an empty whitelisted term to the RegExpMatcher will result in a runtime error. 40 | 41 | This was unsupported previously and likely did not work correctly. Make it a real error. 42 | 43 | ### Bug Fixes 44 | 45 | * **regexp-matcher:** advance index correctly in whitelist matcher ([ebf95ad](https://github.com/jo3-l/obscenity/commit/ebf95add62be8297f693ca6d8aafefc10afc1a8b)), closes [#49](https://github.com/jo3-l/obscenity/issues/49) 46 | * **regexp-matcher:** correctly remap to original indices in all cases ([3a49579](https://github.com/jo3-l/obscenity/commit/3a49579f3c242d3e159e88707df090e3f6dc0121)), closes [#71](https://github.com/jo3-l/obscenity/issues/71) 47 | * **regexp-matcher:** reject empty whitelist terms ([9a46113](https://github.com/jo3-l/obscenity/commit/9a461130b98920e22d5acf92650146ae48d2226b)) 48 | 49 | ### [0.3.1](https://github.com/jo3-l/obscenity/compare/v0.3.0...v0.3.1) (2024-07-17) 50 | 51 | ## [0.3.0](https://github.com/jo3-l/obscenity/compare/v0.2.2...v0.3.0) (2024-07-17) 52 | 53 | 54 | ### ⚠ BREAKING CHANGES 55 | 56 | * The library no longer exports a version constant. 
57 | 58 | * drop version constant ([2810674](https://github.com/jo3-l/obscenity/commit/2810674de20d82d7372c617d2e8ef76e911f27ad)) 59 | 60 | ### [0.2.2](https://github.com/jo3-l/obscenity/compare/v0.2.1...v0.2.2) (2024-07-17) 61 | 62 | 63 | ### Features 64 | 65 | * **english-preset:** add more blacklisted terms ([#50](https://github.com/jo3-l/obscenity/issues/50)) ([4653de5](https://github.com/jo3-l/obscenity/commit/4653de51e63bd3457daca57316c2b2c851752072)) 66 | 67 | 68 | ### Bug Fixes 69 | 70 | * **english-preset:** whitelist 'kung-fu' ([d60b4f4](https://github.com/jo3-l/obscenity/commit/d60b4f4b766592785ba7c9c51d6d0607c5f26c57)), closes [#67](https://github.com/jo3-l/obscenity/issues/67) 71 | 72 | ### [0.2.1](https://github.com/jo3-l/obscenity/compare/v0.2.0...v0.2.1) (2024-03-03) 73 | 74 | 75 | ### Features 76 | 77 | * **english-preset:** add more blacklisted terms ([#50](https://github.com/jo3-l/obscenity/issues/50)) ([c189595](https://github.com/jo3-l/obscenity/commit/c189595b09554899aeead3dd070d36f8f3269150)) 78 | 79 | ## [0.2.0](https://github.com/jo3-l/obscenity/compare/v0.1.4...v0.2.0) (2024-01-05) 80 | 81 | 82 | ### ⚠ BREAKING CHANGES 83 | 84 | * **english-preset:** Using the default English preset, Obscenity will no longer strip non-alphabetic characters from the input text before matching. 85 | 86 | This addresses a class of egregious false negatives in previous versions (see #23), but introduces a regression where cases such as 'f u c k' (with the space) will no longer be detected by default. We expect to provide a more comprehensive fix in the next minor release. 87 | 88 | If desired, it remains possible to revert to the previous behavior by providing a custom set of transformers to the matcher. 89 | * **matchers:** The NfaMatcher class has been removed. Use the RegExpMatcher instead. 
90 | 91 | ### Features 92 | 93 | * **english-preset:** blacklist 'shit' by default ([b0d90aa](https://github.com/jo3-l/obscenity/commit/b0d90aa4b7dd6d15a2105490f1d2b0c87e58bdcf)), closes [#47](https://github.com/jo3-l/obscenity/issues/47) 94 | 95 | 96 | ### Bug Fixes 97 | 98 | * **english-preset:** don't include skip-non-alphabetic transformer ([620c721](https://github.com/jo3-l/obscenity/commit/620c721662c3ddd8d8ca8838861b9c4ba3ea66e7)), closes [#23](https://github.com/jo3-l/obscenity/issues/23) [#46](https://github.com/jo3-l/obscenity/issues/46) 99 | * **english-preset:** remove extraneous patterns for n-word ([e135be5](https://github.com/jo3-l/obscenity/commit/e135be58510149db9b678801a2e6e3468b3bd4bb)), closes [#48](https://github.com/jo3-l/obscenity/issues/48) 100 | * **pkg:** ensure types resolve in ESM ([718da8a](https://github.com/jo3-l/obscenity/commit/718da8a7399c0dcf948fbe8041714ad6d61c9f73)), closes [#44](https://github.com/jo3-l/obscenity/issues/44) 101 | 102 | 103 | * **matchers:** remove NfaMatcher ([b69c21d](https://github.com/jo3-l/obscenity/commit/b69c21d178ac5e3270fd35d2b876263045a67d81)) 104 | 105 | ### [0.1.4](https://github.com/jo3-l/obscenity/compare/v0.1.1...v0.1.4) (2023-06-06) 106 | 107 | ### Bug Fixes 108 | 109 | - **matchers:** gracefully handle empty patterns ([#31](https://github.com/jo3-l/obscenity/issues/31)) ([79cfa63](https://github.com/jo3-l/obscenity/commit/79cfa630c964be79d1dc16eb0e5d65af4d68e7ab)) 110 | 111 | ### 0.1.1, 0.1.2, 0.1.3 112 | 113 | Versions skipped due to temporary issue with release workflow. 114 | 115 | ## 0.1.0 (2021-08-27) 116 | 117 | Initial release. 118 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | If you wish to contribute to Obscenity, feel free to fork the repository and submit a pull request. 
We use [ESLint](https://eslint.org/) and [Prettier](https://prettier.io/) to enforce a consistent code style and catch possible issues; setting up relevant plugins for your editor of choice is highly recommended. 4 | 5 | ## Setup 6 | 7 | **Prerequisites:** Node.js (preferably latest version, but any version >= 12 will work), and [pnpm](https://pnpm.io/) for managing packages. 8 | 9 | 1. Fork & clone the main repository. 10 | 2. Create a new branch for your changes: `git checkout -b feat/my-feature`. 11 | 3. Run `pnpm install` to install all dependencies. 12 | 4. Make your changes. 13 | 5. Run `pnpm lint` and `pnpm style` to lint and format the code. Then, run `pnpm test` to make sure all the tests are still passing after your change. 14 | 6. Commit your changes (make sure you follow our commit convention, which is based off [Angular's commit message guidelines](https://github.com/conventional-changelog/conventional-changelog/tree/master/packages/conventional-changelog-angular)). 15 | 7. Submit a [pull request](https://github.com/jo3-l/obscenity/pulls). 16 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | Copyright © 2021 Joe L. 4 | 5 | Permission is hereby granted, free of charge, to any person 6 | obtaining a copy of this software and associated documentation 7 | files (the “Software”), to deal in the Software without 8 | restriction, including without limitation the rights to use, 9 | copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the 11 | Software is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 19 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 21 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 22 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 24 | OTHER DEALINGS IN THE SOFTWARE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Obscenity 2 | 3 | > Robust, extensible profanity filter for NodeJS. 4 | 5 | Build status 6 | Codecov status 7 | npm version 8 | Language 9 | License 10 | 11 | ## Why Obscenity? 12 | 13 | - **Accurate:** Though Obscenity is far from perfect (as with all profanity filters), it makes reducing false positives as simple as possible: adding whitelisted phrases is as easy as adding a new string to an array, and using word boundaries is equally simple. 14 | - **Robust:** Obscenity's transformer-based design allows it to match on variants of phrases other libraries are typically unable to, e.g. `fuuuuuuuckkk`, `ʃṳ𝒸𝗄`, `wordsbeforefuckandafter` and so on. There's no need to manually write out all the variants either: just adding the pattern `fuck` will match all of the cases above by default. 15 | - **Extensible:** With Obscenity, you aren't locked into anything - removing phrases that you don't agree with from the default set of words is trivial, as is disabling any transformations you don't like (perhaps you feel that leet-speak decoding is too error-prone for you). 
16 | 17 | ## Installation 18 | 19 | ```shell 20 | $ npm install obscenity 21 | $ yarn add obscenity 22 | $ pnpm add obscenity 23 | ``` 24 | 25 | ## Example usage 26 | 27 | First, import Obscenity: 28 | 29 | ```javascript 30 | const { 31 | RegExpMatcher, 32 | TextCensor, 33 | englishDataset, 34 | englishRecommendedTransformers, 35 | } = require('obscenity'); 36 | ``` 37 | 38 | Or, in TypeScript/ESM: 39 | 40 | ```typescript 41 | import { 42 | RegExpMatcher, 43 | TextCensor, 44 | englishDataset, 45 | englishRecommendedTransformers, 46 | } from 'obscenity'; 47 | ``` 48 | 49 | Now, we can create a new matcher using the English preset. 50 | 51 | ```javascript 52 | const matcher = new RegExpMatcher({ 53 | ...englishDataset.build(), 54 | ...englishRecommendedTransformers, 55 | }); 56 | ``` 57 | 58 | Now, we can use our matcher to search for profanities in the text. Here's two examples of what you can do: 59 | 60 | **Check if there are any matches in some text:** 61 | 62 | ```javascript 63 | if (matcher.hasMatch('fuck you')) { 64 | console.log('The input text contains profanities.'); 65 | } 66 | // The input text contains profanities. 67 | ``` 68 | 69 | **Output the positions of all matches along with the original word used:** 70 | 71 | ```javascript 72 | // Pass "true" as the "sorted" parameter so the matches are sorted by their position. 73 | const matches = matcher.getAllMatches('ʃ𝐟ʃὗƈk ỹоứ 𝔟ⁱẗ𝙘ɦ', true); 74 | for (const match of matches) { 75 | const { phraseMetadata, startIndex, endIndex } = 76 | englishDataset.getPayloadWithPhraseMetadata(match); 77 | console.log( 78 | `Match for word ${phraseMetadata.originalWord} found between ${startIndex} and ${endIndex}.`, 79 | ); 80 | } 81 | // Match for word fuck found between 0 and 6. 82 | // Match for word bitch found between 12 and 18. 83 | ``` 84 | 85 | **Censoring matched text:** 86 | 87 | To censor text, we'll need to import another class: the `TextCensor`. 
88 | Some other imports and creation of the matcher have been elided for simplicity. 89 | 90 | ```javascript 91 | const { TextCensor, ... } = require('obscenity'); 92 | // ... 93 | const censor = new TextCensor(); 94 | const input = 'fuck you little bitch'; 95 | const matches = matcher.getAllMatches(input); 96 | console.log(censor.applyTo(input, matches)); 97 | // %@$% you little **%@% 98 | ``` 99 | 100 | This is just a small slice of what Obscenity can do: for more, check out the [documentation](#documentation). 101 | 102 | ## Accuracy 103 | 104 | > **Note:** As with all swear filters, Obscenity is not perfect (nor will it ever be). Use its output as a heuristic, and not as the sole judge of whether some content is appropriate or not. 105 | 106 | With the English preset, Obscenity (correctly) finds matches in all of the following texts: 107 | 108 | - you are a little **fuck**er 109 | - **fk** you 110 | - **ffuk** you 111 | - i like **a$$es** 112 | - ʃ𝐟ʃὗƈk ỹоứ 113 | 114 | ...and it **does not match** on the following: 115 | 116 | - the **pen is** mightier than the sword 117 | - i love banan**as s**o yeah 118 | - this song seems really b**anal** 119 | - g**rape**s are really yummy 120 | 121 | ## Documentation 122 | 123 | For a step-by-step guide on how to use Obscenity, check out the [guide](./docs/guide). 124 | 125 | Otherwise, refer to the [auto-generated API documentation](./docs/reference). 126 | 127 | ## Contributing 128 | 129 | Issues can be reported using the [issue tracker](https://github.com/jo3-l/obscenity/issues). 130 | If you'd like to submit a pull request, please read the [contribution guide](./CONTRIBUTING.md) first. 131 | 132 | ## Author 133 | 134 | **Obscenity** © [Joe L.](https://github.com/jo3-l/) under the MIT license. Authored and maintained by Joe L. 
135 | 136 | > GitHub [@jo3-l](https://github.com/jo3-l) 137 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | Obscenity's documentation is composed of a [reference](./reference/) automatically generated using [TypeDoc](https://typedoc.org/) and a brief [guide](./guide/) comprising a number of hand-written articles. 4 | -------------------------------------------------------------------------------- /docs/guide/README.md: -------------------------------------------------------------------------------- 1 | # Welcome 2 | 3 | Welcome to the Obscenity guide, a collection of articles that teach you how to use Obscenity in a step-by-step fashion. 4 | 5 | Though you can read it in any order you wish, we recommend the following order: 6 | 7 | - [Patterns](./patterns.md) 8 | - [Transformers](./transformers.md) 9 | - [Matchers](./matchers.md) 10 | - [Datasets](./datasets.md) 11 | - [Censoring text](./censoring-text.md) 12 | -------------------------------------------------------------------------------- /docs/guide/censoring-text.md: -------------------------------------------------------------------------------- 1 | # Censoring Profane Phrases 2 | 3 | > Learn how to censor text with Obscenity's `TextCensor`. 4 | 5 | A common strategy to deal with content containing banned phrases is to _censor_ 6 | them by replacing the offending parts of the content with placeholders. 7 | 8 | Obscenity's `TextCensor` class makes this simple. 
Consider the following basic 9 | example: 10 | 11 | ```typescript 12 | import { TextCensor, RegExpMatcher, englishDataset, englishRecommendedTransformers } from 'obscenity'; 13 | 14 | const matcher = new RegExpMatcher({ ...englishDataset.build(), ...englishRecommendedTransformers }); 15 | const censor = new TextCensor(); // (1) 16 | 17 | const text = 'f u c k you!'; 18 | const matches = matcher.getAllMatches(text); 19 | console.log(censor.applyTo(text, matches)); // (2) 20 | //> "@$** you!" 21 | ``` 22 | 23 | We start by constructing a `TextCensor` (1). Then, we apply this censor to a 24 | piece of content by invoking the `applyTo` method with the original text along 25 | with the set of matches (2). 26 | 27 | Note that in the above example the offending content has been replaced with 28 | [grawlix](https://en.wikipedia.org/wiki/Grawlix). However, if this is 29 | undesirable for your use-case, the replacement text can be easily customized 30 | by providing your own _censor strategy_. 31 | 32 | ## Censor Strategies 33 | 34 | A censor strategy specifies how to generate replacement text given a match. 35 | Under the hood, a censor strategy is simply a function that receives a _censor 36 | context_ and returns a replacement string. 37 | 38 | The most basic type of censor strategy simply returns a fixed replacement 39 | string: 40 | 41 | ```typescript 42 | const fudgeStrategy = () => 'fudge'; 43 | ``` 44 | 45 | To use this censor strategy, we use the `setStrategy` method on our 46 | `TextCensor`: 47 | 48 | ```typescript 49 | const censor = new TextCensor().setStrategy(fudgeStrategy); 50 | // ... 51 | console.log(censor.applyTo(text, matches)); 52 | //> "fudge you!" 53 | ``` 54 | 55 | We can also create more complex strategies that generate output dynamically 56 | based on the specific text matched. For instance, let us try writing a strategy 57 | that will generate a string of asterisks of varying length ⸺ `ass` should become 58 | `***`, `fuck` `****`, and so on. 
To do this, we can use the `matchLength` property 59 | of the censor context: 60 | 61 | ```typescript 62 | const asteriskStrategy = (ctx: CensorContext) => '*'.repeat(ctx.matchLength); 63 | ``` 64 | 65 | which works as expected: 66 | 67 | ```typescript 68 | const censor = new TextCensor().setStrategy(asteriskStrategy); 69 | // ... 70 | console.log(censor.applyTo(text, matches)); 71 | //> "**** you!" 72 | ``` 73 | 74 | Other than the match length, censor contexts also include the following data: 75 | 76 | - All the properties of `MatchPayload`s, as `CensorContext` extends 77 | `MatchPayload`. Thus, `ctx.termId`, `ctx.startIndex`, `ctx.endIndex`, and so 78 | on are all accessible. 79 | - `input` ⸺ The input text. 80 | - `overlapsAtStart` ⸺ Whether the current match overlaps at the start with some other match. 81 | - `overlapsAtEnd` ⸺ Whether the current match overlaps at the end with some other match. 82 | 83 | ## Built-in Censor Strategies 84 | 85 | Obscenity exports the two censor strategies discussed in this article to save 86 | you the work of implementing them yourself: 87 | 88 | - `grawlixCensorStrategy()` ⸺ Generates grawlix; this is the default strategy. 89 | - `asteriskCensorStrategy()` ⸺ Generates repeated asterisks.. 90 | 91 | In addition, a number of utilities are provided to aid in writing custom censor 92 | strategies: 93 | 94 | - `fixedPhraseCensorStrategy()` ⸺ Returns a censor strategy that produces a 95 | fixed phrase. For example, `fixedPhraseCensorStrategy('fudge')` always returns 96 | `fudge`. 97 | - `fixedCharCensorStrategy()` ⸺ Returns a censor strategy that produces the 98 | input character repeated an appropriate number of times. For example, 99 | `fixedCharCensorStrategy('$')` might return `$`, `$$`, `$$`, and so on. 100 | - `randomCharFromSetCensorStrategy()` ⸺ Returns a censor strategy that produces 101 | random characters from the set of characters given, repeated an appropriate 102 | number of times. 
For example, `randomCharFromSetCensorStrategy('%&')` might 103 | return `%%&`, `&%&&`, and so on. 104 | - `keepStartCensorStrategy()` ⸺ Extends another censor strategy by keeping the 105 | first character matched. For example, 106 | `keepStartCensorStrategy(asteriskStrategy)` might produce `f***` as the 107 | replacement string. 108 | - `keepEndCensorStrategy()` ⸺ Same as above, but keeps the last character 109 | matched instead. 110 | 111 | --- 112 | 113 | Great, now you know all about censoring text (and if you've read the guide in 114 | order, all of Obscenity's features)! If you have further questions, consult the 115 | reference documentation. 116 | -------------------------------------------------------------------------------- /docs/guide/datasets.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | 3 | > Learn about datasets, a way to organize blacklisted and whitelisted terms. 4 | 5 | Say that you want to know what the original word associated with a match was. You could do this with a giant chain of `if-else`s: 6 | 7 | ```typescript 8 | const patterns = [ 9 | { id: 0, pattern: pattern`fck` }, 10 | { id: 1, pattern: pattern`fuck` }, 11 | { id: 2, pattern: pattern`bish` }, 12 | { id: 3, pattern: pattern`bitch` }, 13 | // ... 14 | ]; 15 | 16 | const matcher = new RegExpMatcher({ ... }); 17 | const payloads = matcher.getAllMatches(text); 18 | for (const payload of payloads) { 19 | if (payload.termId === 0 || payload.termId === 1) console.log('Original word: fuck'); 20 | else if (payload.termId === 2 || payload.termId === 3) console.log('Original word: bitch'); 21 | // ... 22 | } 23 | ``` 24 | 25 | ...but clearly this becomes quite unmaintainable with many patterns. What's the solution? 26 | 27 | **Datasets** can come in handy here. They support creating groups of blacklisted/whitelisted terms ("phrases") and associating arbitrary metadata with them. 
To see what's meant by this, see the following example: 28 | 29 | ```typescript 30 | import { DataSet, pattern } from 'obscenity'; 31 | 32 | const dataset = new DataSet<{ originalWord: string }>() 33 | // addPhrase() adds a new phrase to the dataset. 34 | .addPhrase((phrase) => 35 | phrase 36 | // setMetadata() sets the metadata of the phrase. 37 | .setMetadata({ originalWord: 'fuck' }) 38 | // addPattern() associates a pattern with the phrase. 39 | .addPattern(pattern`fck`) 40 | .addPattern(pattern`fuck`), 41 | ) 42 | .addPhrase((phrase) => 43 | phrase 44 | .setMetadata({ originalWord: 'bitch' }) 45 | .addPattern(pattern`bish`) 46 | .addPattern(pattern`bitch`) 47 | // addWhitelistedTerm() associates a whitelisted term with the phrase. 48 | .addWhitelistedTerm('abish'), 49 | ); 50 | ``` 51 | 52 | To use our dataset with a matcher, we can call the `build()` method, which will produce an object structured like `{ blacklistedTerms, whitelistedTerms }`, which we can then use in the matcher options: 53 | 54 | ```typescript 55 | const built = dataset.build(); 56 | const matcher = new RegExpMatcher({ 57 | blacklistedTerms: built.blacklistedTerms, 58 | whitelistedTerms: built.whitelistedTerms, 59 | // Other options go here. 60 | }); 61 | 62 | // Or, using spread notation: 63 | const matcher = new RegExpMatcher({ 64 | ...built, 65 | // Other options go here. 66 | }); 67 | ``` 68 | 69 | But how does this help us solve the original problem (getting the original word from a match)? Simple. We can use the `getPayloadWithPhraseMetadata` method: 70 | 71 | ```typescript 72 | const payloads = matcher.getAllMatches(input); 73 | const payloadsWithMetadata = payloads.map(dataset.getPayloadWithPhraseMetadata); 74 | ``` 75 | 76 | The `getPayloadWithPhraseMetadata` will return a copy of the original match payload with a new property added: `phraseMetadata`, which is the phrase metadata associated with the term that matched. 
77 | 78 | So, to get the original word that matched for the first payload, we could just use the following: 79 | 80 | ```typescript 81 | const originalWord = payloadsWithMetadata[0].phraseMetadata!.originalWord; 82 | ``` 83 | 84 | Though associating metadata with phrases is one of the main features of the `DataSet`, it's by no means the only one, as we'll see in the next section. 85 | 86 | ## Extending existing datasets 87 | 88 | Say that you would like to use the English preset, but you don't really agree with one of the words in there. That's simple to fix: we can just extend the dataset of English words: 89 | 90 | ```typescript 91 | const myDataset = new DataSet<{ originalWord: string }>() 92 | // addAll() adds all the data from the dataset passed. 93 | .addAll(englishDataset) 94 | // removePhrasesIf() removes phrases from the current dataset if the function provided 95 | // returns true. 96 | .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'bitch'); 97 | ``` 98 | 99 | The `addAll` method adds 100 | 101 | Using our new dataset is equally as simple: 102 | 103 | ```typescript 104 | const matcher = new RegExpMatcher({ 105 | ...myDataset.build(), 106 | // Other options go here. 107 | }); 108 | ``` 109 | 110 | --- 111 | 112 | **Next up: [Censoring text](./censoring-text.md).** 113 | -------------------------------------------------------------------------------- /docs/guide/matchers.md: -------------------------------------------------------------------------------- 1 | # Matchers 2 | 3 | > Learn about Obscenity's `Matcher` interface and its implementations. 4 | 5 | We've previously discussed patterns and transformers. It's time to learn about how to use Obscenity to search for blacklisted terms in text, while respecting whitelisted terms. 6 | 7 | To facilitate this, Obscenity provides the `RegExpMatcher`, which -- as the name suggests -- implements matching using regular expressions and string searching methods. 
At a high level, all it does is: 8 | 9 | ``` 10 | apply transformations to text before matching whitelisted terms 11 | find whitelisted terms in text 12 | 13 | apply transformations to text before matching blacklisted terms 14 | for each blacklisted term 15 | for all matches of the blacklisted term in the text 16 | if a whitelisted term did not match this part of the text 17 | emit match 18 | ``` 19 | 20 | For now, the `RegExpMatcher` is the only matcher implementation offered by Obscenity, though this may change in future versions. 21 | 22 | ## Providing matcher options 23 | 24 | Matchers support several options: 25 | 26 | - `blacklistedTerms` - a list of blacklisted terms. Blacklisted terms are objects with a unique ID that identify them and a pattern, e.g. `` { id: 0, pattern: pattern`my pattern` } ``. 27 | 28 | > **Tip:** If you only want to supply a list of patterns (as you don't care about knowing exactly which pattern matched, you can use the `assignIncrementingIds` utility): 29 | > 30 | > ```typescript 31 | > import { RegExpMatcher, assignIncrementingIds, pattern } from 'obscenity'; 32 | > 33 | > const matcher = new RegExpMatcher({ 34 | > blacklistedTerms: assignIncrementingIds([pattern`my pattern`]), 35 | > }); 36 | > ``` 37 | 38 | - `whitelistedTerms` - a list of whitelisted terms, which are just strings. 39 | 40 | - `blacklistMatcherTransformers` - a set of transformers that should be applied to the text before matching blacklisted terms. They will be applied in the order they are given. 41 | 42 | - `whitelistMatcherTransformers` - a set of transformers that should be applied to the text before matching whitelisted terms. They will be applied in the order they are given. 
43 | 44 | ### Example 45 | 46 | ```typescript 47 | import { RegExpMatcher, pattern } from 'obscenity'; 48 | 49 | const matcher = new RegExpMatcher({ 50 | blacklistedTerms: [ 51 | { id: 0, pattern: pattern`hi` }, 52 | { id: 1, pattern: pattern`bye` }, 53 | ], 54 | whitelistedTerms: ['achingly'], 55 | blacklistMatcherTransformers: [skipSpaces], 56 | whitelistMatcherTransformers: [], 57 | }); 58 | ``` 59 | 60 | This will match `hi` and `bye` (ignoring spaces) unless the `hi` is part of `achingly` (not ignoring spaces). 61 | 62 | ## Presets 63 | 64 | While coming up with your own list of blacklisted terms / whitelisted terms / transformers is a possibility, it does take quite a bit of time if you want to make sure you have few false positives and match as many variants as possible. 65 | 66 | To save you some work, Obscenity features _presets_, which are sets of blacklisted terms, whitelisted terms, and transformers. For example, to use the English preset: 67 | 68 | ```typescript 69 | import { RegExpMatcher, englishDataset, englishRecommendedTransformers } from 'obscenity'; 70 | 71 | const matcher = new RegExpMatcher({ 72 | ...englishDataset.build(), 73 | ...englishRecommendedTransformers, 74 | }); 75 | ``` 76 | 77 | ### Available presets 78 | 79 | The English preset is the only one available at the moment, but more may be added in the future. 80 | 81 | ## Using the matcher 82 | 83 | Now, we can use our matcher to answer some questions about our text. Namely, we can ask it whether the text contains any blacklisted terms, and where those blacklisted terms appeared. 84 | 85 | To check whether the text contains any blacklisted terms, we can use the `hasMatch()` method: 86 | 87 | ```typescript 88 | const hasMatch = matcher.hasMatch(input); 89 | ``` 90 | 91 | This should be preferred if you do not need to know which terms matched and where they matched. 
92 | 93 | If you do need to obtain more information about the matches, though, you can use `getAllMatches()`: 94 | 95 | ```typescript 96 | const payloads = matcher.getAllMatches(input); 97 | ``` 98 | 99 | You may notice that the resulting list of matches is not sorted. That is, matches beginning at a higher index might come before matches beginning at a lower index. 100 | If having a sorted list of matches is a requirement for your code, you can pass `true` as an argument to `getAllMatches`. 101 | 102 | ```typescript 103 | const sortedPayloads = matcher.getAllMatches(input, true); 104 | ``` 105 | 106 | `getAllMatches()` returns a list of match payloads, which contain four pieces of information: 107 | 108 | - `termId` - the ID of the term that matched; 109 | - `startIndex` - the start index of the match, inclusive; 110 | - `endIndex` - the end index of the match, inclusive; 111 | - `matchLength` - the number of characters that matched. 112 | 113 | The information emitted may not be enough for your use-case (perhaps you want to track the type of word was used, what the original word was, etc.). If that's the case, be sure to check out the next article! 114 | 115 | --- 116 | 117 | **Next up: [Datasets](./datasets.md).** 118 | -------------------------------------------------------------------------------- /docs/guide/patterns.md: -------------------------------------------------------------------------------- 1 | # Patterns 2 | 3 | > Learn about Obscenity's custom pattern syntax. 4 | 5 | **Patterns** are used to specify blacklisted words. To ease matching variations of words with only small changes, they support some special syntax (namely, wildcards, optional expressions, and boundary assertions). For example, the pattern `f?ck` matches `f`, then any character, then `ck`; and matches on `fuck`, `fbck`, `fyck`, `fack`, and so on. 6 | 7 | ## Why a custom pattern syntax? 
8 | 9 | This might sound similar to [regular expressions](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions), which are widely used for similar purposes. Why not just use them instead of inventing some custom syntax? A few reasons: 10 | 11 | - Regular expressions are overkill for profanity filtering in most cases. Their expressive syntax is, for the most part, completely unneeded as most variations are normalized before matching (see the article on [transformers](./transformers.md)). 12 | - Not supporting all the features of regular expressions can make a more efficient implementation in certain cases. In addition to a simpler matcher implementation using regular expressions (ironically) and string searching methods, Obscenity also features a matcher implementation using finite automata techniques which searches for patterns in parallel, which may be useful if you have a large number of patterns. 13 | 14 | ## Pattern syntax 15 | 16 | Most characters match _literally_. that is, `a` matches an `a`, `book` matches `book`, and so on. However, there are three special expressions that are available: 17 | 18 | - **Wildcards:** A `?` matches any character. 19 | - **Optional expressions:** Wrapping an expression in a set of square brackets (`[]`) matches it _optional_: `a[bc]` matches either `a` or `abc`. 20 | - **Boundary assertions:** Placing a pipe (`|`) at the start or end of the pattern asserts position at a word boundary: `|tit` matches `tit` and `tits` but not `substitute`. Similarly, `chick|` matches 21 | `chick` but not `chicken`. 22 | 23 | A special character mentioned above can be escaped using a backslash (`\`): `\?` matches `?` instead of a wildcard. 
24 | 25 | ## Using patterns with Obscenity 26 | 27 | A pattern may be created using the `parseRawPattern()` function: 28 | 29 | ```typescript 30 | import { parseRawPattern } from 'obscenity'; 31 | 32 | const p = parseRawPattern('f?ck'); 33 | ``` 34 | 35 | However, it is usually more convenient to use the `pattern` [tagged template](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#tagged_templates): 36 | 37 | ```typescript 38 | import { pattern } from 'obscenity'; 39 | 40 | const p = pattern`f?ck`; 41 | ``` 42 | 43 | Note the lack of `()` when calling `pattern` and the usage of backticks. 44 | 45 | Due to how the `pattern` tagged template works internally, it is not necessary to double-escape backslashes: 46 | 47 | ```typescript 48 | import { pattern } from 'obscenity'; 49 | 50 | const p = pattern`\[`; 51 | ``` 52 | 53 | If you were using `parseRawPattern` instead, the following would be required: 54 | 55 | ```typescript 56 | const p = parseRawPattern('\\['); 57 | ``` 58 | 59 | --- 60 | 61 | **Next up: [Transformers](./transformers.md).** 62 | -------------------------------------------------------------------------------- /docs/guide/transformers.md: -------------------------------------------------------------------------------- 1 | # Transformers 2 | 3 | > Learn all about transformers: what they are, built-in transformers, and how to make your own. 4 | 5 | **Transformers** normalize text before it is passed to the matcher. For example, all of the following could be implementing using transformers: 6 | 7 | - Confusable character resolution: `Ἢἕļľᦞ ш٥ṟlᑰ!` -> `hello world` 8 | - Leet-speak resolution: `h3llo world` -> `hello world` 9 | - Duplicate character collapsing: `heeello world` -> `hello world` 10 | 11 | ## Simple transformers 12 | 13 | In their simplest form, transformers are just functions that map characters to other characters. 
For example, a transformer that changes `a` to `b` and keeps other characters intact might look like: 14 | 15 | ```typescript 16 | import { createSimpleTransformer } from 'obscenity'; 17 | 18 | const a = 'a'.charCodeAt(0); 19 | const b = 'b'.charCodeAt(0); 20 | const changeAToB = createSimpleTransformer((c) => (c === a ? b : c)); 21 | ``` 22 | 23 | > **Note:** `createSimpleTransformer` is an adapter that returns the input function in a structure suitable for use with matchers, which are discussed in the next article. Don't forget to use it when creating transformers! 24 | 25 | > **Warning:** Note that as transformers take the character _code_ as input rather than a string, implementing the transformer like this: 26 | > 27 | > ```typescript 28 | > const changeAToB = createSimpleTransformer((c) => (c === 'a' ? 'b' : c)); 29 | > ``` 30 | > 31 | > ...is, unfortunately, incorrect, as `c` is a number and would never be equal to `'a'`. Remember to always use character codes when writing transformers! 32 | 33 | ### Removing characters 34 | 35 | Sometimes, changing characters isn't enough. Perhaps you want to completely ignore a character when matching. As an example, perhaps you want to skip the spaces in `f u c k` so it becomes `fuck`. 36 | 37 | To do this, simply return `undefined` from the transformer, which signifies that the character should be ignored. With this in mind, we can easily write a transformer that skips spaces: 38 | 39 | ```typescript 40 | import { createSimpleTransformer } from 'obscenity'; 41 | 42 | const space = ' '.charCodeAt(0); 43 | const skipSpaces = createSimpleTransformer((c) => (c === space ? undefined : c)); 44 | ``` 45 | 46 | ## Stateful transformers 47 | 48 | The aforementioned type of transformer is inadequate if you want to implement more complicated transformers, though. 
For example, if we wanted to implement a transformer that collapses duplicate characters, we'd hit a roadblock quite quickly in that we need to store the last character to check whether it's a duplicate, but simple transformers provide no clear way to do so. 49 | 50 | This is where _stateful transformers_ come in handy. Stateful transformers are objects that implement the `StatefulTransformer` interface. More specifically, your object has to have the following methods: 51 | 52 | - `transform(char)`, which takes a character and returns the transformed character. 53 | - `reset()`, which resets any internal state the transformer has. 54 | 55 | With this in mind, we can now write a stateful transformer that ignores duplicate characters: 56 | 57 | ```typescript 58 | class CollapseDuplicates implements StatefulTransformer { 59 | private lastCharacter = -1; 60 | 61 | public transform(char: number) { 62 | if (char === this.lastCharacter) return undefined; 63 | this.lastCharacter = char; 64 | return char; 65 | } 66 | 67 | public reset() { 68 | this.lastCharacter = -1; 69 | } 70 | } 71 | ``` 72 | 73 | Now, we can use the `createStatefulTransformer` adapter to get a structure suitable for use with matchers (discussed in the next article): 74 | 75 | ```typescript 76 | import { createStatefulTransformer } from 'obscenity'; 77 | 78 | const collapseDuplicates = createStatefulTransformer(() => new CollapseDuplicates()); 79 | ``` 80 | 81 | --- 82 | 83 | Excellent, you now know all about transformers! Now, let's take a look at the various built-in transformers Obscenity provides out of the box. 84 | 85 | ## Built-in transformers 86 | 87 | Obscenity features a number of built-in transformers for common tasks. 
88 | 89 | - **Collapsing duplicate characters** is implemented by the `collapseDuplicatesTransformer()`: `fuuuuuuuck` becomes `fuck`; 90 | - **Resolving confusable Unicode characters** is implemented by the `resolveConfusablesTransformer()`: ``Ἢἕļľᦞ ш٥ṟlᑰ!` becomes `hello world!`; 91 | - **Resolving leet-speak** is implemented by the `resolveLeetSpeakTransformer()`: `h3llo world` becomes `hello world`; 92 | - **Skipping non-alphabetic characters** is implemented by the `skipNonAlphabeticTransformer()`: `f.u.c.. k` becomes `fuck`; 93 | - **Converting characters to lower-case** is implemented by the `toAsciiLowerCaseTransformer()`: `fUCk` becomes `fuck`. 94 | 95 | --- 96 | 97 | **Next up: [Matchers](./matchers.md).** 98 | -------------------------------------------------------------------------------- /docs/reference/.nojekyll: -------------------------------------------------------------------------------- 1 | TypeDoc added this file to prevent GitHub Pages from using Jekyll. You can turn off this behavior by setting the `githubPages` option to false. -------------------------------------------------------------------------------- /docs/reference/classes/DataSet.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / DataSet 2 | 3 | # Class: DataSet 4 | 5 | Holds phrases (groups of patterns and whitelisted terms), optionally 6 | associating metadata with them. 7 | 8 | ## Type parameters 9 | 10 | | Name | Description | 11 | | :------ | :------ | 12 | | `MetadataType` | Metadata type for phrases. Note that the metadata type is implicitly nullable. 
| 13 | 14 | ## Table of contents 15 | 16 | ### Constructors 17 | 18 | - [constructor](DataSet.md#constructor) 19 | 20 | ### Methods 21 | 22 | - [addAll](DataSet.md#addall) 23 | - [addPhrase](DataSet.md#addphrase) 24 | - [build](DataSet.md#build) 25 | - [getPayloadWithPhraseMetadata](DataSet.md#getpayloadwithphrasemetadata) 26 | - [removePhrasesIf](DataSet.md#removephrasesif) 27 | 28 | ## Constructors 29 | 30 | ### constructor 31 | 32 | • **new DataSet**<`MetadataType`\>() 33 | 34 | #### Type parameters 35 | 36 | | Name | 37 | | :------ | 38 | | `MetadataType` | 39 | 40 | ## Methods 41 | 42 | ### addAll 43 | 44 | ▸ **addAll**(`other`): [`DataSet`](DataSet.md)<`MetadataType`\> 45 | 46 | Adds all the phrases from the dataset provided to this one. 47 | 48 | **`Example`** 49 | 50 | ```typescript 51 | const customDataset = new DataSet().addAll(englishDataset); 52 | ``` 53 | 54 | #### Parameters 55 | 56 | | Name | Type | Description | 57 | | :------ | :------ | :------ | 58 | | `other` | [`DataSet`](DataSet.md)<`MetadataType`\> | Other dataset. | 59 | 60 | #### Returns 61 | 62 | [`DataSet`](DataSet.md)<`MetadataType`\> 63 | 64 | #### Defined in 65 | 66 | [src/dataset/DataSet.ts:29](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L29) 67 | 68 | ___ 69 | 70 | ### addPhrase 71 | 72 | ▸ **addPhrase**(`fn`): [`DataSet`](DataSet.md)<`MetadataType`\> 73 | 74 | Adds a phrase to this dataset. 
75 | 76 | **`Example`** 77 | 78 | ```typescript 79 | const data = new DataSet<{ originalWord: string }>() 80 | .addPhrase((phrase) => phrase.setMetadata({ originalWord: 'fuck' }) 81 | .addPattern(pattern`fuck`) 82 | .addPattern(pattern`f[?]ck`) 83 | .addWhitelistedTerm('Afck')) 84 | .build(); 85 | ``` 86 | 87 | #### Parameters 88 | 89 | | Name | Type | Description | 90 | | :------ | :------ | :------ | 91 | | `fn` | (`builder`: [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\>) => [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> | A function that takes a [[PhraseBuilder]], adds patterns/whitelisted terms/metadata to it, and returns it. | 92 | 93 | #### Returns 94 | 95 | [`DataSet`](DataSet.md)<`MetadataType`\> 96 | 97 | #### Defined in 98 | 99 | [src/dataset/DataSet.ts:75](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L75) 100 | 101 | ___ 102 | 103 | ### build 104 | 105 | ▸ **build**(): `Pick`<[`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md), ``"blacklistedTerms"`` \| ``"whitelistedTerms"``\> 106 | 107 | Returns the dataset in a format suitable for usage with the [[RegExpMatcher]]. 
108 | 109 | **`Example`** 110 | 111 | ```typescript 112 | // With the RegExpMatcher: 113 | const matcher = new RegExpMatcher({ 114 | ...dataset.build(), 115 | // additional options here 116 | }); 117 | ``` 118 | 119 | #### Returns 120 | 121 | `Pick`<[`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md), ``"blacklistedTerms"`` \| ``"whitelistedTerms"``\> 122 | 123 | #### Defined in 124 | 125 | [src/dataset/DataSet.ts:118](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L118) 126 | 127 | ___ 128 | 129 | ### getPayloadWithPhraseMetadata 130 | 131 | ▸ **getPayloadWithPhraseMetadata**(`payload`): [`MatchPayloadWithPhraseMetadata`](../README.md#matchpayloadwithphrasemetadata)<`MetadataType`\> 132 | 133 | Retrieves the phrase metadata associated with a pattern and returns a 134 | copy of the match payload with said metadata attached to it. 135 | 136 | **`Example`** 137 | 138 | ```typescript 139 | const matches = matcher.getAllMatches(input); 140 | const matchesWithPhraseMetadata = matches.map((match) => dataset.getPayloadWithPhraseMetadata(match)); 141 | // Now we can access the 'phraseMetadata' property: 142 | const phraseMetadata = matchesWithPhraseMetadata[0].phraseMetadata; 143 | ``` 144 | 145 | #### Parameters 146 | 147 | | Name | Type | Description | 148 | | :------ | :------ | :------ | 149 | | `payload` | [`MatchPayload`](../interfaces/MatchPayload.md) | Original match payload. | 150 | 151 | #### Returns 152 | 153 | [`MatchPayloadWithPhraseMetadata`](../README.md#matchpayloadwithphrasemetadata)<`MetadataType`\> 154 | 155 | #### Defined in 156 | 157 | [src/dataset/DataSet.ts:94](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L94) 158 | 159 | ___ 160 | 161 | ### removePhrasesIf 162 | 163 | ▸ **removePhrasesIf**(`predicate`): [`DataSet`](DataSet.md)<`MetadataType`\> 164 | 165 | Removes phrases that match the predicate given. 
166 | 167 | **`Example`** 168 | 169 | ```typescript 170 | const customDataset = new DataSet<{ originalWord: string }>() 171 | .addAll(englishDataset) 172 | .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck'); 173 | ``` 174 | 175 | #### Parameters 176 | 177 | | Name | Type | Description | 178 | | :------ | :------ | :------ | 179 | | `predicate` | (`phrase`: [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\>) => `boolean` | A predicate that determines whether or not a phrase should be removed. Return `true` to remove, `false` to keep. | 180 | 181 | #### Returns 182 | 183 | [`DataSet`](DataSet.md)<`MetadataType`\> 184 | 185 | #### Defined in 186 | 187 | [src/dataset/DataSet.ts:46](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L46) 188 | -------------------------------------------------------------------------------- /docs/reference/classes/ParserError.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / ParserError 2 | 3 | # Class: ParserError 4 | 5 | Custom error thrown by the parser when syntactical errors are detected. 
6 | 7 | ## Hierarchy 8 | 9 | - `Error` 10 | 11 | ↳ **`ParserError`** 12 | 13 | ## Table of contents 14 | 15 | ### Constructors 16 | 17 | - [constructor](ParserError.md#constructor) 18 | 19 | ### Properties 20 | 21 | - [column](ParserError.md#column) 22 | - [line](ParserError.md#line) 23 | - [message](ParserError.md#message) 24 | - [name](ParserError.md#name) 25 | - [stack](ParserError.md#stack) 26 | 27 | ## Constructors 28 | 29 | ### constructor 30 | 31 | • **new ParserError**(`message`, `line`, `column`) 32 | 33 | #### Parameters 34 | 35 | | Name | Type | 36 | | :------ | :------ | 37 | | `message` | `string` | 38 | | `line` | `number` | 39 | | `column` | `number` | 40 | 41 | #### Overrides 42 | 43 | Error.constructor 44 | 45 | #### Defined in 46 | 47 | [src/pattern/ParserError.ts:18](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L18) 48 | 49 | ## Properties 50 | 51 | ### column 52 | 53 | • `Readonly` **column**: `number` 54 | 55 | The column on which the error occurred (one-based). 56 | Note that surrogate pairs are counted as 1 column wide, not 2. 57 | 58 | #### Defined in 59 | 60 | [src/pattern/ParserError.ts:16](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L16) 61 | 62 | ___ 63 | 64 | ### line 65 | 66 | • `Readonly` **line**: `number` 67 | 68 | The line on which the error occurred (one-based). 
69 | 70 | #### Defined in 71 | 72 | [src/pattern/ParserError.ts:10](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L10) 73 | 74 | ___ 75 | 76 | ### message 77 | 78 | • **message**: `string` 79 | 80 | #### Inherited from 81 | 82 | Error.message 83 | 84 | #### Defined in 85 | 86 | node_modules/.pnpm/typescript@5.2.2/node_modules/typescript/lib/lib.es5.d.ts:1068 87 | 88 | ___ 89 | 90 | ### name 91 | 92 | • `Readonly` **name**: ``"ParserError"`` 93 | 94 | #### Overrides 95 | 96 | Error.name 97 | 98 | #### Defined in 99 | 100 | [src/pattern/ParserError.ts:5](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/ParserError.ts#L5) 101 | 102 | ___ 103 | 104 | ### stack 105 | 106 | • `Optional` **stack**: `string` 107 | 108 | #### Inherited from 109 | 110 | Error.stack 111 | 112 | #### Defined in 113 | 114 | node_modules/.pnpm/typescript@5.2.2/node_modules/typescript/lib/lib.es5.d.ts:1069 115 | -------------------------------------------------------------------------------- /docs/reference/classes/PhraseBuilder.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / PhraseBuilder 2 | 3 | # Class: PhraseBuilder 4 | 5 | Builder for phrases. 
6 | 7 | ## Type parameters 8 | 9 | | Name | 10 | | :------ | 11 | | `MetadataType` | 12 | 13 | ## Table of contents 14 | 15 | ### Constructors 16 | 17 | - [constructor](PhraseBuilder.md#constructor) 18 | 19 | ### Methods 20 | 21 | - [addPattern](PhraseBuilder.md#addpattern) 22 | - [addWhitelistedTerm](PhraseBuilder.md#addwhitelistedterm) 23 | - [build](PhraseBuilder.md#build) 24 | - [setMetadata](PhraseBuilder.md#setmetadata) 25 | 26 | ## Constructors 27 | 28 | ### constructor 29 | 30 | • **new PhraseBuilder**<`MetadataType`\>() 31 | 32 | #### Type parameters 33 | 34 | | Name | 35 | | :------ | 36 | | `MetadataType` | 37 | 38 | ## Methods 39 | 40 | ### addPattern 41 | 42 | ▸ **addPattern**(`pattern`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 43 | 44 | Associates a pattern with this phrase. 45 | 46 | #### Parameters 47 | 48 | | Name | Type | Description | 49 | | :------ | :------ | :------ | 50 | | `pattern` | [`ParsedPattern`](../interfaces/ParsedPattern.md) | Pattern to add. | 51 | 52 | #### Returns 53 | 54 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 55 | 56 | #### Defined in 57 | 58 | [src/dataset/DataSet.ts:149](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L149) 59 | 60 | ___ 61 | 62 | ### addWhitelistedTerm 63 | 64 | ▸ **addWhitelistedTerm**(`term`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 65 | 66 | Associates a whitelisted pattern with this phrase. 67 | 68 | #### Parameters 69 | 70 | | Name | Type | Description | 71 | | :------ | :------ | :------ | 72 | | `term` | `string` | Whitelisted term to add. 
| 73 | 74 | #### Returns 75 | 76 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 77 | 78 | #### Defined in 79 | 80 | [src/dataset/DataSet.ts:159](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L159) 81 | 82 | ___ 83 | 84 | ### build 85 | 86 | ▸ **build**(): [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\> 87 | 88 | Builds the phrase, returning a [[PhraseContainer]] for use with the 89 | [[DataSet]]. 90 | 91 | #### Returns 92 | 93 | [`PhraseContainer`](../interfaces/PhraseContainer.md)<`MetadataType`\> 94 | 95 | #### Defined in 96 | 97 | [src/dataset/DataSet.ts:178](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L178) 98 | 99 | ___ 100 | 101 | ### setMetadata 102 | 103 | ▸ **setMetadata**(`metadata?`): [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 104 | 105 | Associates some metadata with this phrase. 106 | 107 | #### Parameters 108 | 109 | | Name | Type | Description | 110 | | :------ | :------ | :------ | 111 | | `metadata?` | `MetadataType` | Metadata to use. | 112 | 113 | #### Returns 114 | 115 | [`PhraseBuilder`](PhraseBuilder.md)<`MetadataType`\> 116 | 117 | #### Defined in 118 | 119 | [src/dataset/DataSet.ts:169](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L169) 120 | -------------------------------------------------------------------------------- /docs/reference/classes/RegExpMatcher.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / RegExpMatcher 2 | 3 | # Class: RegExpMatcher 4 | 5 | An implementation of the [[Matcher]] interface using regular expressions and 6 | string searching methods. 
7 | 8 | ## Implements 9 | 10 | - [`Matcher`](../interfaces/Matcher.md) 11 | 12 | ## Table of contents 13 | 14 | ### Constructors 15 | 16 | - [constructor](RegExpMatcher.md#constructor) 17 | 18 | ### Methods 19 | 20 | - [getAllMatches](RegExpMatcher.md#getallmatches) 21 | - [hasMatch](RegExpMatcher.md#hasmatch) 22 | 23 | ## Constructors 24 | 25 | ### constructor 26 | 27 | • **new RegExpMatcher**(`options`) 28 | 29 | Creates a new [[RegExpMatcher]] with the options given. 30 | 31 | **`Example`** 32 | 33 | ```typescript 34 | // Use the options provided by the English preset. 35 | const matcher = new RegExpMatcher({ 36 | ...englishDataset.build(), 37 | ...englishRecommendedTransformers, 38 | }); 39 | ``` 40 | 41 | **`Example`** 42 | 43 | ```typescript 44 | // Simple matcher that only has blacklisted patterns. 45 | const matcher = new RegExpMatcher({ 46 | blacklistedTerms: assignIncrementingIds([ 47 | pattern`fuck`, 48 | pattern`f?uck`, // wildcards (?) 49 | pattern`bitch`, 50 | pattern`b[i]tch` // optionals ([i] matches either "i" or "") 51 | ]), 52 | }); 53 | 54 | // Check whether some string matches any of the patterns. 55 | const doesMatch = matcher.hasMatch('fuck you bitch'); 56 | ``` 57 | 58 | **`Example`** 59 | 60 | ```typescript 61 | // A more advanced example, with transformers and whitelisted terms. 62 | const matcher = new RegExpMatcher({ 63 | blacklistedTerms: [ 64 | { id: 1, pattern: pattern`penis` }, 65 | { id: 2, pattern: pattern`fuck` }, 66 | ], 67 | whitelistedTerms: ['pen is'], 68 | blacklistMatcherTransformers: [ 69 | resolveConfusablesTransformer(), // '🅰' => 'a' 70 | resolveLeetSpeakTransformer(), // '$' => 's' 71 | foldAsciiCharCaseTransformer(), // case insensitive matching 72 | skipNonAlphabeticTransformer(), // 'f.u...c.k' => 'fuck' 73 | collapseDuplicatesTransformer(), // 'aaaa' => 'a' 74 | ], 75 | }); 76 | 77 | // Output all matches. 
78 | console.log(matcher.getAllMatches('fu.....uuuuCK the pen is mightier than the sword!')); 79 | ``` 80 | 81 | #### Parameters 82 | 83 | | Name | Type | Description | 84 | | :------ | :------ | :------ | 85 | | `options` | [`RegExpMatcherOptions`](../interfaces/RegExpMatcherOptions.md) | Options to use. | 86 | 87 | #### Defined in 88 | 89 | [src/matcher/regexp/RegExpMatcher.ts:74](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L74) 90 | 91 | ## Methods 92 | 93 | ### getAllMatches 94 | 95 | ▸ **getAllMatches**(`input`, `sorted?`): [`MatchPayload`](../interfaces/MatchPayload.md)[] 96 | 97 | Returns all matches of blacklisted terms in the text. 98 | 99 | If you only need to check for the presence of a match, and do not need 100 | more specific information about the matches, use the `hasMatch()` method, 101 | which is typically more efficient. 102 | 103 | #### Parameters 104 | 105 | | Name | Type | Default value | Description | 106 | | :------ | :------ | :------ | :------ | 107 | | `input` | `string` | `undefined` | Text to find profanities in. | 108 | | `sorted` | `boolean` | `false` | Whether the resulting list of matches should be sorted using [[compareMatchByPositionAndId]]. Defaults to `false`. | 109 | 110 | #### Returns 111 | 112 | [`MatchPayload`](../interfaces/MatchPayload.md)[] 113 | 114 | A list of matches of the matcher on the text. The matches are 115 | guaranteed to be sorted if and only if the `sorted` parameter is `true`, 116 | otherwise, their order is unspecified. 
117 | 118 | #### Implementation of 119 | 120 | [Matcher](../interfaces/Matcher.md).[getAllMatches](../interfaces/Matcher.md#getallmatches) 121 | 122 | #### Defined in 123 | 124 | [src/matcher/regexp/RegExpMatcher.ts:87](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L87) 125 | 126 | ___ 127 | 128 | ### hasMatch 129 | 130 | ▸ **hasMatch**(`input`): `boolean` 131 | 132 | Checks whether there is a match for any blacklisted term in the text. 133 | 134 | This is typically more efficient than calling `getAllMatches` and 135 | checking the result, though it depends on the implementation. 136 | 137 | #### Parameters 138 | 139 | | Name | Type | Description | 140 | | :------ | :------ | :------ | 141 | | `input` | `string` | Text to check. | 142 | 143 | #### Returns 144 | 145 | `boolean` 146 | 147 | #### Implementation of 148 | 149 | [Matcher](../interfaces/Matcher.md).[hasMatch](../interfaces/Matcher.md#hasmatch) 150 | 151 | #### Defined in 152 | 153 | [src/matcher/regexp/RegExpMatcher.ts:120](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L120) 154 | -------------------------------------------------------------------------------- /docs/reference/classes/TextCensor.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / TextCensor 2 | 3 | # Class: TextCensor 4 | 5 | Censors regions of text matched by a [[Matcher]], supporting flexible 6 | [[TextCensorStrategy | censoring strategies]]. 
7 | 8 | ## Table of contents 9 | 10 | ### Constructors 11 | 12 | - [constructor](TextCensor.md#constructor) 13 | 14 | ### Methods 15 | 16 | - [applyTo](TextCensor.md#applyto) 17 | - [setStrategy](TextCensor.md#setstrategy) 18 | 19 | ## Constructors 20 | 21 | ### constructor 22 | 23 | • **new TextCensor**() 24 | 25 | ## Methods 26 | 27 | ### applyTo 28 | 29 | ▸ **applyTo**(`input`, `matches`): `string` 30 | 31 | Applies the censoring strategy to the text, returning the censored text. 32 | 33 | **Overlapping regions** 34 | 35 | Overlapping regions are an annoying edge case to deal with when censoring 36 | text. There is no single best way to handle them, but the implementation 37 | of this method guarantees that overlapping regions will always be 38 | replaced, following the rules below: 39 | 40 | - Replacement text for matched regions will be generated in the order 41 | specified by [[compareMatchByPositionAndId]]; 42 | - When generating replacements for regions that overlap at the start with 43 | some other region, the start index of the censor context passed to the 44 | censoring strategy will be the end index of the first region, plus one. 45 | 46 | #### Parameters 47 | 48 | | Name | Type | Description | 49 | | :------ | :------ | :------ | 50 | | `input` | `string` | Input text. | 51 | | `matches` | [`MatchPayload`](../interfaces/MatchPayload.md)[] | A list of matches. | 52 | 53 | #### Returns 54 | 55 | `string` 56 | 57 | The censored text. 58 | 59 | #### Defined in 60 | 61 | [src/censor/TextCensor.ts:66](https://github.com/jo3-l/obscenity/blob/0299b49/src/censor/TextCensor.ts#L66) 62 | 63 | ___ 64 | 65 | ### setStrategy 66 | 67 | ▸ **setStrategy**(`strategy`): [`TextCensor`](TextCensor.md) 68 | 69 | Sets the censoring strategy, which is responsible for generating 70 | replacement text for regions of the text that should be censored. 71 | 72 | The default censoring strategy is the [[grawlixCensorStrategy]], 73 | generating text like `$%@*`. 
There are several other built-in strategies 74 | available: 75 | - [[keepStartCensorStrategy]] - extends another strategy and keeps the 76 | first character matched, e.g. `f***`. 77 | - [[keepEndCensorStrategy]] - extends another strategy and keeps the last 78 | character matched, e.g. `***k`. 79 | - [[asteriskCensorStrategy]] - replaces the text with asterisks, e.g. 80 | `****`. 81 | - [[grawlixCensorStrategy]] - the default strategy, discussed earlier. 82 | 83 | Note that since censoring strategies are just functions (see the 84 | documentation for [[TextCensorStrategy]]), it is relatively simple to 85 | create your own. 86 | 87 | To ease creation of common censoring strategies, we provide a number of 88 | utility functions: 89 | - [[fixedPhraseCensorStrategy]] - generates a fixed phrase, e.g. `fudge`. 90 | - [[fixedCharCensorStrategy]] - generates replacement strings constructed 91 | from the character given, repeated as many times as needed. 92 | - [[randomCharFromSetCensorStrategy]] - generates replacement strings 93 | made up of random characters from the set of characters provided. 94 | 95 | #### Parameters 96 | 97 | | Name | Type | Description | 98 | | :------ | :------ | :------ | 99 | | `strategy` | [`TextCensorStrategy`](../README.md#textcensorstrategy) | Text censoring strategy to use. | 100 | 101 | #### Returns 102 | 103 | [`TextCensor`](TextCensor.md) 104 | 105 | #### Defined in 106 | 107 | [src/censor/TextCensor.ts:41](https://github.com/jo3-l/obscenity/blob/0299b49/src/censor/TextCensor.ts#L41) 108 | -------------------------------------------------------------------------------- /docs/reference/enums/SyntaxKind.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / SyntaxKind 2 | 3 | # Enumeration: SyntaxKind 4 | 5 | An enumeration of the kinds of nodes there are. 
6 | 7 | ## Table of contents 8 | 9 | ### Enumeration Members 10 | 11 | - [BoundaryAssertion](SyntaxKind.md#boundaryassertion) 12 | - [Literal](SyntaxKind.md#literal) 13 | - [Optional](SyntaxKind.md#optional) 14 | - [Wildcard](SyntaxKind.md#wildcard) 15 | 16 | ## Enumeration Members 17 | 18 | ### BoundaryAssertion 19 | 20 | • **BoundaryAssertion** = ``3`` 21 | 22 | #### Defined in 23 | 24 | [src/pattern/Nodes.ts:33](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L33) 25 | 26 | ___ 27 | 28 | ### Literal 29 | 30 | • **Literal** = ``2`` 31 | 32 | #### Defined in 33 | 34 | [src/pattern/Nodes.ts:32](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L32) 35 | 36 | ___ 37 | 38 | ### Optional 39 | 40 | • **Optional** = ``0`` 41 | 42 | #### Defined in 43 | 44 | [src/pattern/Nodes.ts:30](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L30) 45 | 46 | ___ 47 | 48 | ### Wildcard 49 | 50 | • **Wildcard** = ``1`` 51 | 52 | #### Defined in 53 | 54 | [src/pattern/Nodes.ts:31](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L31) 55 | -------------------------------------------------------------------------------- /docs/reference/interfaces/BlacklistedTerm.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / BlacklistedTerm 2 | 3 | # Interface: BlacklistedTerm 4 | 5 | Represents a blacklisted term. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [id](BlacklistedTerm.md#id) 12 | - [pattern](BlacklistedTerm.md#pattern) 13 | 14 | ## Properties 15 | 16 | ### id 17 | 18 | • **id**: `number` 19 | 20 | The identifier of the pattern; should be unique across all patterns. 
21 | 22 | #### Defined in 23 | 24 | [src/matcher/BlacklistedTerm.ts:10](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/BlacklistedTerm.ts#L10) 25 | 26 | ___ 27 | 28 | ### pattern 29 | 30 | • **pattern**: [`ParsedPattern`](ParsedPattern.md) 31 | 32 | The parsed pattern. 33 | 34 | #### Defined in 35 | 36 | [src/matcher/BlacklistedTerm.ts:15](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/BlacklistedTerm.ts#L15) 37 | -------------------------------------------------------------------------------- /docs/reference/interfaces/BoundaryAssertionNode.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / BoundaryAssertionNode 2 | 3 | # Interface: BoundaryAssertionNode 4 | 5 | A boundary assertion node. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [kind](BoundaryAssertionNode.md#kind) 12 | 13 | ## Properties 14 | 15 | ### kind 16 | 17 | • **kind**: [`BoundaryAssertion`](../enums/SyntaxKind.md#boundaryassertion) 18 | 19 | #### Defined in 20 | 21 | [src/pattern/Nodes.ts:72](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L72) 22 | -------------------------------------------------------------------------------- /docs/reference/interfaces/CollapseDuplicatesTransformerOptions.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / CollapseDuplicatesTransformerOptions 2 | 3 | # Interface: CollapseDuplicatesTransformerOptions 4 | 5 | Options for the collapse duplicates transformer. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [customThresholds](CollapseDuplicatesTransformerOptions.md#customthresholds) 12 | - [defaultThreshold](CollapseDuplicatesTransformerOptions.md#defaultthreshold) 13 | 14 | ## Properties 15 | 16 | ### customThresholds 17 | 18 | • `Optional` **customThresholds**: `Map`<`string`, `number`\> 19 | 20 | Custom thresholds for characters. 
If a character has an entry 21 | corresponding to it, the value of the entry will be used as the maximum 22 | length of character runs comprised of said character before they are 23 | collapsed. 24 | 25 | The intended use-case for this option is for characters which appear 26 | more than once in a row in patterns. For example, the word `book` has 27 | two `o`s in a row, and matches `book`. With this transformer, though, 28 | the text `book` would become `bok`, meaning that the pattern `book` would no longer match it. 29 | The fix would be to add an entry corresponding to `o` that overrides its 30 | threshold to be `2`, with the effect of leaving `book` unchanged. 31 | 32 | **`Default`** 33 | 34 | ```ts 35 | new Map() 36 | ``` 37 | 38 | #### Defined in 39 | 40 | [src/transformer/collapse-duplicates/index.ts:91](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L91) 41 | 42 | ___ 43 | 44 | ### defaultThreshold 45 | 46 | • `Optional` **defaultThreshold**: `number` 47 | 48 | The maximum number of characters in a run that will be accepted before 49 | they will be collapsed. 50 | 51 | For example, if this value was `2`, `aa` would stay the same but `aaa` 52 | would be transformed to `aa`. 53 | 54 | **`Default`** 55 | 56 | ```ts 57 | 1 58 | ``` 59 | 60 | #### Defined in 61 | 62 | [src/transformer/collapse-duplicates/index.ts:102](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L102) 63 | -------------------------------------------------------------------------------- /docs/reference/interfaces/LiteralNode.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / LiteralNode 2 | 3 | # Interface: LiteralNode 4 | 5 | A literal node. 
6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [chars](LiteralNode.md#chars) 12 | - [kind](LiteralNode.md#kind) 13 | 14 | ## Properties 15 | 16 | ### chars 17 | 18 | • **chars**: `number`[] 19 | 20 | The code points that this literal matches. 21 | 22 | #### Defined in 23 | 24 | [src/pattern/Nodes.ts:63](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L63) 25 | 26 | ___ 27 | 28 | ### kind 29 | 30 | • **kind**: [`Literal`](../enums/SyntaxKind.md#literal) 31 | 32 | #### Defined in 33 | 34 | [src/pattern/Nodes.ts:65](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L65) 35 | -------------------------------------------------------------------------------- /docs/reference/interfaces/MatchPayload.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / MatchPayload 2 | 3 | # Interface: MatchPayload 4 | 5 | Information emitted on a successful match. 6 | 7 | If you require more information about matches than what is provided here, see 8 | the [[DataSet]] class, which supports associating metadata with patterns. 9 | 10 | ## Table of contents 11 | 12 | ### Properties 13 | 14 | - [endIndex](MatchPayload.md#endindex) 15 | - [matchLength](MatchPayload.md#matchlength) 16 | - [startIndex](MatchPayload.md#startindex) 17 | - [termId](MatchPayload.md#termid) 18 | 19 | ## Properties 20 | 21 | ### endIndex 22 | 23 | • **endIndex**: `number` 24 | 25 | End index of the match, inclusive. 26 | 27 | If the last character of the pattern is a surrogate pair, 28 | then this points to the index of the low surrogate. 29 | 30 | #### Defined in 31 | 32 | [src/matcher/MatchPayload.ts:16](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L16) 33 | 34 | ___ 35 | 36 | ### matchLength 37 | 38 | • **matchLength**: `number` 39 | 40 | Total number of code points that matched. 
41 | 42 | #### Defined in 43 | 44 | [src/matcher/MatchPayload.ts:21](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L21) 45 | 46 | ___ 47 | 48 | ### startIndex 49 | 50 | • **startIndex**: `number` 51 | 52 | Start index of the match, inclusive. 53 | 54 | #### Defined in 55 | 56 | [src/matcher/MatchPayload.ts:26](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L26) 57 | 58 | ___ 59 | 60 | ### termId 61 | 62 | • **termId**: `number` 63 | 64 | ID of the blacklisted term that matched. 65 | 66 | #### Defined in 67 | 68 | [src/matcher/MatchPayload.ts:31](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/MatchPayload.ts#L31) 69 | -------------------------------------------------------------------------------- /docs/reference/interfaces/Matcher.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / Matcher 2 | 3 | # Interface: Matcher 4 | 5 | Searches for blacklisted terms in text, ignoring parts matched by whitelisted 6 | terms. 7 | 8 | See: 9 | - [[RegExpMatcher]] for an implementation using regular expressions. 10 | 11 | ## Implemented by 12 | 13 | - [`RegExpMatcher`](../classes/RegExpMatcher.md) 14 | 15 | ## Table of contents 16 | 17 | ### Methods 18 | 19 | - [getAllMatches](Matcher.md#getallmatches) 20 | - [hasMatch](Matcher.md#hasmatch) 21 | 22 | ## Methods 23 | 24 | ### getAllMatches 25 | 26 | ▸ **getAllMatches**(`input`, `sorted?`): [`MatchPayload`](MatchPayload.md)[] 27 | 28 | Returns all matches of blacklisted terms in the text. 29 | 30 | If you only need to check for the presence of a match, and do not need 31 | more specific information about the matches, use the `hasMatch()` method, 32 | which is typically more efficient. 33 | 34 | #### Parameters 35 | 36 | | Name | Type | Description | 37 | | :------ | :------ | :------ | 38 | | `input` | `string` | Text to find profanities in. 
| 39 | | `sorted?` | `boolean` | Whether the resulting list of matches should be sorted using [[compareMatchByPositionAndId]]. Defaults to `false`. | 40 | 41 | #### Returns 42 | 43 | [`MatchPayload`](MatchPayload.md)[] 44 | 45 | A list of matches of the matcher on the text. The matches are 46 | guaranteed to be sorted if and only if the `sorted` parameter is `true`, 47 | otherwise, their order is unspecified. 48 | 49 | #### Defined in 50 | 51 | [src/matcher/Matcher.ts:25](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/Matcher.ts#L25) 52 | 53 | ___ 54 | 55 | ### hasMatch 56 | 57 | ▸ **hasMatch**(`input`): `boolean` 58 | 59 | Checks whether there is a match for any blacklisted term in the text. 60 | 61 | This is typically more efficient than calling `getAllMatches` and 62 | checking the result, though it depends on the implementation. 63 | 64 | #### Parameters 65 | 66 | | Name | Type | Description | 67 | | :------ | :------ | :------ | 68 | | `input` | `string` | Text to check. | 69 | 70 | #### Returns 71 | 72 | `boolean` 73 | 74 | #### Defined in 75 | 76 | [src/matcher/Matcher.ts:35](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/Matcher.ts#L35) 77 | -------------------------------------------------------------------------------- /docs/reference/interfaces/OptionalNode.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / OptionalNode 2 | 3 | # Interface: OptionalNode 4 | 5 | An optional node. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [childNode](OptionalNode.md#childnode) 12 | - [kind](OptionalNode.md#kind) 13 | 14 | ## Properties 15 | 16 | ### childNode 17 | 18 | • **childNode**: [`LiteralNode`](LiteralNode.md) \| [`WildcardNode`](WildcardNode.md) 19 | 20 | The node contained within the optional expression. For `[abc]`, this 21 | would be a literal node with the value `abc`. 
22 | 23 | #### Defined in 24 | 25 | [src/pattern/Nodes.ts:44](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L44) 26 | 27 | ___ 28 | 29 | ### kind 30 | 31 | • **kind**: [`Optional`](../enums/SyntaxKind.md#optional) 32 | 33 | #### Defined in 34 | 35 | [src/pattern/Nodes.ts:46](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L46) 36 | -------------------------------------------------------------------------------- /docs/reference/interfaces/ParsedPattern.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / ParsedPattern 2 | 3 | # Interface: ParsedPattern 4 | 5 | A parsed pattern. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [nodes](ParsedPattern.md#nodes) 12 | - [requireWordBoundaryAtEnd](ParsedPattern.md#requirewordboundaryatend) 13 | - [requireWordBoundaryAtStart](ParsedPattern.md#requirewordboundaryatstart) 14 | 15 | ## Properties 16 | 17 | ### nodes 18 | 19 | • **nodes**: [`Node`](../README.md#node)[] 20 | 21 | A list of nodes which make up the pattern. 22 | 23 | #### Defined in 24 | 25 | [src/pattern/Nodes.ts:8](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L8) 26 | 27 | ___ 28 | 29 | ### requireWordBoundaryAtEnd 30 | 31 | • **requireWordBoundaryAtEnd**: `boolean` 32 | 33 | Whether the pattern requires a word boundary at the end. 34 | 35 | #### Defined in 36 | 37 | [src/pattern/Nodes.ts:13](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L13) 38 | 39 | ___ 40 | 41 | ### requireWordBoundaryAtStart 42 | 43 | • **requireWordBoundaryAtStart**: `boolean` 44 | 45 | Whether the pattern requires a word boundary at the start. 
46 | 47 | #### Defined in 48 | 49 | [src/pattern/Nodes.ts:18](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L18) 50 | -------------------------------------------------------------------------------- /docs/reference/interfaces/PhraseContainer.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / PhraseContainer 2 | 3 | # Interface: PhraseContainer 4 | 5 | Represents a phrase. 6 | 7 | ## Type parameters 8 | 9 | | Name | 10 | | :------ | 11 | | `MetadataType` | 12 | 13 | ## Table of contents 14 | 15 | ### Properties 16 | 17 | - [metadata](PhraseContainer.md#metadata) 18 | - [patterns](PhraseContainer.md#patterns) 19 | - [whitelistedTerms](PhraseContainer.md#whitelistedterms) 20 | 21 | ## Properties 22 | 23 | ### metadata 24 | 25 | • `Optional` **metadata**: `MetadataType` 26 | 27 | Metadata associated with this phrase. 28 | 29 | #### Defined in 30 | 31 | [src/dataset/DataSet.ts:204](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L204) 32 | 33 | ___ 34 | 35 | ### patterns 36 | 37 | • **patterns**: [`ParsedPattern`](ParsedPattern.md)[] 38 | 39 | Patterns associated with this phrase. 40 | 41 | #### Defined in 42 | 43 | [src/dataset/DataSet.ts:209](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L209) 44 | 45 | ___ 46 | 47 | ### whitelistedTerms 48 | 49 | • **whitelistedTerms**: `string`[] 50 | 51 | Whitelisted terms associated with this phrase. 
52 | 53 | #### Defined in 54 | 55 | [src/dataset/DataSet.ts:214](https://github.com/jo3-l/obscenity/blob/0299b49/src/dataset/DataSet.ts#L214) 56 | -------------------------------------------------------------------------------- /docs/reference/interfaces/ProcessedCollapseDuplicatesTransformerOptions.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / ProcessedCollapseDuplicatesTransformerOptions 2 | 3 | # Interface: ProcessedCollapseDuplicatesTransformerOptions 4 | 5 | ## Table of contents 6 | 7 | ### Properties 8 | 9 | - [customThresholds](ProcessedCollapseDuplicatesTransformerOptions.md#customthresholds) 10 | - [defaultThreshold](ProcessedCollapseDuplicatesTransformerOptions.md#defaultthreshold) 11 | 12 | ## Properties 13 | 14 | ### customThresholds 15 | 16 | • **customThresholds**: `Map`<`number`, `number`\> 17 | 18 | #### Defined in 19 | 20 | [src/transformer/collapse-duplicates/index.ts:68](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L68) 21 | 22 | ___ 23 | 24 | ### defaultThreshold 25 | 26 | • **defaultThreshold**: `number` 27 | 28 | #### Defined in 29 | 30 | [src/transformer/collapse-duplicates/index.ts:69](https://github.com/jo3-l/obscenity/blob/0299b49/src/transformer/collapse-duplicates/index.ts#L69) 31 | -------------------------------------------------------------------------------- /docs/reference/interfaces/RegExpMatcherOptions.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / RegExpMatcherOptions 2 | 3 | # Interface: RegExpMatcherOptions 4 | 5 | Options for the [[RegExpMatcher]]. 
6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [blacklistMatcherTransformers](RegExpMatcherOptions.md#blacklistmatchertransformers) 12 | - [blacklistedTerms](RegExpMatcherOptions.md#blacklistedterms) 13 | - [whitelistMatcherTransformers](RegExpMatcherOptions.md#whitelistmatchertransformers) 14 | - [whitelistedTerms](RegExpMatcherOptions.md#whitelistedterms) 15 | 16 | ## Properties 17 | 18 | ### blacklistMatcherTransformers 19 | 20 | • `Optional` **blacklistMatcherTransformers**: `TransformerContainer`[] 21 | 22 | A set of transformers that should be applied to the input text before 23 | blacklisted patterns are matched. This does not affect the matching of 24 | whitelisted terms. 25 | 26 | Transformers will be applied in the order they appear. 27 | 28 | **`Default`** 29 | 30 | ```ts 31 | [] 32 | ``` 33 | 34 | #### Defined in 35 | 36 | [src/matcher/regexp/RegExpMatcher.ts:229](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L229) 37 | 38 | ___ 39 | 40 | ### blacklistedTerms 41 | 42 | • **blacklistedTerms**: [`BlacklistedTerm`](BlacklistedTerm.md)[] 43 | 44 | A list of blacklisted terms. 45 | 46 | #### Defined in 47 | 48 | [src/matcher/regexp/RegExpMatcher.ts:234](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L234) 49 | 50 | ___ 51 | 52 | ### whitelistMatcherTransformers 53 | 54 | • `Optional` **whitelistMatcherTransformers**: `TransformerContainer`[] 55 | 56 | A set of transformers that should be applied to the input text before 57 | whitelisted terms are matched. This does not affect the matching of 58 | blacklisted terms. 59 | 60 | Transformers will be applied in the order they appear. 
61 | 62 | **`Default`** 63 | 64 | ```ts 65 | [] 66 | ``` 67 | 68 | #### Defined in 69 | 70 | [src/matcher/regexp/RegExpMatcher.ts:245](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L245) 71 | 72 | ___ 73 | 74 | ### whitelistedTerms 75 | 76 | • `Optional` **whitelistedTerms**: `string`[] 77 | 78 | A list of whitelisted terms. If a whitelisted term matches some part of 79 | the text, a match of a blacklisted pattern within that part of the text 80 | will not be emitted. 81 | 82 | For example, if we had a pattern `penis` and a whitelisted term `pen is`, 83 | only no matches would be reported for the input text `the pen is mightier 84 | than the sword.` 85 | 86 | **`Default`** 87 | 88 | ```ts 89 | [] 90 | ``` 91 | 92 | #### Defined in 93 | 94 | [src/matcher/regexp/RegExpMatcher.ts:258](https://github.com/jo3-l/obscenity/blob/0299b49/src/matcher/regexp/RegExpMatcher.ts#L258) 95 | -------------------------------------------------------------------------------- /docs/reference/interfaces/WildcardNode.md: -------------------------------------------------------------------------------- 1 | [obscenity](../README.md) / WildcardNode 2 | 3 | # Interface: WildcardNode 4 | 5 | A wildcard node. 6 | 7 | ## Table of contents 8 | 9 | ### Properties 10 | 11 | - [kind](WildcardNode.md#kind) 12 | 13 | ## Properties 14 | 15 | ### kind 16 | 17 | • **kind**: [`Wildcard`](../enums/SyntaxKind.md#wildcard) 18 | 19 | #### Defined in 20 | 21 | [src/pattern/Nodes.ts:53](https://github.com/jo3-l/obscenity/blob/0299b49/src/pattern/Nodes.ts#L53) 22 | -------------------------------------------------------------------------------- /examples/extending-datasets.js: -------------------------------------------------------------------------------- 1 | // Add a new word to the English dataset and removing an existing one. 2 | 3 | // Import what we need from Obscenity. 
4 | const { RegExpMatcher, DataSet, englishDataset, englishRecommendedTransformers, pattern } = require('../dist'); 5 | 6 | // Create a new dataset. 7 | const myDataset = new DataSet() 8 | // Add all the data from the english dataset into our new one. 9 | .addAll(englishDataset) 10 | // Remove "fuck" and all its variants. 11 | .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck') 12 | // Add "simp". 13 | .addPhrase((phrase) => 14 | phrase 15 | .setMetadata({ originalWord: 'simp' }) 16 | .addPattern(pattern`simp`) 17 | .addWhitelistedTerm('simple'), 18 | ); 19 | 20 | // Use our new dataset. 21 | const matcher = new RegExpMatcher({ 22 | ...myDataset, 23 | ...englishRecommendedTransformers, 24 | }); 25 | 26 | console.log(matcher.hasMatch('simp')); 27 | -------------------------------------------------------------------------------- /examples/repl.js: -------------------------------------------------------------------------------- 1 | // A REPL where you can enter text and see whether Obscenity matches on it with 2 | // its English preset. 3 | 4 | // Import the REPL built-in package. 5 | const repl = require('repl'); 6 | // Import what we need from Obscenity. 7 | const { RegExpMatcher, englishDataset, englishRecommendedTransformers } = require('../dist'); 8 | 9 | // Create our matcher, using the English preset. 10 | const matcher = new RegExpMatcher({ 11 | ...englishDataset.build(), 12 | ...englishRecommendedTransformers, 13 | }); 14 | 15 | // Display a nice welcome message. 16 | console.log(`Welcome to the REPL example for Obscenity. 17 | Type ".help" for more information.`); 18 | 19 | // Start our REPL server. 20 | const replServer = repl.start({ 21 | prompt: '> ', 22 | eval(input, _ctx, _file, cb) { 23 | // Get all matches of blacklisted terms in the input. We pass 'true' to 24 | // getAllMatches() so the output is sorted (easier to read). 
25 | const matches = matcher 26 | .getAllMatches(input, true) 27 | // Add some additional metadata about the phrases that were matched. 28 | .map((match) => englishDataset.getPayloadWithPhraseMetadata(match)); 29 | 30 | // Return the matches to the REPL server. 31 | cb(undefined, matches); 32 | }, 33 | }); 34 | 35 | // Overwrite the default help command. 36 | replServer.defineCommand('help', { 37 | help: 'View a help message', 38 | action() { 39 | console.log(`To try out Obscenity with the English preset, simply type a phrase. 40 | Obscene words found in the input will be displayed when you click enter. 41 | 42 | Press ^D to exit the REPL`); 43 | this.displayPrompt(); 44 | }, 45 | }); 46 | 47 | // Override some special commands that aren't useful for this example. 48 | function invalidCommand() { 49 | console.log('Invalid REPL keyword'); 50 | this.displayPrompt(); 51 | } 52 | 53 | replServer.defineCommand('save', { action: invalidCommand }); 54 | replServer.defineCommand('load', { action: invalidCommand }); 55 | -------------------------------------------------------------------------------- /jest.config.ts: -------------------------------------------------------------------------------- 1 | import type { Config } from '@jest/types'; 2 | 3 | const config: Config.InitialOptions = { 4 | preset: 'ts-jest', 5 | testEnvironment: 'node', 6 | testRunner: 'jest-circus/runner', 7 | testMatch: ['/test/**/*.test.ts'], 8 | transform: { 9 | // eslint-disable-next-line @typescript-eslint/naming-convention 10 | '^.+\\.ts$': [ 11 | 'ts-jest', 12 | { 13 | tsconfig: '/test/tsconfig.json', 14 | }, 15 | ], 16 | }, 17 | collectCoverage: true, 18 | collectCoverageFrom: ['/src/**/*.ts'], 19 | coverageDirectory: 'coverage', 20 | coverageReporters: ['text', 'lcov', 'clover'], 21 | coveragePathIgnorePatterns: [ 22 | '/src/index\\.ts', // library entry point 23 | '/src/preset/.*\\.ts', // presets 24 | ], 25 | setupFilesAfterEnv: ['/test/jest.setup.ts'], 26 | }; 27 | 28 | export default 
config; 29 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "obscenity", 3 | "version": "0.4.3", 4 | "description": "Robust, extensible profanity filter.", 5 | "files": [ 6 | "/dist", 7 | "!/dist/*.tsbuildinfo" 8 | ], 9 | "main": "./dist/index.js", 10 | "module": "./dist/index.mjs", 11 | "types": "./dist/index.d.ts", 12 | "exports": { 13 | "import": { 14 | "types": "./dist/index.d.ts", 15 | "default": "./dist/index.mjs" 16 | }, 17 | "require": { 18 | "types": "./dist/index.d.ts", 19 | "default": "./dist/index.js" 20 | } 21 | }, 22 | "scripts": { 23 | "build": "rimraf dist && tsc -b src && gen-esm-wrapper dist/index.js dist/index.mjs", 24 | "build:benchmarks": "tsc -b benchmarks", 25 | "build:docs": "rimraf docs/references && typedoc --plugin typedoc-plugin-markdown", 26 | "test": "jest", 27 | "test:watch": "jest --watch", 28 | "test:ci": "jest --ci --no-stack-trace --verbose", 29 | "style": "prettier --write src/**/*.ts test/**/*.ts", 30 | "lint": "eslint src test", 31 | "lint:fix": "eslint src test --fix", 32 | "release": "git checkout main && git pull origin main && pnpm i && pnpm lint && pnpm test && pnpm build && pnpm build:docs && git add -A && standard-version -a", 33 | "release:tags": "git push --follow-tags origin main", 34 | "release:github": "conventional-github-releaser -p angular", 35 | "release:publish": "pnpm publish --access public" 36 | }, 37 | "repository": { 38 | "type": "git", 39 | "url": "https://github.com/jo3-l/obscenity.git" 40 | }, 41 | "bugs": { 42 | "url": "https://github.com/jo3-l/obscenity/issues" 43 | }, 44 | "keywords": [ 45 | "profanity", 46 | "profane", 47 | "obscenities", 48 | "obscenity", 49 | "obscene", 50 | "filter", 51 | "curse", 52 | "swear", 53 | "swearing", 54 | "vulgar", 55 | "vulgarity", 56 | "bad-words", 57 | "badwords", 58 | "cuss", 59 | "cussing" 60 | ], 61 | "homepage": 
"https://github.com/jo3-l/obscenity#readme", 62 | "author": "Joe L. ", 63 | "license": "MIT", 64 | "devDependencies": { 65 | "@commitlint/cli": "^18.0.0", 66 | "@commitlint/config-angular": "^18.0.0", 67 | "@jest/types": "^29.5.0", 68 | "@types/jest": "^29.5.2", 69 | "@typescript-eslint/eslint-plugin": "^8.0.0", 70 | "@typescript-eslint/parser": "^8.0.0", 71 | "conventional-github-releaser": "^3.1.5", 72 | "eslint": "^8.57.0", 73 | "eslint-config-prettier": "^10.0.0", 74 | "eslint-plugin-jest": "^27.9.0", 75 | "eslint-plugin-prettier": "^4.2.1", 76 | "fast-check": "^2.25.0", 77 | "gen-esm-wrapper": "^1.1.3", 78 | "is-ci": "^4.0.0", 79 | "jest": "^29.7.0", 80 | "jest-circus": "^29.5.0", 81 | "prettier": "^2.8.8", 82 | "rimraf": "^6.0.0", 83 | "standard-version": "^9.5.0", 84 | "ts-jest": "^29.1.1", 85 | "ts-node": "^10.9.1", 86 | "typedoc": "^0.25.0", 87 | "typedoc-plugin-markdown": "^3.15.3", 88 | "typescript": "^5.2.2" 89 | }, 90 | "engines": { 91 | "node": ">=14.0.0" 92 | }, 93 | "packageManager": "pnpm@9.15.4" 94 | } 95 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | ":semanticCommits", 5 | ":semanticPrefixChore", 6 | ":preserveSemverRanges", 7 | ":rebaseStalePrs", 8 | ":label(deps)" 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /scripts/search-words.js: -------------------------------------------------------------------------------- 1 | const repl = require('repl'); 2 | const fs = require('fs'); 3 | const { join } = require('path'); 4 | 5 | const words = fs.readFileSync(join(__dirname, 'english-words.txt'), { encoding: 'utf8' }).split('\n'); 6 | 7 | repl.start({ 8 | prompt: '> ', 9 | eval: (cmd, _, __, cb) => { 10 | cmd = cmd.trim(); 11 | let prefixAnchor = cmd.startsWith('^'); 12 | if (prefixAnchor) cmd = cmd.slice(1); 13 | 14 | let 
suffixAnchor = cmd.endsWith('$'); 15 | if (suffixAnchor) cmd = cmd.slice(0, -1); 16 | 17 | const result = []; 18 | for (let i = 0; i < words.length; i++) { 19 | words[i] = words[i].trim(); 20 | const word = words[i]; 21 | let ok = false; 22 | if (prefixAnchor) ok = word.startsWith(cmd); 23 | else if (suffixAnchor) ok = word.endsWith(cmd); 24 | else ok = word.includes(cmd); 25 | if (ok) result.push(word); 26 | } 27 | 28 | cb(undefined, result); 29 | }, 30 | writer: (output) => { 31 | if (output.length === 0) return 'No words found matching the query given.'; 32 | return `${output.length} words found:\n\n${output.join('\n')}`; 33 | }, 34 | }); 35 | -------------------------------------------------------------------------------- /src/censor/BuiltinStrategies.ts: -------------------------------------------------------------------------------- 1 | import { getAndAssertSingleCodePoint } from '../util/Char'; 2 | import type { CensorContext, TextCensorStrategy } from './TextCensor'; 3 | 4 | /** 5 | * A text censoring strategy that extends another strategy, adding the first 6 | * character matched at the start of the generated string. 7 | * 8 | * @example 9 | * ```typescript 10 | * const strategy = keepStartCensorStrategy(grawlixCensorStrategy()); 11 | * const censor = new TextCensor().setStrategy(strategy); 12 | * // Before: 'fuck you' 13 | * // After: 'f$@* you' 14 | * ``` 15 | * @example 16 | * ```typescript 17 | * // Since keepEndCensorStrategy() returns another text censoring strategy, you can use it 18 | * // as the base strategy to pass to keepStartCensorStrategy(). 19 | * const strategy = keepStartCensorStrategy(keepEndCensorStrategy(asteriskCensorStrategy())); 20 | * const censor = new TextCensor().setStrategy(strategy); 21 | * // Before: 'fuck you' 22 | * // After: 'f**k you' 23 | * ``` 24 | * @param baseStrategy - Strategy to extend. It will be used to produce the end of 25 | * the generated string. 
26 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 27 | */ 28 | export function keepStartCensorStrategy(baseStrategy: TextCensorStrategy): TextCensorStrategy { 29 | return (ctx: CensorContext) => { 30 | if (ctx.overlapsAtStart) return baseStrategy(ctx); 31 | const firstChar = String.fromCodePoint(ctx.input.codePointAt(ctx.startIndex)!); 32 | return firstChar + baseStrategy({ ...ctx, matchLength: ctx.matchLength - 1 }); 33 | }; 34 | } 35 | 36 | /** 37 | * A text censoring strategy that extends another strategy, adding the last 38 | * character matched at the end of the generated string. 39 | * 40 | * @example 41 | * ```typescript 42 | * const strategy = keepEndCensorStrategy(asteriskCensorStrategy()); 43 | * const censor = new TextCensor().setStrategy(strategy); 44 | * // Before: 'fuck you' 45 | * // After: '***k you' 46 | * ``` 47 | * @param baseStrategy - Strategy to extend. It will be used to produce the start 48 | * of the generated string. 49 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 50 | */ 51 | export function keepEndCensorStrategy(baseStrategy: TextCensorStrategy): TextCensorStrategy { 52 | return (ctx: CensorContext) => { 53 | if (ctx.overlapsAtEnd) return baseStrategy(ctx); 54 | const lastChar = String.fromCodePoint(ctx.input.codePointAt(ctx.endIndex)!); 55 | return baseStrategy({ ...ctx, matchLength: ctx.matchLength - 1 }) + lastChar; 56 | }; 57 | } 58 | 59 | /** 60 | * A text censoring strategy that generates strings made up of asterisks (`*`). 61 | * 62 | * @example 63 | * ```typescript 64 | * const strategy = asteriskCensorStrategy(); 65 | * const censor = new TextCensor().setStrategy(strategy); 66 | * // Before: 'fuck you' 67 | * // After: '**** you' 68 | * ``` 69 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 
70 | */ 71 | export function asteriskCensorStrategy() { 72 | return fixedCharCensorStrategy('*'); 73 | } 74 | 75 | /** 76 | * A text censoring strategy that generates 77 | * [grawlix](https://www.merriam-webster.com/words-at-play/grawlix-symbols-swearing-comic-strips), 78 | * i.e. strings that contain the characters `%`, `@`, `$`, `&`, and `*`. 79 | * 80 | * @example 81 | * ```typescript 82 | * const strategy = grawlixCensorStrategy(); 83 | * const censor = new TextCensor().setStrategy(strategy); 84 | * // Before: 'fuck you' 85 | * // After: '%@&* you' 86 | * ``` 87 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 88 | */ 89 | export function grawlixCensorStrategy() { 90 | return randomCharFromSetCensorStrategy('%@$&*'); 91 | } 92 | 93 | /** 94 | * A text censoring strategy that returns a fixed string. 95 | * 96 | * @example 97 | * ```typescript 98 | * // The replacement phrase '' effectively removes all matched regions 99 | * // from the string. 100 | * const strategy = fixedPhraseCensorStrategy(''); 101 | * const censor = new TextCensor().setStrategy(strategy); 102 | * // Before: 'fuck you' 103 | * // After: ' you' 104 | * ``` 105 | * @example 106 | * ```typescript 107 | * const strategy = fixedPhraseCensorStrategy('fudge'); 108 | * const censor = new TextCensor().setStrategy(strategy); 109 | * // Before: 'fuck you' 110 | * // After: 'fudge you' 111 | * ``` 112 | * @param phrase - Replacement phrase to use. 113 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 114 | */ 115 | export function fixedPhraseCensorStrategy(phrase: string): TextCensorStrategy { 116 | return () => phrase; 117 | } 118 | 119 | /** 120 | * A text censoring strategy that generates replacement strings that are made up 121 | * of the character given, repeated as many times as needed. 
122 | * 123 | * @example 124 | * ```typescript 125 | * const strategy = fixedCharCensorStrategy('*'); 126 | * const censor = new TextCensor().setStrategy(strategy); 127 | * // Before: 'fuck you' 128 | * // After: '**** you'. 129 | * ``` 130 | * @param char - String that represents the code point which should be used when 131 | * generating the replacement string. Must be exactly one code point in length. 132 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 133 | */ 134 | export function fixedCharCensorStrategy(char: string): TextCensorStrategy { 135 | // Make sure the input character is one code point in length. 136 | getAndAssertSingleCodePoint(char); 137 | return (ctx: CensorContext) => char.repeat(ctx.matchLength); 138 | } 139 | 140 | /** 141 | * A text censoring strategy that generates replacement strings made up of 142 | * random characters from the set of characters provided. The strings never 143 | * contain two of the same character in a row. 144 | * 145 | * @example 146 | * ```typescript 147 | * const strategy = randomCharFromSetCensorStrategy('$#!'); 148 | * const censor = new TextCensor().setStrategy(strategy); 149 | * // Before: 'fuck you!' 150 | * // After: '!#$# you!' 151 | * ``` 152 | * @param charset - Set of characters from which the replacement string should 153 | * be constructed. Must have at least two characters. 154 | * @returns A [[TextCensorStrategy]] for use with the [[TextCensor]]. 
155 | */ 156 | export function randomCharFromSetCensorStrategy(charset: string): TextCensorStrategy { 157 | const chars = [...charset]; 158 | if (chars.length < 2) throw new Error('The character set passed must have at least 2 characters.'); 159 | return (ctx: CensorContext) => { 160 | if (ctx.matchLength === 0) return ''; 161 | 162 | let lastIdx = Math.floor(Math.random() * chars.length); 163 | let censored = chars[lastIdx]; 164 | for (let i = 1; i < ctx.matchLength; i++) { 165 | let idx = Math.floor(Math.random() * (chars.length - 1)); 166 | // Transform the distribution for idx from [0, len-1) to 167 | // [0, lastIdx) ∪ (lastIdx, len) to exclude lastIdx while 168 | // ensuring a uniform distribution of generated characters. 169 | if (idx >= lastIdx) idx++; 170 | lastIdx = idx; 171 | censored += chars[idx]; 172 | } 173 | return censored; 174 | }; 175 | } 176 | -------------------------------------------------------------------------------- /src/censor/TextCensor.ts: -------------------------------------------------------------------------------- 1 | import type { MatchPayload } from '../matcher/MatchPayload'; 2 | import { compareMatchByPositionAndId } from '../matcher/MatchPayload'; 3 | import { grawlixCensorStrategy } from './BuiltinStrategies'; 4 | 5 | /** 6 | * Censors regions of text matched by a [[Matcher]], supporting flexible 7 | * [[TextCensorStrategy | censoring strategies]]. 8 | */ 9 | export class TextCensor { 10 | private strategy: TextCensorStrategy = grawlixCensorStrategy(); 11 | 12 | /** 13 | * Sets the censoring strategy, which is responsible for generating 14 | * replacement text for regions of the text that should be censored. 15 | * 16 | * The default censoring strategy is the [[grawlixCensorStrategy]], 17 | * generating text like `$%@*`. There are several other built-in strategies 18 | * available: 19 | * - [[keepStartCensorStrategy]] - extends another strategy and keeps the 20 | * first character matched, e.g. `f***`. 
21 | * - [[keepEndCensorStrategy]] - extends another strategy and keeps the last 22 | * character matched, e.g. `***k`. 23 | * - [[asteriskCensorStrategy]] - replaces the text with asterisks, e.g. 24 | * `****`. 25 | * - [[grawlixCensorStrategy]] - the default strategy, discussed earlier. 26 | * 27 | * Note that since censoring strategies are just functions (see the 28 | * documentation for [[TextCensorStrategy]]), it is relatively simple to 29 | * create your own. 30 | * 31 | * To ease creation of common censoring strategies, we provide a number of 32 | * utility functions: 33 | * - [[fixedPhraseCensorStrategy]] - generates a fixed phrase, e.g. `fudge`. 34 | * - [[fixedCharCensorStrategy]] - generates replacement strings constructed 35 | * from the character given, repeated as many times as needed. 36 | * - [[randomCharFromSetCensorStrategy]] - generates replacement strings 37 | * made up of random characters from the set of characters provided. 38 | * 39 | * @param strategy - Text censoring strategy to use. 40 | */ 41 | public setStrategy(strategy: TextCensorStrategy) { 42 | this.strategy = strategy; 43 | return this; 44 | } 45 | 46 | /** 47 | * Applies the censoring strategy to the text, returning the censored text. 48 | * 49 | * **Overlapping regions** 50 | * 51 | * Overlapping regions are an annoying edge case to deal with when censoring 52 | * text. There is no single best way to handle them, but the implementation 53 | * of this method guarantees that overlapping regions will always be 54 | * replaced, following the rules below: 55 | * 56 | * - Replacement text for matched regions will be generated in the order 57 | * specified by [[compareMatchByPositionAndId]]; 58 | * - When generating replacements for regions that overlap at the start with 59 | * some other region, the start index of the censor context passed to the 60 | * censoring strategy will be the end index of the first region, plus one. 61 | * 62 | * @param input - Input text. 
63 | * @param matches - A list of matches. 64 | * @returns The censored text. 65 | */ 66 | public applyTo(input: string, matches: MatchPayload[]) { 67 | if (matches.length === 0) return input; 68 | const sorted = [...matches].sort(compareMatchByPositionAndId); 69 | 70 | let censored = ''; 71 | let lastIndex = 0; // end index of last match, plus one 72 | for (let i = 0; i < sorted.length; i++) { 73 | const match = sorted[i]; 74 | if (lastIndex > match.endIndex) continue; // completely contained in the previous span 75 | 76 | const overlapsAtStart = match.startIndex < lastIndex; 77 | // Add the chunk of text between the end of the last match and the 78 | // start of the current match. 79 | if (!overlapsAtStart) censored += input.slice(lastIndex, match.startIndex); 80 | 81 | const actualStartIndex = Math.max(lastIndex, match.startIndex); 82 | const overlapsAtEnd = 83 | i < sorted.length - 1 && // not the last match 84 | match.endIndex >= sorted[i + 1].startIndex && // end index of this match and start index of next one overlap 85 | match.endIndex < sorted[i + 1].endIndex; // doesn't completely contain next match 86 | censored += this.strategy({ ...match, startIndex: actualStartIndex, input, overlapsAtStart, overlapsAtEnd }); 87 | lastIndex = match.endIndex + 1; 88 | } 89 | 90 | censored += input.slice(lastIndex); 91 | return censored; 92 | } 93 | } 94 | 95 | /** 96 | * A text censoring strategy, which receives a [[CensorContext]] and returns a 97 | * replacement string. 98 | */ 99 | export type TextCensorStrategy = (ctx: CensorContext) => string; 100 | 101 | /** 102 | * Context passed to [[TextCensorStrategy | text censoring strategies]]. 103 | */ 104 | export type CensorContext = MatchPayload & { 105 | /** 106 | * The entire input text, without any censoring applied to it. 107 | */ 108 | input: string; 109 | 110 | /** 111 | * Whether the current region overlaps at the end with some other region. 
112 | */ 113 | overlapsAtEnd: boolean; 114 | 115 | /** 116 | * Whether the current region overlaps at the start with some other region. 117 | */ 118 | overlapsAtStart: boolean; 119 | }; 120 | -------------------------------------------------------------------------------- /src/dataset/DataSet.ts: -------------------------------------------------------------------------------- 1 | import { assignIncrementingIds } from '../matcher/BlacklistedTerm'; 2 | import type { MatchPayload } from '../matcher/MatchPayload'; 3 | import type { RegExpMatcherOptions } from '../matcher/regexp/RegExpMatcher'; 4 | import type { ParsedPattern } from '../pattern/Nodes'; 5 | 6 | /** 7 | * Holds phrases (groups of patterns and whitelisted terms), optionally 8 | * associating metadata with them. 9 | * 10 | * @typeParam MetadataType - Metadata type for phrases. Note that the metadata 11 | * type is implicitly nullable. 12 | */ 13 | export class DataSet { 14 | private readonly containers: PhraseContainer[] = []; 15 | 16 | private patternCount = 0; 17 | 18 | private readonly patternIdToPhraseContainer = new Map(); // pattern ID => index of its container 19 | 20 | /** 21 | * Adds all the phrases from the dataset provided to this one. 22 | * 23 | * @example 24 | * ```typescript 25 | * const customDataset = new DataSet().addAll(englishDataset); 26 | * ``` 27 | * @param other - Other dataset. 28 | */ 29 | public addAll(other: DataSet) { 30 | for (const container of other.containers) this.registerContainer(container); 31 | return this; 32 | } 33 | 34 | /** 35 | * Removes phrases that match the predicate given. 36 | * 37 | * @example 38 | * ```typescript 39 | * const customDataset = new DataSet<{ originalWord: string }>() 40 | * .addAll(englishDataset) 41 | * .removePhrasesIf((phrase) => phrase.metadata.originalWord === 'fuck'); 42 | * ``` 43 | * @param predicate - A predicate that determines whether or not a phrase should be removed. 44 | * Return `true` to remove, `false` to keep. 
45 | */ 46 | public removePhrasesIf(predicate: (phrase: PhraseContainer) => boolean) { 47 | // Clear the internal state, then gradually rebuild it by adding the 48 | // containers that should be kept. 49 | this.patternCount = 0; 50 | this.patternIdToPhraseContainer.clear(); 51 | const containers = this.containers.splice(0); 52 | for (const container of containers) { 53 | const remove = predicate(container); 54 | if (!remove) this.registerContainer(container); 55 | } 56 | 57 | return this; 58 | } 59 | 60 | /** 61 | * Adds a phrase to this dataset. 62 | * 63 | * @example 64 | * ```typescript 65 | * const data = new DataSet<{ originalWord: string }>() 66 | * .addPhrase((phrase) => phrase.setMetadata({ originalWord: 'fuck' }) 67 | * .addPattern(pattern`fuck`) 68 | * .addPattern(pattern`f[?]ck`) 69 | * .addWhitelistedTerm('Afck')) 70 | * .build(); 71 | * ``` 72 | * @param fn - A function that takes a [[PhraseBuilder]], adds 73 | * patterns/whitelisted terms/metadata to it, and returns it. 74 | */ 75 | public addPhrase(fn: (builder: PhraseBuilder) => PhraseBuilder) { 76 | const container = fn(new PhraseBuilder()).build(); 77 | this.registerContainer(container); 78 | return this; 79 | } 80 | 81 | /** 82 | * Retrieves the phrase metadata associated with a pattern and returns a 83 | * copy of the match payload with said metadata attached to it. 84 | * 85 | * @example 86 | * ```typescript 87 | * const matches = matcher.getAllMatches(input); 88 | * const matchesWithPhraseMetadata = matches.map((match) => dataset.getPayloadWithPhraseMetadata(match)); 89 | * // Now we can access the 'phraseMetadata' property: 90 | * const phraseMetadata = matchesWithPhraseMetadata[0].phraseMetadata; 91 | * ``` 92 | * @param payload - Original match payload. 
93 | */ 94 | public getPayloadWithPhraseMetadata(payload: MatchPayload): MatchPayloadWithPhraseMetadata { 95 | const offset = this.patternIdToPhraseContainer.get(payload.termId); 96 | if (offset === undefined) { 97 | throw new Error(`The pattern with ID ${payload.termId} does not exist in this dataset.`); 98 | } 99 | 100 | return { 101 | ...payload, 102 | phraseMetadata: this.containers[offset].metadata, 103 | }; 104 | } 105 | 106 | /** 107 | * Returns the dataset in a format suitable for usage with the [[RegExpMatcher]]. 108 | * 109 | * @example 110 | * ```typescript 111 | * // With the RegExpMatcher: 112 | * const matcher = new RegExpMatcher({ 113 | * ...dataset.build(), 114 | * // additional options here 115 | * }); 116 | * ``` 117 | */ 118 | public build(): Pick { 119 | return { 120 | blacklistedTerms: assignIncrementingIds(this.containers.flatMap((p) => p.patterns)), 121 | whitelistedTerms: this.containers.flatMap((p) => p.whitelistedTerms), 122 | }; 123 | } 124 | 125 | private registerContainer(container: PhraseContainer) { 126 | const offset = this.containers.push(container) - 1; 127 | for (let i = 0, phraseId = this.patternCount; i < container.patterns.length; i++, phraseId++) { 128 | this.patternIdToPhraseContainer.set(phraseId, offset); 129 | this.patternCount++; 130 | } 131 | } 132 | } 133 | 134 | /** 135 | * Builder for phrases. 136 | */ 137 | export class PhraseBuilder { 138 | private readonly patterns: ParsedPattern[] = []; 139 | 140 | private readonly whitelistedTerms: string[] = []; 141 | 142 | private metadata?: MetadataType; 143 | 144 | /** 145 | * Associates a pattern with this phrase. 146 | * 147 | * @param pattern - Pattern to add. 148 | */ 149 | public addPattern(pattern: ParsedPattern) { 150 | this.patterns.push(pattern); 151 | return this; 152 | } 153 | 154 | /** 155 | * Associates a whitelisted pattern with this phrase. 156 | * 157 | * @param term - Whitelisted term to add. 
158 | */ 159 | public addWhitelistedTerm(term: string) { 160 | this.whitelistedTerms.push(term); 161 | return this; 162 | } 163 | 164 | /** 165 | * Associates some metadata with this phrase. 166 | * 167 | * @param metadata - Metadata to use. 168 | */ 169 | public setMetadata(metadata?: MetadataType) { 170 | this.metadata = metadata; 171 | return this; 172 | } 173 | 174 | /** 175 | * Builds the phrase, returning a [[PhraseContainer]] for use with the 176 | * [[DataSet]]. 177 | */ 178 | public build(): PhraseContainer { 179 | return { 180 | patterns: this.patterns, 181 | whitelistedTerms: this.whitelistedTerms, 182 | metadata: this.metadata, 183 | }; 184 | } 185 | } 186 | 187 | /** 188 | * Extends the default match payload by adding phrase metadata. 189 | */ 190 | export type MatchPayloadWithPhraseMetadata = MatchPayload & { 191 | /** 192 | * Phrase metadata associated with the pattern that matched. 193 | */ 194 | phraseMetadata?: MetadataType; 195 | }; 196 | 197 | /** 198 | * Represents a phrase. 199 | */ 200 | export interface PhraseContainer { 201 | /** 202 | * Metadata associated with this phrase. 203 | */ 204 | metadata?: MetadataType; 205 | 206 | /** 207 | * Patterns associated with this phrase. 208 | */ 209 | patterns: ParsedPattern[]; 210 | 211 | /** 212 | * Whitelisted terms associated with this phrase. 
213 | */ 214 | whitelistedTerms: string[]; 215 | } 216 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export * from './censor/BuiltinStrategies'; 2 | export * from './censor/TextCensor'; 3 | 4 | export * from './dataset/DataSet'; 5 | 6 | export * from './matcher/regexp/RegExpMatcher'; 7 | export * from './matcher/BlacklistedTerm'; 8 | export * from './matcher/MatchPayload'; 9 | export * from './matcher/Matcher'; 10 | 11 | export * from './pattern/Nodes'; 12 | export * from './pattern/ParserError'; 13 | export * from './pattern/Pattern'; 14 | 15 | export * from './preset/english'; 16 | 17 | export * from './transformer/collapse-duplicates'; 18 | export * from './transformer/remap-characters'; 19 | export * from './transformer/resolve-confusables'; 20 | export * from './transformer/resolve-leetspeak'; 21 | export * from './transformer/skip-non-alphabetic'; 22 | export * from './transformer/to-ascii-lowercase'; 23 | -------------------------------------------------------------------------------- /src/matcher/BlacklistedTerm.ts: -------------------------------------------------------------------------------- 1 | import type { ParsedPattern } from '../pattern/Nodes'; 2 | 3 | /** 4 | * Represents a blacklisted term. 5 | */ 6 | export interface BlacklistedTerm { 7 | /** 8 | * The identifier of the pattern; should be unique across all patterns. 9 | */ 10 | id: number; 11 | 12 | /** 13 | * The parsed pattern. 14 | */ 15 | pattern: ParsedPattern; 16 | } 17 | 18 | /** 19 | * Assigns incrementing IDs to the patterns provided, starting with 0. It is 20 | * useful if you have a list of patterns to match against but don't care about 21 | * identifying which pattern matched. 
22 | * 23 | * @example 24 | * ```typescript 25 | * const matcher = new RegExpMatcher({ 26 | * ..., 27 | * blacklistedTerms: assignIncrementingIds([ 28 | * pattern`f?uck`, 29 | * pattern`|shit|`, 30 | * ]), 31 | * }); 32 | * ``` 33 | * @param patterns - List of parsed patterns. 34 | * @returns A list of blacklisted terms with valid IDs which can then be passed 35 | * to the [[RegExpMatcher]]. 36 | */ 37 | export function assignIncrementingIds(patterns: ParsedPattern[]) { 38 | let currentId = 0; 39 | return patterns.map((pattern) => ({ id: currentId++, pattern })); 40 | } 41 | -------------------------------------------------------------------------------- /src/matcher/IntervalCollection.ts: -------------------------------------------------------------------------------- 1 | import type { Interval } from '../util/Interval'; 2 | 3 | export class IntervalCollection implements Iterable { 4 | private dirty = false; 5 | 6 | private readonly intervals: Interval[] = []; 7 | 8 | public insert(lowerBound: number, upperBound: number) { 9 | this.intervals.push([lowerBound, upperBound]); 10 | this.dirty = true; 11 | } 12 | 13 | public query(lowerBound: number, upperBound: number) { 14 | if (this.intervals.length === 0) return false; 15 | if (this.dirty) { 16 | this.dirty = false; 17 | // Sort by lower bound. 18 | this.intervals.sort( 19 | /* istanbul ignore next: not possible to write a robust test for this */ 20 | (a, b) => (a[0] < b[0] ? -1 : b[0] < a[0] ? 1 : 0), 21 | ); 22 | } 23 | 24 | for (const interval of this.intervals) { 25 | // Since the intervals are sorted by lower bound, if we see an 26 | // interval with a lower bound greater than the target, we can skip 27 | // checking all the ones after it as it's impossible that they fully 28 | // contain the target interval. 
29 | if (interval[0] > lowerBound) break; 30 | if (interval[0] <= lowerBound && upperBound <= interval[1]) return true; 31 | } 32 | 33 | return false; 34 | } 35 | 36 | public values() { 37 | return this.intervals.values(); 38 | } 39 | 40 | public [Symbol.iterator]() { 41 | return this.values(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/matcher/MatchPayload.ts: -------------------------------------------------------------------------------- 1 | import { compareIntervals } from '../util/Interval'; 2 | 3 | /** 4 | * Information emitted on a successful match. 5 | * 6 | * If you require more information about matches than what is provided here, see 7 | * the [[DataSet]] class, which supports associating metadata with patterns. 8 | */ 9 | export interface MatchPayload { 10 | /** 11 | * End index of the match, inclusive. 12 | * 13 | * If the last character of the pattern is a surrogate pair, 14 | * then this points to the index of the low surrogate. 15 | */ 16 | endIndex: number; 17 | 18 | /** 19 | * Total number of of code points that matched. 20 | */ 21 | matchLength: number; 22 | 23 | /** 24 | * Start index of the match, inclusive. 25 | */ 26 | startIndex: number; 27 | 28 | /** 29 | * ID of the blacklisted term that matched. 30 | */ 31 | termId: number; 32 | } 33 | 34 | /** 35 | * Compares two match payloads. 
36 | * 37 | * If the first match payload's start index is less than the second's, `-1` is 38 | * returned; 39 | * If the second match payload's start index is less than the first's, `1` is 40 | * returned; 41 | * If the first match payload's end index is less than the second's, `-1` is 42 | * returned; 43 | * If the second match payload's end index is less than the first's, `1` is 44 | * returned; 45 | * If the first match payload's term ID is less than the second's, `-1` is 46 | * returned; 47 | * If the first match payload's term ID is equal to the second's, `0` is 48 | * returned; 49 | * Otherwise, `1` is returned. 50 | * 51 | * @param a - First match payload. 52 | * @param b - Second match payload. 53 | * @returns The result of the comparison: -1 if the first should sort lower than 54 | * the second, 0 if they are the same, and 1 if the second should sort lower 55 | * than the first. 56 | */ 57 | export function compareMatchByPositionAndId(a: MatchPayload, b: MatchPayload) { 58 | const result = compareIntervals(a.startIndex, a.endIndex, b.startIndex, b.endIndex); 59 | if (result !== 0) return result; 60 | return a.termId === b.termId ? 0 : a.termId < b.termId ? -1 : 1; 61 | } 62 | -------------------------------------------------------------------------------- /src/matcher/Matcher.ts: -------------------------------------------------------------------------------- 1 | import type { MatchPayload } from './MatchPayload'; 2 | 3 | /** 4 | * Searches for blacklisted terms in text, ignoring parts matched by whitelisted 5 | * terms. 6 | * 7 | * See: 8 | * - [[RegExpMatcher]] for an implementation using regular expressions. 9 | */ 10 | export interface Matcher { 11 | /** 12 | * Returns all matches of blacklisted terms in the text. 13 | * 14 | * If you only need to check for the presence of a match, and do not need 15 | * more specific information about the matches, use the `hasMatch()` method, 16 | * which is typically more efficient. 
17 | * 18 | * @param input - Text to find profanities in. 19 | * @param sorted - Whether the resulting list of matches should be sorted 20 | * using [[compareMatchByPositionAndId]]. Defaults to `false`. 21 | * @returns A list of matches of the matcher on the text. The matches are 22 | * guaranteed to be sorted if and only if the `sorted` parameter is `true`, 23 | * otherwise, their order is unspecified. 24 | */ 25 | getAllMatches(input: string, sorted?: boolean): MatchPayload[]; 26 | 27 | /** 28 | * Checks whether there is a match for any blacklisted term in the text. 29 | * 30 | * This is typically more efficient than calling `getAllMatches` and 31 | * checking the result, though it depends on the implementation. 32 | * 33 | * @param input - Text to check. 34 | */ 35 | hasMatch(input: string): boolean; 36 | } 37 | -------------------------------------------------------------------------------- /src/pattern/Nodes.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * A parsed pattern. 3 | */ 4 | export interface ParsedPattern { 5 | /** 6 | * A list of nodes which make up the pattern. 7 | */ 8 | nodes: Node[]; 9 | 10 | /** 11 | * Whether the pattern requires a word boundary at the end. 12 | */ 13 | requireWordBoundaryAtEnd: boolean; 14 | 15 | /** 16 | * Whether the pattern requires a word boundary at the start. 17 | */ 18 | requireWordBoundaryAtStart: boolean; 19 | } 20 | 21 | /** 22 | * All the possible kinds of nodes. 23 | */ 24 | export type Node = LiteralNode | OptionalNode | WildcardNode; 25 | 26 | /** 27 | * An enumeration of the kinds of nodes there are. 28 | */ 29 | export enum SyntaxKind { 30 | Optional, 31 | Wildcard, 32 | Literal, 33 | BoundaryAssertion, 34 | } 35 | 36 | /** 37 | * An optional node. 38 | */ 39 | export interface OptionalNode { 40 | /** 41 | * The node contained within the optional expression. For `[abc]`, this 42 | * would be a literal node with the value `abc`. 
43 | */ 44 | childNode: LiteralNode | WildcardNode; 45 | 46 | kind: SyntaxKind.Optional; 47 | } 48 | 49 | /** 50 | * A wildcard node. 51 | */ 52 | export interface WildcardNode { 53 | kind: SyntaxKind.Wildcard; 54 | } 55 | 56 | /** 57 | * A literal node. 58 | */ 59 | export interface LiteralNode { 60 | /** 61 | * The code points that this literal matches. 62 | */ 63 | chars: number[]; 64 | 65 | kind: SyntaxKind.Literal; 66 | } 67 | 68 | /** 69 | * A boundary assertion node. 70 | */ 71 | export interface BoundaryAssertionNode { 72 | kind: SyntaxKind.BoundaryAssertion; 73 | } 74 | -------------------------------------------------------------------------------- /src/pattern/ParserError.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Custom error thrown by the parser when syntactical errors are detected. 3 | */ 4 | export class ParserError extends Error { 5 | public readonly name = 'ParserError'; 6 | 7 | /** 8 | * The line on which the error occurred (one-based). 9 | */ 10 | public readonly line: number; 11 | 12 | /** 13 | * The column on which the error occurred (one-based). 14 | * Note that surrogate pairs are counted as 1 column wide, not 2. 15 | */ 16 | public readonly column: number; 17 | 18 | public constructor(message: string, line: number, column: number) { 19 | super(`${line}:${column}: ${message}`); 20 | this.line = line; 21 | this.column = column; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/pattern/Pattern.ts: -------------------------------------------------------------------------------- 1 | import { Parser } from './Parser'; 2 | 3 | const parser = new Parser(); 4 | 5 | /** 6 | * Parses a pattern, which matches a set of strings; see the `Syntax` section 7 | * for details. This function is intended to be called as a [template 8 | * tag](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals#tagged_templates). 
9 | * 10 | * **Syntax** 11 | * 12 | * Generally speaking, in patterns, characters are interpreted literally. That 13 | * is, they match exactly what they are: `a` matches an `a`, `b` matches a `b`, 14 | * `;` matches a `;`, and so on. 15 | * 16 | * However, there are several constructs that have special meaning: 17 | * 18 | * - `[expr]` matches either the empty string or `expr` (an **optional 19 | * expression**). `expr` may be a sequence of literal characters or a wildcard 20 | * (see below). 21 | * - `?` matches any character (a **wildcard**). 22 | * - A `|` at the start or end of the pattern asserts position at a word 23 | * boundary (a **word boundary assertion**). If `|` is at the start, it 24 | * ensures that the match either starts at the start of the string or a non- 25 | * word character preceding it; if it is at the end, it ensures that the match 26 | * either ends at the end of the string or a non-word character follows it. 27 | * 28 | * A word character is an lower-case or upper-case ASCII alphabet character or 29 | * an ASCII digit. 30 | * - In a literal, a backslash may be used to **escape** one of the 31 | * meta-characters mentioned above so that it does match literally: `\\[` 32 | * matches `[`, and does not mark the start of an optional expression. 33 | * 34 | * **Note about escapes** 35 | * 36 | * As this function operates on raw strings, double-escaping backslashes is 37 | * not necessary: 38 | * 39 | * ```typescript 40 | * // Use this: 41 | * const parsed = pattern`hello \[`; 42 | * // Don't use this: 43 | * const parsed = pattern`hello \\[`; 44 | * ``` 45 | * 46 | * **Examples** 47 | * 48 | * - `baz` matches `baz` exactly. 49 | * 50 | * - `b\[ar` matches `b[ar` exactly. 51 | * 52 | * - `d?ude` matches `d`, then any character, then `ude`. 
All of the following 53 | * strings are matched by this pattern: 54 | * - `dyude` 55 | * - `d;ude` 56 | * - `d!ude` 57 | * 58 | * - `h[?]ello` matches either `h`, any character, then `ello` or the literal 59 | * string `hello`. The set of strings it matches is equal to the union of the 60 | * set of strings that the two patterns `hello` and `h?ello` match. All of the 61 | * following strings are matched by this pattern: 62 | * - `hello` 63 | * - `h!ello` 64 | * - `h;ello` 65 | * 66 | * - `|foobar|` asserts position at a word boundary, matches the literal string 67 | * `foobar`, and asserts position at a word boundary: 68 | * - `foobar` matches, as the start and end of string count as word 69 | * boundaries; 70 | * - `yofoobar` does _not_ match, as `f` is immediately preceded by a word 71 | * character; 72 | * - `hello foobar bye` matches, as `f` is immediately preceded by a non-word 73 | * character, and `r` is immediately followed by a non-word character. 74 | * 75 | * **Grammar** 76 | * 77 | * ``` 78 | * Pattern ::= '['? Atom* ']'? 79 | * Atom ::= Literal | Wildcard | Optional 80 | * Optional ::= '[' Literal | Wildcard ']' 81 | * Literal ::= (NON_SPECIAL | '\' SUPPORTS_ESCAPING)+ 82 | * 83 | * NON_SPECIAL ::= _any character other than '\', '?', '[', ']', or '|'_ 84 | * SUPPORTS_ESCAPING ::= '\' | '[' | ']' | '?' | '|' 85 | * ``` 86 | * 87 | * @example 88 | * ```typescript 89 | * const parsed = pattern`hello?`; // match "hello", then any character 90 | * ``` 91 | * @example 92 | * ```typescript 93 | * const parsed = pattern`w[o]rld`; // match "wrld" or "world" 94 | * ``` 95 | * @example 96 | * ```typescript 97 | * const parsed = pattern`my initials are \[??\]`; // match "my initials are [", then any two characters, then a "]" 98 | * ``` 99 | * @returns The parsed pattern, which can then be used with the 100 | * [[RegExpMatcher]]. 101 | * @throws [[ParserError]] if a syntactical error was detected while parsing the 102 | * pattern. 
103 | * @see [[parseRawPattern]] if you want to parse a string into a pattern without 104 | * using a template tag. 105 | */ 106 | export function pattern(strings: TemplateStringsArray, ...expressions: unknown[]) { 107 | let result = strings.raw[0]; 108 | for (const [i, expression] of expressions.entries()) { 109 | result += expression; 110 | result += strings.raw[i + 1]; 111 | } 112 | 113 | return parser.parse(result); 114 | } 115 | 116 | /** 117 | * Parses a string as a pattern directly. 118 | * 119 | * **Note** 120 | * 121 | * It is recommended to use the [[pattern | pattern template tag]] instead of 122 | * this function for literal patterns (i.e. ones without dynamic content). 123 | * 124 | * @param pattern - The string to parse. 125 | * @throws [[ParserError]] if a syntactical error was detected while parsing the 126 | * pattern. 127 | * @returns The parsed pattern, which can then be used with the 128 | * [[RegExpMatcher]]. 129 | */ 130 | export function parseRawPattern(pattern: string) { 131 | return parser.parse(pattern); 132 | } 133 | -------------------------------------------------------------------------------- /src/pattern/Util.ts: -------------------------------------------------------------------------------- 1 | import type { Node, ParsedPattern } from './Nodes'; 2 | import { SyntaxKind } from './Nodes'; 3 | 4 | export function potentiallyMatchesEmptyString(pattern: ParsedPattern) { 5 | return pattern.nodes.every((node) => node.kind === SyntaxKind.Optional); 6 | } 7 | 8 | export function compilePatternToRegExp(pattern: ParsedPattern) { 9 | let regExpStr = ''; 10 | if (pattern.requireWordBoundaryAtStart) regExpStr += '\\b'; 11 | for (const node of pattern.nodes) regExpStr += getRegExpStringForNode(node); 12 | if (pattern.requireWordBoundaryAtEnd) regExpStr += `\\b`; 13 | return new RegExp(regExpStr, 'gs'); 14 | } 15 | 16 | const regExpSpecialChars = ['[', '.', '*', '+', '?', '^', '$', '{', '}', '(', ')', '|', '[', '\\', ']'].map((str) => 17 | 
str.charCodeAt(0), 18 | ); 19 | 20 | export function getRegExpStringForNode(node: Node): string { 21 | switch (node.kind) { 22 | case SyntaxKind.Literal: { 23 | let str = ''; 24 | for (const char of node.chars) { 25 | if (regExpSpecialChars.includes(char)) str += '\\'; 26 | str += String.fromCodePoint(char); 27 | } 28 | 29 | return str; 30 | } 31 | 32 | case SyntaxKind.Optional: 33 | return `(?:${getRegExpStringForNode(node.childNode)})?`; 34 | case SyntaxKind.Wildcard: 35 | return `.`; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/transformer/TransformerSet.ts: -------------------------------------------------------------------------------- 1 | import type { StatefulTransformer, TransformerContainer } from './Transformers'; 2 | import { TransformerType } from './Transformers'; 3 | 4 | export class TransformerSet { 5 | private readonly transformers: TransformerContainer[]; 6 | 7 | private readonly statefulTransformers: (StatefulTransformer | undefined)[]; 8 | 9 | public constructor(transformers: TransformerContainer[]) { 10 | this.transformers = transformers; 11 | this.statefulTransformers = Array.from({ length: this.transformers.length }); 12 | for (let i = 0; i < this.transformers.length; i++) { 13 | const transformer = this.transformers[i]; 14 | if (transformer.type === TransformerType.Stateful) { 15 | this.statefulTransformers[i] = transformer.factory(); 16 | } 17 | } 18 | } 19 | 20 | public applyTo(char: number) { 21 | let transformed: number | undefined = char; 22 | for (let i = 0; i < this.transformers.length && transformed !== undefined; i++) { 23 | const transformer = this.transformers[i]; 24 | if (transformer.type === TransformerType.Simple) transformed = transformer.transform(transformed); 25 | else transformed = this.statefulTransformers[i]!.transform(transformed); 26 | } 27 | 28 | return transformed; 29 | } 30 | 31 | public resetAll() { 32 | for (let i = 0; i < this.transformers.length; i++) { 
33 | if (this.transformers[i].type === TransformerType.Stateful) { 34 | this.statefulTransformers[i]!.reset(); 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/transformer/Transformers.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * All the possible transformer types. 3 | */ 4 | export const enum TransformerType { 5 | Simple, 6 | Stateful, 7 | } 8 | 9 | /** 10 | * All the possible transformer container types. 11 | */ 12 | export type TransformerContainer = SimpleTransformerContainer | StatefulTransformerContainer; 13 | 14 | /** 15 | * Creates a container holding the transformer function provided. Simple 16 | * transformers are suitable for stateless transformations, e.g., a 17 | * transformation that maps certain characters to others. For transformations 18 | * that need to keep around state, see `createStatefulTransformer`. 19 | * 20 | * @example 21 | * ```typescript 22 | * function lowercaseToUppercase(char) { 23 | * return isLowercase(char) ? char - 32 : char; 24 | * } 25 | * 26 | * const transformer = createSimpleTransformer(lowercaseToUppercase); 27 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 28 | * ``` 29 | * @example 30 | * ```typescript 31 | * function ignoreAllNonDigitChars(char) { 32 | * return isDigit(char) ? char : undefined; 33 | * } 34 | * 35 | * const transformer = createSimpleTransformer(ignoreAllNonDigitChars); 36 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 37 | * ``` 38 | * @param transformer - Function that applies the transformation. It should 39 | * accept one argument, the input character, and return the transformed 40 | * character. A return value of `undefined` indicates that the character should 41 | * be ignored. 42 | * @returns A container holding the transformer, which can then be passed to the 43 | * [[RegExpMatcher]]. 
44 | */ 45 | export function createSimpleTransformer(transformer: TransformerFn): SimpleTransformerContainer { 46 | return { type: TransformerType.Simple, transform: transformer }; 47 | } 48 | 49 | /** 50 | * Transforms input characters. 51 | * 52 | * @param char - Input character. 53 | * @returns The transformed character. A return value of `undefined` indicates 54 | * that the character should be ignored. 55 | */ 56 | export type TransformerFn = (char: number) => number | undefined; 57 | 58 | /** 59 | * Container for simple transformers. 60 | */ 61 | export interface SimpleTransformerContainer { 62 | /** 63 | * The transformer function. 64 | */ 65 | transform: TransformerFn; 66 | 67 | type: TransformerType.Simple; 68 | } 69 | 70 | /** 71 | * Creates a container holding the stateful transformer. Stateful transformers 72 | * are objects which satisfy the `StatefulTransformer` interface. They are 73 | * suitable for transformations that require keeping around some state regarding 74 | * the characters previously transformed in the text. 75 | * 76 | * @example 77 | * ```typescript 78 | * class IgnoreDuplicateCharactersTransformer implements StatefulTransformer { 79 | * private lastChar = -1; 80 | * 81 | * public transform(char: number) { 82 | * if (char === this.lastChar) return undefined; 83 | * this.lastChar = char; 84 | * return char; 85 | * } 86 | * 87 | * public reset() { 88 | * this.lastChar = -1; 89 | * } 90 | * } 91 | * 92 | * const transformer = createStatefulTransformer(() => new IgnoreDuplicateCharactersTransformer()); 93 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 94 | * ``` 95 | * @param factory A function that returns an instance of the stateful 96 | * transformer. 97 | * @returns A container holding the stateful transformer, which can then be 98 | * passed to the [[RegExpMatcher]]. 
99 | */ 100 | export function createStatefulTransformer(factory: StatefulTransformerFactory): StatefulTransformerContainer { 101 | return { type: TransformerType.Stateful, factory }; 102 | } 103 | 104 | /** 105 | * A function that returns an instance of a stateful transformer. 106 | */ 107 | export type StatefulTransformerFactory = () => StatefulTransformer; 108 | 109 | /** 110 | * An interface that stateful transformers should implement. 111 | */ 112 | export interface StatefulTransformer { 113 | /** 114 | * Resets the state of the transformer. 115 | */ 116 | reset(): void; 117 | 118 | /** 119 | * Transforms input characters. 120 | * 121 | * @param char - Input character. 122 | * @returns The transformed character. A return value of `undefined` indicates 123 | * that the character should be ignored. 124 | */ 125 | transform: TransformerFn; 126 | } 127 | 128 | /** 129 | * Container for stateful transformers. 130 | */ 131 | export interface StatefulTransformerContainer { 132 | factory: StatefulTransformerFactory; 133 | type: TransformerType.Stateful; 134 | } 135 | -------------------------------------------------------------------------------- /src/transformer/collapse-duplicates/index.ts: -------------------------------------------------------------------------------- 1 | import { getAndAssertSingleCodePoint } from '../../util/Char'; 2 | import { createStatefulTransformer } from '../Transformers'; 3 | import { CollapseDuplicatesTransformer } from './transformer'; 4 | 5 | /** 6 | * Creates a transformer that collapses duplicate characters. This is useful for 7 | * detecting variants of patterns in which a character is repeated to bypass 8 | * detection. 9 | * 10 | * As an example, the pattern `hi` does not match `hhiii` by default, as the 11 | * frequency of the characters does not match. With this transformer, `hhiii` 12 | * would become `hi`, and would therefore match the pattern. 
13 | * 14 | * **Application order** 15 | * 16 | * It is recommended that this transformer be applied after all other 17 | * transformers. Using it before other transformers may have the effect of not 18 | * catching duplicates of certain characters that were originally different but 19 | * became the same after a series of transformations. 20 | * 21 | * **Warning** 22 | * 23 | * This transformer should be used with caution, as while it can make certain 24 | * patterns match text that wouldn't have been matched before, it can also go 25 | * the other way. For example, the pattern `hello` clearly matches `hello`, but 26 | * with this transformer, by default, `hello` would become `helo` which does 27 | * _not_ match. In this cases, the `customThresholds` option can be used to 28 | * allow two `l`s in a row, making it leave `hello` unchanged. 29 | * 30 | * @example 31 | * ```typescript 32 | * // Collapse runs of the same character. 33 | * const transformer = collapseDuplicatesTransformer(); 34 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 35 | * ``` 36 | * @example 37 | * ```typescript 38 | * // Collapse runs of characters other than 'a'. 39 | * const transformer = collapseDuplicatesTransformer({ customThresholds: new Map([['a', Infinity]]) }); 40 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 41 | * ``` 42 | * @param options - Options for the transformer. 43 | * @returns A container holding the transformer, which can then be passed to the 44 | * [[RegExpMatcher]]. 
45 | */ 46 | export function collapseDuplicatesTransformer({ 47 | defaultThreshold = 1, 48 | customThresholds = new Map(), 49 | }: CollapseDuplicatesTransformerOptions = {}) { 50 | const map = createCharacterToThresholdMap(customThresholds); 51 | return createStatefulTransformer( 52 | () => new CollapseDuplicatesTransformer({ defaultThreshold, customThresholds: map }), 53 | ); 54 | } 55 | 56 | function createCharacterToThresholdMap(customThresholds: Map) { 57 | const map = new Map(); 58 | for (const [str, threshold] of customThresholds) { 59 | if (threshold < 0) throw new RangeError('Expected all thresholds to be non-negative.'); 60 | const char = getAndAssertSingleCodePoint(str); 61 | map.set(char, threshold); 62 | } 63 | 64 | return map; 65 | } 66 | 67 | export interface ProcessedCollapseDuplicatesTransformerOptions { 68 | customThresholds: Map; 69 | defaultThreshold: number; 70 | } 71 | 72 | /** 73 | * Options for the collapse duplicates transformer. 74 | */ 75 | export interface CollapseDuplicatesTransformerOptions { 76 | /** 77 | * Custom thresholds for characters. If a character has an entry 78 | * corresponding to it, the value of tne entry will be used as the maximum 79 | * length of character runs comprised of said character before they are 80 | * collapsed. 81 | * 82 | * The intended use-case for this option is for characters which appear 83 | * more than once in a row in patterns. For example, the word `book` has 84 | * two `o`s in a row, and matches `book`. With this transformer, though, 85 | * `book` would become `bok`, meaning that `book` would no longer match `book`. 86 | * The fix would be to add an entry corresponding to `o` that overrides its 87 | * threshold to be `2`, with the effect of leaving `book` unchanged. 88 | * 89 | * @default new Map() 90 | */ 91 | customThresholds?: Map; 92 | 93 | /** 94 | * The maximum number of characters in a run that will be accepted before 95 | * they will be collapsed. 
96 | * 97 | * For example, if this value was `2`, `aa` would stay the same but `aaa` 98 | * would be transformed to `aa`. 99 | * 100 | * @default 1 101 | */ 102 | defaultThreshold?: number; 103 | } 104 | -------------------------------------------------------------------------------- /src/transformer/collapse-duplicates/transformer.ts: -------------------------------------------------------------------------------- 1 | import type { StatefulTransformer } from '../Transformers'; 2 | import type { ProcessedCollapseDuplicatesTransformerOptions } from '.'; 3 | 4 | export class CollapseDuplicatesTransformer implements StatefulTransformer { 5 | private readonly defaultThreshold: number; 6 | 7 | private readonly customThresholds: Map; 8 | 9 | private remaining = -1; 10 | 11 | private lastChar = -1; 12 | 13 | public constructor({ defaultThreshold, customThresholds }: ProcessedCollapseDuplicatesTransformerOptions) { 14 | this.defaultThreshold = defaultThreshold; 15 | this.customThresholds = customThresholds; 16 | } 17 | 18 | public transform(char: number) { 19 | if (char === this.lastChar) { 20 | return this.remaining-- > 0 ? char : undefined; 21 | } 22 | 23 | const threshold = this.customThresholds.get(char) ?? this.defaultThreshold; 24 | this.remaining = threshold - 1; 25 | this.lastChar = char; 26 | return threshold > 0 ? char : undefined; 27 | } 28 | 29 | public reset() { 30 | this.remaining = -1; 31 | this.lastChar = -1; 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/transformer/remap-characters/index.ts: -------------------------------------------------------------------------------- 1 | import { getAndAssertSingleCodePoint } from '../../util/Char'; 2 | import { CharacterIterator } from '../../util/CharacterIterator'; 3 | import { createSimpleTransformer } from '../Transformers'; 4 | 5 | /** 6 | * Maps certain characters to other characters, leaving other characters 7 | * unchanged. 
8 | * 9 | * **Application order** 10 | * 11 | * It is recommended that this transformer be applied near the start of the 12 | * transformer chain. 13 | * 14 | * @example 15 | * ```typescript 16 | * // Transform 'a' to 'b'. 17 | * const transformer = remapCharactersTransformer({ 'b': 'a' }); 18 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 19 | * ``` 20 | * @example 21 | * ```typescript 22 | * // Transform '🅱️' to 'b', and use a map instead of an object as the argument. 23 | * const transformer = remapCharactersTransformer(new Map([['b', '🅱️']])); 24 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 25 | * ``` 26 | * @example 27 | * ```typescript 28 | * // Transform '🇴' and '0' to 'o'. 29 | * const transformer = remapCharactersTransformer({ o: '🇴0' }); 30 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 31 | * ``` 32 | * @param mapping - A map/object mapping certain characters to others. 33 | * @returns A container holding the transformer, which can then be passed to the 34 | * [[RegExpMatcher]]. 35 | * @see [[resolveConfusablesTransformer| Transformer that handles confusable Unicode characters]] 36 | * @see [[resolveLeetSpeakTransformer | Transformer that handles leet-speak]] 37 | */ 38 | export function remapCharactersTransformer(mapping: CharacterMapping) { 39 | const map = createOneToOneMap(mapping); 40 | return createSimpleTransformer((c) => map.get(c) ?? c); 41 | } 42 | 43 | function createOneToOneMap(mapping: CharacterMapping) { 44 | const map = new Map(); 45 | const iterable = mapping instanceof Map ? 
mapping.entries() : Object.entries(mapping); 46 | for (const [original, equivalents] of iterable) { 47 | const originalChar = getAndAssertSingleCodePoint(original); 48 | const iter = new CharacterIterator(equivalents); 49 | for (const equivalent of iter) map.set(equivalent, originalChar); 50 | } 51 | 52 | return map; 53 | } 54 | 55 | /** 56 | * Maps characters to other characters. 57 | * The key of the map/object should be the transformed character, while the value 58 | * should be a set of characters that map to the transformed character. 59 | */ 60 | export type CharacterMapping = Map | Record; 61 | -------------------------------------------------------------------------------- /src/transformer/resolve-confusables/confusables.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Maps confusable Unicode characters to their normalized equivalents. 3 | * 4 | * @copyright 5 | * The data here is taken from the 6 | * [confusables](https://github.com/gc/confusables) library. 7 | * 8 | * ```text 9 | * # The MIT License (MIT) 10 | * 11 | * Copyright © 2019 https://github.com/gc/ 12 | * 13 | * Permission is hereby granted, free of charge, to any person 14 | * obtaining a copy of this software and associated documentation 15 | * files (the “Software”), to deal in the Software without 16 | * restriction, including without limitation the rights to use, 17 | * copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | * copies of the Software, and to permit persons to whom the 19 | * Software is furnished to do so, subject to the following 20 | * conditions: 21 | * 22 | * The above copyright notice and this permission notice shall be 23 | * included in all copies or substantial portions of the Software. 
24 | * 25 | * THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, 26 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 27 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 29 | * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 30 | * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 31 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 32 | * OTHER DEALINGS IN THE SOFTWARE. 33 | * ``` 34 | */ 35 | export const confusables = new Map([ 36 | [' ', ' '], 37 | ['0', '⓿'], 38 | ['1', '⓵➊⑴¹𝟏𝟙1𝟷𝟣⒈𝟭1➀₁①❶⥠'], 39 | ['2', '⓶⒉⑵➋ƻ²ᒿ𝟚2𝟮𝟤ᒾ𝟸Ƨ𝟐②ᴤ₂➁❷ᘝƨ'], 40 | ['3', '³ⳌꞫ𝟑ℨ𝟛𝟯𝟥Ꝫ➌ЗȜ⓷ӠƷ3𝟹⑶⒊ʒʓǯǮƺ𝕴ᶾзᦡ➂③₃ᶚᴣᴟ❸ҘҙӬӡӭӟӞ'], 41 | ['4', '➍ҶᏎ𝟜ҷ⓸ҸҹӴӵᶣ4чㄩ⁴➃₄④❹Ӌ⑷⒋'], 42 | ['5', '𝟱⓹➎Ƽ𝟓𝟻𝟝𝟧5➄₅⑤⁵❺ƽ⑸⒌'], 43 | ['6', 'ⳒᏮ𝟞𝟨𝟔➏⓺Ϭϭ⁶б6ᧈ⑥➅₆❻⑹⒍'], 44 | ['7', '⓻𐓒➐7⁷⑦₇❼➆⑺⒎'], 45 | ['8', '𐌚➑⓼8𝟠𝟪৪⁸₈𝟴➇⑧❽𝟾𝟖⑻⒏'], 46 | ['9', 'ꝮⳊ⓽➒੧৭୨9𝟫𝟿𝟗⁹₉Գ➈⑨❾⑼⒐'], 47 | ['A', '🄰Ꭿ𐊠𝕬𝜜𝐴ꓮᎪ𝚨ꭺ𝝖🅐Å∀🇦₳🅰𝒜𝘈𝐀𝔸дǺᗅⒶAΑᾋᗩĂÃÅǍȀȂĀȺĄʌΛλƛᴀᴬДАልÄₐᕱªǞӒΆẠẢẦẨẬẮẰẲẴẶᾸᾹᾺΆᾼᾈᾉᾊᾌᾍᾎᾏἈἉἊἋἌἍἎἏḀȦǠӐÀÁÂẤẪ𝛢𝓐𝙰𝘼'], 48 | ['a', '∂⍺ⓐձǟᵃᶏ⒜аɒaαȃȁคǎმäɑāɐąᾄẚạảǡầẵḁȧӑӓãåάὰάăẩằẳặᾀᾁᾂᾃᾅᾆᾰᾱᾲᾳᾴᶐᾶᾷἀἁἂἃἄἅἆἇᾇậắàáâấẫǻⱥ𝐚𝑎𝒂𝒶𝓪𝔞𝕒𝖆𝖺𝗮𝘢𝙖𝚊𝛂𝛼𝜶𝝰𝞪⍶'], 49 | ['B', '𐌁𝑩𝕭🄱𐊡𝖡𝘽ꓐ𝗕𝘉𝜝𐊂𝚩𝐁𝛣𝝗𝐵𝙱𝔹Ᏼᏼ𝞑Ꞵ𝔅🅑฿𝓑ᗿᗾᗽ🅱ⒷBвϐᗷƁ乃ßცჩ๖βɮБՅ๒ᙖʙᴮᵇጌḄℬΒВẞḂḆɃദᗹᗸᵝᙞᙟᙝᛒᙗᙘᴃ🇧'], 50 | ['b', 'Ꮟ𝐛𝘣𝒷𝔟𝓫𝖇𝖻𝑏𝙗𝕓𝒃𝗯𝚋♭ᑳᒈbᖚᕹᕺⓑḃḅҍъḇƃɓƅᖯƄЬᑲþƂ⒝ЪᶀᑿᒀᒂᒁᑾьƀҌѢѣᔎ'], 51 | ['C', 'ᏟⲤ🄲ꓚ𐊢𐌂🅲𐐕🅒☾ČÇⒸCↃƇᑕㄈ¢८↻ĈϾՇȻᙅᶜ⒞ĆҀĊ©टƆℂℭϹС匚ḈҪʗᑖᑡᑢᑣᑤᑥⅭ𝐂𝐶𝑪𝒞𝓒𝕮𝖢𝗖𝘊𝘾ᔍ'], 52 | ['c', 'ⲥ𐐽ꮯĉcⓒćčċçҁƈḉȼↄсርᴄϲҫ꒝ςɽϛ𝙲ᑦ᧚𝐜𝑐𝒄𝒸𝓬𝔠𝕔𝖈𝖼𝗰𝘤𝙘𝚌₵🇨ᥴᒼⅽ'], 53 | ['D', 'Ꭰ🄳𝔡𝖉𝔻𝗗𝘋𝙳𝐷𝓓𝐃𝑫𝕯𝖣𝔇𝘿ꭰⅅ𝒟ꓓ🅳🅓ⒹDƉᗪƊÐԺᴅᴰↁḊĐÞⅮᗞᑯĎḌḐḒḎᗫᗬᗟᗠᶛᴆ🇩'], 54 | ['d', 'Ꮷꓒ𝓭ᵭ₫ԃⓓdḋďḍḑḓḏđƌɖɗᵈ⒟ԁⅾᶁԀᑺᑻᑼᑽᒄᑰᑱᶑ𝕕𝖽𝑑𝘥𝒅𝙙𝐝𝗱𝚍ⅆ𝒹ʠժ'], 55 | ['E', 'ꭼ🄴𝙀𝔼𐊆𝚬ꓰ𝝚𝞔𝓔𝑬𝗘🅴🅔ⒺΈEƎἝᕮƐモЄᴇᴱᵉÉ乇ЁɆꂅ€ÈℰΕЕⴹᎬĒĔĖĘĚÊËԐỀẾỄỂẼḔḖẺȄȆẸỆȨḜḘḚἘἙἚἛἜῈΈӖὲέЀϵ🇪'], 56 | ['e', '𝑒𝓮𝕖𝖊𝘦𝗲𝚎𝙚𝒆𝔢𝖾𝐞Ҿҿⓔe⒠èᧉéᶒêɘἔềếễ૯ǝєεēҽɛểẽḕḗĕėëẻěȅȇẹệȩɇₑęḝḙḛ℮еԑѐӗᥱёἐἑἒἓἕℯ'], 57 | ['F', '🄵𐊇𝔉𝘍𐊥ꓝꞘ🅵🅕𝓕ⒻFғҒᖴƑԲϝቻḞℱϜ₣🇫Ⅎ'], 58 | ['f', '𝐟𝖋ⓕfƒḟʃբᶠ⒡ſꊰʄ∱ᶂ𝘧'], 59 | ['G', 'ꓖᏳ🄶Ꮐᏻ𝔾𝓖𝑮𝕲ꮐ𝒢𝙂𝖦𝙶𝔊𝐺𝐆🅶🅖ⒼGɢƓʛĢᘜᴳǴĠԌĜḠĞǦǤԍ₲🇬⅁'], 60 | ['g', 'ⓖgǵĝḡğġǧģց૭ǥɠﻭﻮᵍ⒢ℊɡᧁ𝐠𝑔𝒈𝓰𝔤𝕘𝖌𝗀𝗴𝘨𝙜𝚐'], 61 | ['H', 
'🄷𝜢ꓧ𝘏𝐻𝝜𝖧𐋏𝗛ꮋℍᎻℌⲎ𝑯𝞖🅷🅗ዞǶԋⒽHĤᚺḢḦȞḤḨḪĦⱧҢңҤῊΉῌἨἩἪἫἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟӉӈҥΉн卄♓𝓗ℋН𝐇𝙃𝙷ʜ𝛨Η𝚮ᕼӇᴴᵸ🇭'], 62 | ['h', 'Һ⒣ђⓗhĥḣḧȟḥḩḫẖħⱨհһከኩኪካɦℎ𝐡𝒉𝒽𝓱𝔥𝕙𝖍𝗁𝗵𝘩𝙝𝚑իʰᑋᗁɧんɥ'], 63 | ['I', '🄸ЇꀤᏆ🅸🅘إﺇٳأﺃٲٵⒾI៸ÌÍÎĨĪĬİÏḮỈǏȈȊỊĮḬƗェエῘῙῚΊἸἹἺἻἼἽἾⅠΪΊɪᶦᑊᥣ𝛪𝐈𝙄𝙸𝓵𝙡𝐼ᴵ𝚰𝑰🇮'], 64 | ['i', 'ⓘiìíîĩīĭïḯỉǐȉȋịḭῐῑῒΐῖῗἰἱἲⅰⅼ∣ⵏ│׀ا١۱ߊᛁἳἴἵɨіὶίᶖ𝔦𝚒𝝸𝗂𝐢𝕚𝖎𝗶𝘪𝙞ίⁱᵢ𝓲⒤'], 65 | ['J', '🄹🅹🅙ⒿJЈʝᒍנフĴʆวلյʖᴊᴶﻝጋɈⱼՂๅႱįᎫȷ丿ℐℑᒘᒙᒚᒛᒴᒵᒎᒏ🇯'], 66 | ['j', 'ⓙjϳʲ⒥ɉĵǰјڶᶨ𝒿𝘫𝗷𝑗𝙟𝔧𝒋𝗃𝓳𝕛𝚓𝖏𝐣'], 67 | ['K', '𝗞🄺𝜥𝘒ꓗ𝙆𝕂Ⲕ𝔎𝛫Ꮶ𝞙𝒦🅺🅚₭ⓀKĸḰќƘкҠκқҟӄʞҚКҡᴋᴷᵏ⒦ᛕЌጕḲΚKҜҝҞĶḴǨⱩϗӃ🇰'], 68 | ['k', 'ⓚkḱǩḳķḵƙⱪᶄ𝐤𝘬𝗄𝕜𝜅𝜘𝜿𝝒𝝹𝞌𝞳𝙠𝚔𝑘𝒌ϰ𝛋𝛞𝟆𝗸𝓴𝓀'], 69 | ['L', '🄻𐐛Ⳑ𝑳𝙻𐑃𝓛ⳑꮮᏞꓡ🅻🅛ﺈ└ⓁւLĿᒪ乚ՆʟꓶιԼᴸˡĹረḶₗΓլĻᄂⅬℒⱢᥧᥨᒻᒶᒷᶫﺎᒺᒹᒸᒫ⎳ㄥŁⱠﺄȽ🇱'], 70 | ['l', 'ⓛlŀĺľḷḹļӀℓḽḻłレɭƚɫⱡ|Ɩ⒧ʅǀוןΙІ|ᶩӏ𝓘𝕀𝖨𝗜𝘐𝐥𝑙𝒍𝓁𝔩𝕝𝖑𝗅𝗹𝘭𝚕𝜤𝝞ı𝚤ɩι𝛊𝜄𝜾𝞲'], 71 | ['M', '🄼𐌑𐊰ꓟⲘᎷ🅼🅜ⓂMмṂ൱ᗰ州ᘻო๓♏ʍᙏᴍᴹᵐ⒨ḾМṀ௱ⅯℳΜϺᛖӍӎ𝐌𝑀𝑴𝓜𝔐𝕄𝕸𝖬𝗠𝘔𝙈𝙼𝚳𝛭𝜧𝝡𝞛🇲'], 72 | ['m', '₥ᵯ𝖒𝐦𝗆𝔪𝕞𝓂ⓜmനᙢ൩ḿṁⅿϻṃጠɱ៳ᶆ𝙢𝓶𝚖𝑚𝗺᧕᧗'], 73 | ['N', '🄽ℕꓠ𝛮𝝢𝙽𝚴𝑵𝑁Ⲛ𝐍𝒩𝞜𝗡𝘕𝜨𝓝𝖭🅽₦🅝ЙЍⓃҋ៷NᴎɴƝᑎ几иՈռИהЛπᴺᶰŃ刀ክṄⁿÑПΝᴨոϖǸŇṆŅṊṈทŊӢӣӤӥћѝйᥢҊᴻ🇳'], 74 | ['n', 'ח𝒏𝓷𝙣𝑛𝖓𝔫𝗇𝚗𝗻ᥒⓝήnǹᴒńñᾗηṅňṇɲņṋṉղຖՌƞŋ⒩ภกɳпʼnлԉȠἠἡῃդᾐᾑᾒᾓᾔᾕᾖῄῆῇῂἢἣἤἥἦἧὴήበቡቢባቤብቦȵ𝛈𝜂𝜼𝝶𝞰𝕟𝘯𝐧𝓃ᶇᵰᥥ∩'], 75 | [ 76 | 'O', 77 | 'ꄲ🄾𐊒𝟬ꓳⲞ𐐄𐊫𐓂𝞞🅞⍥◯ⵁ⊖0⊝𝝤Ѳϴ𝚶𝜪ѺӦӨӪΌʘ𝐎ǑÒŎÓÔÕȌȎㇿ❍ⓄOὋロ❤૦⊕ØФԾΘƠᴼᵒ⒪ŐÖₒ¤◊Φ〇ΟОՕଠഠ௦סỒỐỖỔṌȬṎŌṐṒȮȰȪỎỜỚỠỞỢỌỘǪǬǾƟⵔ߀៰⍜⎔⎕⦰⦱⦲⦳⦴⦵⦶⦷⦸⦹⦺⦻⦼⦽⦾⦿⧀⧁⧂⧃ὈὉὊὌὍ', 78 | ], 79 | [ 80 | 'o', 81 | '𝚘𝛐𝗈𝞼ဝⲟ𝙤၀𐐬𝔬𐓪𝓸🇴⍤○ϙ🅾𝒪𝖮𝟢𝟶𝙾𝘰𝗼𝕠𝜊𝐨𝝾𝞸ᐤⓞѳ᧐ᥲðoఠᦞՓòөӧóºōôǒȏŏồốȍỗổõσṍȭṏὄṑṓȯȫ๏ᴏőöѻоዐǭȱ০୦٥౦೦൦๐໐οօᴑ०੦ỏơờớỡởợọộǫøǿɵծὀὁόὸόὂὃὅ', 82 | ], 83 | ['P', '🄿ꓑ𝚸𝙿𝞠𝙋ꮲⲢ𝒫𝝦𝑃𝑷𝗣𝐏𐊕𝜬𝘗𝓟𝖯𝛲Ꮲ🅟Ҏ🅿ⓅPƤᑭ尸Ṗրφքᴘᴾᵖ⒫ṔアקРየᴩⱣℙΡῬᑸᑶᑷᑹᑬᑮ🇵₱'], 84 | ['p', 'ҏ℗ⓟpṕṗƥᵽῥρрƿǷῤ⍴𝓹𝓅𝐩𝑝𝒑𝔭𝕡𝖕𝗉𝗽𝘱𝙥𝚙𝛒𝝆𝞺𝜌𝞀'], 85 | ['Q', '🅀🆀🅠ⓆQℚⵕԚ𝐐𝑄𝑸𝒬𝓠𝚀𝘘𝙌𝖰𝕼𝔔𝗤🇶'], 86 | ['q', 'ⓠqգ⒬۹զᑫɋɊԛ𝗊𝑞𝘲𝕢𝚚𝒒𝖖𝐪𝔮𝓺𝙦'], 87 | ['R', '℞℟ꭱᏒ𐒴ꮢᎡꓣ🆁🅡ⓇRᴙȒʀᖇя尺ŔЯરƦᴿዪṚɌʁℛℜℝṘŘȐṜŖṞⱤ𝐑𝑅𝑹𝓡𝕽𝖱𝗥𝘙𝙍𝚁ᚱ🇷ᴚ'], 88 | ['r', 'ⓡrŕṙřȑȓṛṝŗгՐɾᥬṟɍʳ⒭ɼѓᴦᶉ𝐫𝑟𝒓𝓇𝓻𝔯𝕣𝖗𝗋𝗿𝘳𝙧ᵲґᵣ'], 89 | ['S', '🅂ꇙ𝓢𝗦Ꮪ𝒮Ꮥ𝚂𝐒ꓢ𝖲𝔖𝙎𐊖𝕾𐐠𝘚𝕊𝑆𝑺🆂🅢ⓈSṨŞֆՏȘˢ⒮ЅṠŠŚṤŜṦṢടᔕᔖᔢᔡᔣᔤ'], 90 | ['s', 'ⓢꜱ𐑈ꮪsśṥŝṡšṧʂṣṩѕşșȿᶊక𝐬𝑠𝒔𝓈𝓼𝔰𝕤𝖘𝗌𝘀𝘴𝙨𝚜ގ🇸'], 91 | ['T', '🅃🆃𐌕𝚻𝛵𝕋𝕿𝑻𐊱𐊗𝖳𝙏🝨𝝩𝞣𝚃𝘛𝑇ꓔ⟙𝐓Ⲧ𝗧⊤𝔗Ꭲꭲ𝒯🅣⏇⏉ⓉTтҬҭƬイŦԵτᴛᵀイፕϮŤ⊥ƮΤТ下ṪṬȚŢṰṮ丅丁ᐪ𝛕𝜏𝝉𝞃𝞽𝓣ㄒ🇹ጥ'], 92 | ['t', 'ⓣtṫẗťṭțȶ੮էʇ†ţṱṯƭŧᵗ⒯ʈեƫ𝐭𝑡𝒕𝓉𝓽𝔱𝕥𝖙𝗍𝘁𝘵𝙩𝚝ナ'], 93 | ['U', '🅄ꓴ𐓎꒤🆄🅤ŨŬŮᑗᑘǓǕǗǙⓊUȖᑌ凵ƱմԱꓵЦŪՄƲᙀᵁᵘ⒰ŰપÜՍÙÚÛṸṺǛỦȔƯỪỨỮỬỰỤṲŲṶṴɄᥩᑧ∪ᘮ⋃𝐔𝑈𝑼𝒰𝓤𝔘𝕌𝖀𝖴𝗨𝘜𝙐𝚄🇺'], 94 | ['u', 
'ὺύⓤuùũūừṷṹŭǖữᥙǚǜὗυΰนսʊǘǔúůᴜűųยûṻцሁüᵾᵤµʋủȕȗưứửựụṳṵʉῠῡῢΰῦῧὐὑϋύὒὓὔὕὖᥔ𝐮𝑢𝒖𝓊𝓾𝔲𝕦𝖚𝗎ᶙ'], 95 | ['V', '🅅ꓦ𝑽𝖵𝘝Ꮩ𝚅𝙑𝐕🆅🅥ⓋVᐯѴᵛ⒱۷ṾⅴⅤṼ٧ⴸѶᐺᐻ🇻𝓥'], 96 | ['v', 'ሀⓥv𝜐𝝊ṽṿ౮งѵעᴠνטᵥѷ៴ᘁ𝙫𝚟𝛎𝜈𝝂𝝼𝞶𝘷𝘃𝓿'], 97 | ['W', '🅆ᏔᎳ𝑾ꓪ𝒲𝘞🆆Ⓦ🅦wWẂᾧᗯᥕ山ѠຟచաЩШώщฬшᙎᵂʷ⒲ฝሠẄԜẀŴẆẈധᘺѿᙡƜ₩🇼'], 98 | ['w', 'ẁꮃẃⓦ⍵ŵẇẅẘẉⱳὼὠὡὢὣωὤὥὦὧῲῳῴῶῷⱲѡԝᴡώᾠᾡᾢᾣᾤᾥᾦɯ𝝕𝟉𝞏'], 99 | ['X', '🞨🞩🞪🅇🞫🞬𐌗Ⲭꓫ𝖃𝞦𝘟𐊐𝚾𝝬𝜲Ꭓ𐌢𝖷𝑋𝕏𝔛𐊴𝗫🆇🅧❌Ⓧ𝓧XẊ᙭χㄨ𝒳ӾჯӼҳЖΧҲᵡˣ⒳אሸẌꊼⅩХ╳᙮ᕁᕽⅹᚷⵝ𝙓𝚇乂𝐗🇽'], 100 | ['x', 'ⓧxхẋ×ₓ⤫⤬⨯ẍᶍ𝙭ӽ𝘹𝐱𝚡⨰メ𝔁'], 101 | ['Y', 'Ⲩ𝚈𝑌𝗬𝐘ꓬ𝒀𝜰𐊲🆈🅨ⓎYὛƳㄚʏ⅄ϔ¥¥ՎϓγץӲЧЎሃŸɎϤΥϒҮỲÝŶỸȲẎỶỴῨῩῪΎὙὝὟΫΎӮӰҰұ𝕐🇾'], 102 | ['y', '🅈ᎽᎩⓨyỳýŷỹȳẏÿỷуყẙỵƴɏᵞɣʸᶌү⒴ӳӱӯўУʎ'], 103 | ['Z', '🅉ꓜ𝗭𝐙☡Ꮓ𝘡🆉🅩ⓏZẔƵ乙ẐȤᶻ⒵ŹℤΖŻŽẒⱫ🇿'], 104 | ['z', 'ꮓⓩzźẑżžẓẕƶȥɀᴢጊʐⱬᶎʑᙆ'], 105 | ]); 106 | -------------------------------------------------------------------------------- /src/transformer/resolve-confusables/index.ts: -------------------------------------------------------------------------------- 1 | import { remapCharactersTransformer } from '../remap-characters'; 2 | import { confusables } from './confusables'; 3 | 4 | /** 5 | * Creates a transformer that maps confusable Unicode characters to their 6 | * normalized equivalent. For example, `⓵`, `➊`, and `⑴` become `1` when using 7 | * this transformer. 8 | * 9 | * **Application order** 10 | * 11 | * It is recommended that this transformer be applied near the start of the 12 | * transformer chain. 13 | * 14 | * @example 15 | * ```typescript 16 | * const transformer = resolveConfusablesTransformer(); 17 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 18 | * ``` 19 | * @returns A container holding the transformer, which can then be passed to the 20 | * [[RegExpMatcher]]. 
21 | */ 22 | export function resolveConfusablesTransformer() { /* Thin factory: delegates entirely to remapCharactersTransformer, fed the confusables table above. */ 23 | return remapCharactersTransformer(confusables); 24 | } 25 | -------------------------------------------------------------------------------- /src/transformer/resolve-leetspeak/dictionary.ts: -------------------------------------------------------------------------------- 1 | export const dictionary = new Map([ /* Each normalized letter maps to a string listing the leet-speak characters that stand in for it. */ 2 | ['a', '@4'], 3 | ['c', '('], 4 | ['e', '3'], 5 | ['i', '1|!'], 6 | ['g', '6'], 7 | ['o', '0'], 8 | ['s', '$5'], 9 | ['t', '7'], 10 | ['z', '2'], 11 | ]); 12 | -------------------------------------------------------------------------------- /src/transformer/resolve-leetspeak/index.ts: -------------------------------------------------------------------------------- 1 | import { remapCharactersTransformer } from '../remap-characters'; 2 | import { dictionary } from './dictionary'; 3 | 4 | /** 5 | * Creates a transformer that maps leet-speak characters to their normalized 6 | * equivalent. For example, `$` becomes `s` when using this transformer. 7 | * 8 | * **Application order** 9 | * 10 | * It is recommended that this transformer be applied near the start of the 11 | * transformer chain, but after similar transformers that map characters to 12 | * other characters, such as the [[resolveConfusablesTransformer | transformer 13 | * that resolves confusable Unicode characters]]. 14 | * 15 | * @example 16 | * ```typescript 17 | * const transformer = resolveLeetSpeakTransformer(); 18 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 19 | * ``` 20 | * @returns A container holding the transformer, which can then be passed to the 21 | * [[RegExpMatcher]]. 
22 | */ 23 | export function resolveLeetSpeakTransformer() { /* Same shape as resolveConfusablesTransformer: delegates to the generic remapping transformer, here with the leet-speak dictionary. */ 24 | return remapCharactersTransformer(dictionary); 25 | } 26 | -------------------------------------------------------------------------------- /src/transformer/skip-non-alphabetic/index.ts: -------------------------------------------------------------------------------- 1 | import { isAlphabetic } from '../../util/Char'; 2 | import { createSimpleTransformer } from '../Transformers'; 3 | 4 | /** 5 | * Creates a transformer that skips non-alphabetic characters (`a`-`z`, 6 | * `A`-`Z`). This is useful when matching text on patterns that are solely 7 | * comprised of alphabetic characters (the pattern `hello` does not match 8 | * `h.e.l.l.o` by default, but does with this transformer). 9 | * 10 | * **Warning** 11 | * 12 | * This transformation is not part of the default set of transformations, as 13 | * there are some known rough edges with false negatives; see 14 | * [#23](https://github.com/jo3-l/obscenity/issues/23) and 15 | * [#46](https://github.com/jo3-l/obscenity/issues/46) on the GitHub issue 16 | * tracker. 17 | * 18 | * **Application order** 19 | * 20 | * It is recommended that this transformer be applied near the end of the 21 | * transformer chain, if at all. 22 | * 23 | * @example 24 | * ```typescript 25 | * const transformer = skipNonAlphabeticTransformer(); 26 | * const matcher = new RegExpMatcher({ ..., blacklistMatcherTransformers: [transformer] }); 27 | * ``` 28 | * @returns A container holding the transformer, which can then be passed to the 29 | * [[RegExpMatcher]]. 30 | */ 31 | export function skipNonAlphabeticTransformer() { 32 | return createSimpleTransformer((c) => (isAlphabetic(c) ? 
c : undefined)); /* Returning undefined from a simple transformer drops the character entirely. */ 33 | } 34 | -------------------------------------------------------------------------------- /src/transformer/to-ascii-lowercase/index.ts: -------------------------------------------------------------------------------- 1 | import { invertCaseOfAlphabeticChar, isUpperCase } from '../../util/Char'; 2 | import { createSimpleTransformer } from '../Transformers'; 3 | 4 | /** 5 | * Creates a transformer that changes all ASCII alphabet characters to 6 | * lower-case, leaving other characters unchanged. 7 | * 8 | * **Application order** 9 | * 10 | * It is recommended that this transformer be applied near the end of the 11 | * transformer chain. Using it before other transformers may have the effect of 12 | * making its changes useless as transformers applied after produce characters 13 | * of varying cases. 14 | * 15 | * @returns A container holding the transformer, which can then be passed to the 16 | * [[RegExpMatcher]]. 17 | */ 18 | export function toAsciiLowerCaseTransformer() { /* Only upper-case ASCII letters are remapped (via the case-bit flip in util/Char); all other characters pass through unchanged. */ 19 | return createSimpleTransformer((c) => (isUpperCase(c) ? 
invertCaseOfAlphabeticChar(c) : c)); 20 | } 21 | -------------------------------------------------------------------------------- /src/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.base.json", 3 | "include": ["."], 4 | "compilerOptions": { 5 | "outDir": "../dist", 6 | "rootDir": ".", 7 | "baseUrl": ".", 8 | "composite": true, 9 | "declaration": true 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/util/Char.ts: -------------------------------------------------------------------------------- 1 | export const enum CharacterCode { /* Character codes used throughout the library; a const enum, so members are inlined at compile time. */ 2 | LowerA = 97, 3 | LowerZ = 122, 4 | UpperA = 65, 5 | UpperZ = 90, 6 | 7 | Zero = 48, 8 | Nine = 57, 9 | 10 | LeftSquareBracket = 91, 11 | RightSquareBracket = 93, 12 | QuestionMark = 63, 13 | Backslash = 92, 14 | Newline = 10, 15 | VerticalBar = 124, 16 | 17 | HighSurrogateStart = 0xd800, 18 | HighSurrogateEnd = 0xdbff, 19 | LowSurrogateStart = 0xdc00, 20 | LowSurrogateEnd = 0xdfff, 21 | } 22 | 23 | export function isHighSurrogate(char: number) { /* UTF-16 high (leading) surrogate range check. */ 24 | return CharacterCode.HighSurrogateStart <= char && char <= CharacterCode.HighSurrogateEnd; 25 | } 26 | 27 | export function isLowSurrogate(char: number) { /* UTF-16 low (trailing) surrogate range check. */ 28 | return CharacterCode.LowSurrogateStart <= char && char <= CharacterCode.LowSurrogateEnd; 29 | } 30 | 31 | // See https://unicodebook.readthedocs.io/unicode_encodings.html#utf-16-surrogate-pairs. 
32 | export function convertSurrogatePairToCodePoint(highSurrogate: number, lowSurrogate: number) { /* Combines a high/low surrogate pair into the astral code point it encodes; formula from the link above. */ 33 | return ( 34 | (highSurrogate - CharacterCode.HighSurrogateStart) * 0x400 + 35 | lowSurrogate - 36 | CharacterCode.LowSurrogateStart + 37 | 0x10000 38 | ); 39 | } 40 | 41 | export function isWordChar(char: number) { /* ASCII letters and digits only. */ 42 | return isDigit(char) || isAlphabetic(char); 43 | } 44 | 45 | export function isDigit(char: number) { 46 | return CharacterCode.Zero <= char && char <= CharacterCode.Nine; 47 | } 48 | 49 | export function isAlphabetic(char: number) { 50 | return isLowerCase(char) || isUpperCase(char); 51 | } 52 | 53 | export function isLowerCase(char: number) { 54 | return CharacterCode.LowerA <= char && char <= CharacterCode.LowerZ; 55 | } 56 | 57 | export function isUpperCase(char: number) { 58 | return CharacterCode.UpperA <= char && char <= CharacterCode.UpperZ; 59 | } 60 | 61 | // Input must be a lower-case or upper-case ASCII alphabet character. 62 | export function invertCaseOfAlphabeticChar(char: number) { /* XOR with 0x20 toggles the ASCII case bit. */ 63 | return char ^ 0x20; 64 | } 65 | 66 | // Asserts that the string is comprised of one and only one code point, 67 | // then returns said code point. 68 | export function getAndAssertSingleCodePoint(str: string) { /* Spreading the string iterates by code point, so a surrogate pair counts as one. */ 69 | if ([...str].length !== 1) throw new RangeError(`Expected the input string to be one code point in length.`); 70 | return str.codePointAt(0)!; 71 | } 72 | -------------------------------------------------------------------------------- /src/util/CharacterIterator.ts: -------------------------------------------------------------------------------- 1 | import { convertSurrogatePairToCodePoint, isHighSurrogate, isLowSurrogate } from './Char'; 2 | 3 | export class CharacterIterator implements IterableIterator { 4 | private _input: string; 5 | 6 | private lastPosition = -1; 7 | 8 | private currentPosition = 0; 9 | 10 | private _lastWidth = 0; 11 | 12 | public constructor(input?: string) { /* Missing input defaults to the empty string. */ 13 | this._input = input ?? 
''; 14 | } 15 | 16 | public get input() { 17 | return this._input; 18 | } 19 | 20 | public setInput(input: string) { /* Swaps the input and rewinds all cursors; returns this for chaining. */ 21 | this._input = input; 22 | this.reset(); 23 | return this; 24 | } 25 | 26 | public reset() { /* Restores the pre-iteration state. */ 27 | this.lastPosition = -1; 28 | this.currentPosition = 0; 29 | this._lastWidth = 0; 30 | } 31 | 32 | public next(): IteratorResult { /* Consumes one code point: a single UTF-16 unit, or two units when a valid surrogate pair is found. */ 33 | if (this.done) return { done: true, value: undefined }; 34 | this.lastPosition = this.currentPosition; 35 | 36 | const char = this._input.charCodeAt(this.currentPosition++); 37 | this._lastWidth = 1; 38 | // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition 39 | if (this.done || !isHighSurrogate(char)) return { done: false, value: char }; 40 | 41 | // Do we have a surrogate pair? 42 | const next = this._input.charCodeAt(this.currentPosition); 43 | if (isLowSurrogate(next)) { 44 | this._lastWidth++; 45 | this.currentPosition++; 46 | return { done: false, value: convertSurrogatePairToCodePoint(char, next) }; 47 | } 48 | 49 | return { done: false, value: char }; 50 | } 51 | 52 | // Position of the iterator; equals the start index of the last character consumed. 53 | // -1 if no characters were consumed yet. 54 | public get position() { 55 | return this.lastPosition; 56 | } 57 | 58 | // Width of the last character consumed; 2 if it was a surrogate pair and 1 otherwise. 59 | // 0 if no characters were consumed yet. 
60 | public get lastWidth() { 61 | return this._lastWidth; 62 | } 63 | 64 | public get done() { /* True once every code unit has been consumed. */ 65 | return this.currentPosition >= this._input.length; 66 | } 67 | 68 | public [Symbol.iterator]() { /* The iterator is itself iterable, enabling for..of and spreads. */ 69 | return this; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/util/Interval.ts: -------------------------------------------------------------------------------- 1 | export function compareIntervals(lowerBound0: number, upperBound0: number, lowerBound1: number, upperBound1: number) { /* Orders by lower bound, then upper bound; returns -1/0/1 like a standard comparator. */ 2 | if (lowerBound0 < lowerBound1) return -1; 3 | if (lowerBound1 < lowerBound0) return 1; 4 | if (upperBound0 < upperBound1) return -1; 5 | if (upperBound1 < upperBound0) return 1; 6 | return 0; 7 | } 8 | 9 | export type Interval = [lowerBound: number, upperBound: number]; 10 | -------------------------------------------------------------------------------- /test/censor/BuiltinStrategies.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | asteriskCensorStrategy, 3 | fixedCharCensorStrategy, 4 | fixedPhraseCensorStrategy, 5 | grawlixCensorStrategy, 6 | keepEndCensorStrategy, 7 | keepStartCensorStrategy, 8 | randomCharFromSetCensorStrategy, 9 | } from '../../src/censor/BuiltinStrategies'; 10 | import type { CensorContext } from '../../src/censor/TextCensor'; 11 | 12 | const partialCtx = { 13 | input: '', 14 | overlapsAtStart: false, 15 | overlapsAtEnd: false, 16 | termId: -1, 17 | startIndex: 0, 18 | endIndex: 0, 19 | }; 20 | 21 | describe('keepStartCensorStrategy()', () => { 22 | const baseStrategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength)); 23 | 24 | afterEach(() => { 25 | baseStrategy.mockClear(); 26 | }); 27 | 28 | it('should call the base strategy with the same arguments if overlapsAtStart is true', () => { 29 | const strategy = keepStartCensorStrategy(baseStrategy); 30 | const res = strategy({ ...partialCtx, matchLength: 5, overlapsAtStart: true }); 31 
| expect(res).toBe('.....'); 32 | expect(baseStrategy).toHaveBeenCalledTimes(1); 33 | expect(baseStrategy).toHaveBeenLastCalledWith({ ...partialCtx, matchLength: 5, overlapsAtStart: true }); 34 | }); 35 | 36 | it('should call the base strategy with matchLength-1 and add the first character of the matched region', () => { 37 | const strategy = keepStartCensorStrategy(baseStrategy); 38 | const ctx = { 39 | input: 'hello world!', 40 | overlapsAtStart: false, 41 | overlapsAtEnd: false, 42 | termId: -1, 43 | startIndex: 6, 44 | endIndex: 10, 45 | matchLength: 5, 46 | }; 47 | const res = strategy(ctx); 48 | expect(res).toBe('w....'); 49 | expect(baseStrategy).toHaveBeenCalledTimes(1); 50 | expect(baseStrategy).toHaveBeenLastCalledWith({ ...ctx, matchLength: 4 }); 51 | }); 52 | }); 53 | 54 | describe('keepEndCensorStrategy()', () => { 55 | const baseStrategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength)); 56 | 57 | afterEach(() => { 58 | baseStrategy.mockClear(); 59 | }); 60 | 61 | it('should call the base strategy with the same arguments if overlapsAtEnd is true', () => { 62 | const strategy = keepEndCensorStrategy(baseStrategy); 63 | const res = strategy({ ...partialCtx, matchLength: 5, overlapsAtEnd: true }); 64 | expect(res).toBe('.....'); 65 | expect(baseStrategy).toHaveBeenCalledTimes(1); 66 | expect(baseStrategy).toHaveBeenLastCalledWith({ ...partialCtx, matchLength: 5, overlapsAtEnd: true }); 67 | }); 68 | 69 | it('should call the base strategy with matchLength-1 and add the last character of the matched region', () => { 70 | const strategy = keepEndCensorStrategy(baseStrategy); 71 | const ctx = { 72 | input: 'hello world!', 73 | overlapsAtStart: false, 74 | overlapsAtEnd: false, 75 | termId: -1, 76 | startIndex: 6, 77 | endIndex: 10, 78 | matchLength: 5, 79 | }; 80 | const res = strategy(ctx); 81 | expect(res).toBe('....d'); 82 | expect(baseStrategy).toHaveBeenCalledTimes(1); 83 | expect(baseStrategy).toHaveBeenLastCalledWith({ ...ctx, 
matchLength: 4 }); 84 | }); 85 | }); 86 | 87 | describe('asteriskCensorStrategy()', () => { 88 | it('should return strings that are made up of asterisks', () => { 89 | const strategy = asteriskCensorStrategy(); 90 | expect(strategy({ ...partialCtx, matchLength: 8 })).toBe('********'); 91 | }); 92 | }); 93 | 94 | describe('grawlixCensorStrategy()', () => { 95 | it('should return strings that have characters taken from the charset %@$&*', () => { 96 | const charset = '%@$&*'; 97 | const strategy = grawlixCensorStrategy(); 98 | expect([...strategy({ ...partialCtx, matchLength: 20 })].every((c) => charset.includes(c))).toBeTruthy(); 99 | }); 100 | }); 101 | 102 | describe('fixedPhraseCensorStrategy()', () => { 103 | it('should simply return the phrase given', () => { 104 | const strategy = fixedPhraseCensorStrategy('fixed phrase'); 105 | expect(strategy({ ...partialCtx, matchLength: 30 })).toBe('fixed phrase'); 106 | }); 107 | }); 108 | 109 | describe('fixedCharCensorStrategy()', () => { 110 | it('should throw if the input string was empty', () => { 111 | expect(() => fixedCharCensorStrategy('')).toThrow( 112 | new RangeError(`Expected the input string to be one code point in length.`), 113 | ); 114 | }); 115 | 116 | it('should throw if the input string was comprised of more than one code point', () => { 117 | expect(() => fixedCharCensorStrategy('ab')).toThrow( 118 | new RangeError(`Expected the input string to be one code point in length.`), 119 | ); 120 | }); 121 | 122 | it('should not throw if the input string was a surrogate pair', () => { 123 | expect(() => fixedCharCensorStrategy('🌉')).not.toThrow(); 124 | }); 125 | 126 | it('should return the input string repeated N times (where N is the match length)', () => { 127 | const strategy = fixedCharCensorStrategy('x'); 128 | expect(strategy({ ...partialCtx, matchLength: 7 })).toBe('xxxxxxx'); 129 | }); 130 | }); 131 | 132 | describe('randomCharFromSetCensorStrategy()', () => { 133 | it('should throw if the charset 
has less than 2 characters', () => { 134 | expect(() => randomCharFromSetCensorStrategy('')).toThrow( 135 | new Error('The character set passed must have at least 2 characters.'), 136 | ); 137 | expect(() => randomCharFromSetCensorStrategy('a')).toThrow( 138 | new Error('The character set passed must have at least 2 characters.'), 139 | ); 140 | }); 141 | 142 | it('should work for matchLength 0', () => { 143 | const strategy = randomCharFromSetCensorStrategy('abcdefghijk'); 144 | expect(strategy({ ...partialCtx, matchLength: 0 })).toBe(''); 145 | }); 146 | 147 | it('should return N characters (where N is the match length) from the set of characters given', () => { 148 | const charset = 'abcdefghijk'; 149 | const strategy = randomCharFromSetCensorStrategy(charset); 150 | expect([...strategy({ ...partialCtx, matchLength: 5 })].every((c) => charset.includes(c))).toBeTruthy(); 151 | }); 152 | 153 | it('should not repeat the same character twice in a row', () => { 154 | const strategy = randomCharFromSetCensorStrategy('ab'); 155 | for (let i = 0; i < 100; i++) { 156 | expect(['aba', 'bab']).toContain(strategy({ ...partialCtx, matchLength: 3 })); 157 | } 158 | }); 159 | }); 160 | -------------------------------------------------------------------------------- /test/censor/TextCensor.test.ts: -------------------------------------------------------------------------------- 1 | import { grawlixCensorStrategy } from '../../src/censor/BuiltinStrategies'; 2 | import type { CensorContext } from '../../src/censor/TextCensor'; 3 | import { TextCensor } from '../../src/censor/TextCensor'; 4 | 5 | describe('TextCensor#setStrategy()', () => { 6 | it('should return the text censor', () => { 7 | const censor = new TextCensor(); 8 | expect(censor.setStrategy(grawlixCensorStrategy())).toStrictEqual(censor); 9 | }); 10 | }); 11 | 12 | describe('TextCensor#applyTo()', () => { 13 | const strategy = jest.fn().mockImplementation((k) => '.'.repeat(k.matchLength)); 14 | 15 | afterEach(() => { 
16 | strategy.mockClear(); 17 | }); 18 | 19 | it('should return the input unmodified if there are no matches', () => { 20 | const censor = new TextCensor().setStrategy(strategy); 21 | expect(censor.applyTo('text', [])).toBe('text'); 22 | expect(strategy).not.toHaveBeenCalled(); 23 | }); 24 | 25 | it('should call the strategy for each non-overlapping match interval (no overlaps, 1 match)', () => { 26 | const censor = new TextCensor().setStrategy(strategy); 27 | const firstMatch = { termId: 0, matchLength: 11, startIndex: 3, endIndex: 13 }; 28 | expect(censor.applyTo('my interesting input', [firstMatch])).toBe('my ........... input'); 29 | expect(strategy).toHaveBeenCalledTimes(1); 30 | expect(strategy).toHaveBeenLastCalledWith({ 31 | ...firstMatch, 32 | input: 'my interesting input', 33 | overlapsAtStart: false, 34 | overlapsAtEnd: false, 35 | }); 36 | }); 37 | 38 | it('should call the strategy for each non-overlapping match interval (no overlaps, 3 matches)', () => { 39 | const censor = new TextCensor().setStrategy(strategy); 40 | const firstMatch = { termId: 0, matchLength: 4, startIndex: 0, endIndex: 3 }; 41 | const secondMatch = { termId: 0, matchLength: 2, startIndex: 8, endIndex: 9 }; 42 | const thirdMatch = { termId: 0, matchLength: 5, startIndex: 22, endIndex: 26 }; 43 | expect(censor.applyTo('this is my intriguing input', [firstMatch, secondMatch, thirdMatch])).toBe( 44 | '.... is .. 
intriguing .....', 45 | ); 46 | expect(strategy).toHaveBeenCalledTimes(3); 47 | expect(strategy).toHaveBeenNthCalledWith(1, { 48 | ...firstMatch, 49 | input: 'this is my intriguing input', 50 | overlapsAtStart: false, 51 | overlapsAtEnd: false, 52 | }); 53 | expect(strategy).toHaveBeenNthCalledWith(2, { 54 | ...secondMatch, 55 | input: 'this is my intriguing input', 56 | overlapsAtStart: false, 57 | overlapsAtEnd: false, 58 | }); 59 | expect(strategy).toHaveBeenNthCalledWith(3, { 60 | ...thirdMatch, 61 | input: 'this is my intriguing input', 62 | overlapsAtStart: false, 63 | overlapsAtEnd: false, 64 | }); 65 | }); 66 | 67 | it('should call the strategy for each non-overlapping match interval (some overlaps, 2 matches)', () => { 68 | const censor = new TextCensor().setStrategy(strategy); 69 | const firstMatch = { termId: 0, matchLength: 5, startIndex: 0, endIndex: 4 }; 70 | const secondMatch = { termId: 0, matchLength: 8, startIndex: 0, endIndex: 7 }; 71 | expect(censor.applyTo('thinking of good test data is hard', [firstMatch, secondMatch])).toBe( 72 | '............. 
of good test data is hard', 73 | ); 74 | expect(strategy).toHaveBeenCalledTimes(2); 75 | expect(strategy).toHaveBeenNthCalledWith(1, { 76 | ...firstMatch, 77 | input: 'thinking of good test data is hard', 78 | overlapsAtStart: false, 79 | overlapsAtEnd: true, 80 | }); 81 | expect(strategy).toHaveBeenNthCalledWith(2, { 82 | ...secondMatch, 83 | input: 'thinking of good test data is hard', 84 | startIndex: 5, 85 | overlapsAtStart: true, 86 | overlapsAtEnd: false, 87 | }); 88 | }); 89 | 90 | it('should not call the strategy for matched intervals which are completely contained in another one', () => { 91 | const censor = new TextCensor().setStrategy(strategy); 92 | const firstMatch = { termId: 0, matchLength: 2, startIndex: 1, endIndex: 2 }; 93 | const secondMatch = { termId: 0, matchLength: 1, startIndex: 2, endIndex: 2 }; 94 | expect(censor.applyTo('tests', [firstMatch, secondMatch])).toBe('t..ts'); 95 | expect(strategy).toHaveBeenCalledTimes(1); 96 | expect(strategy).toHaveBeenLastCalledWith({ 97 | ...firstMatch, 98 | input: 'tests', 99 | overlapsAtStart: false, 100 | overlapsAtEnd: false, 101 | }); 102 | }); 103 | 104 | it('should not call the strategy for matched intervals which are equal to some other one', () => { 105 | const censor = new TextCensor().setStrategy(strategy); 106 | const firstMatch = { termId: 0, matchLength: 3, startIndex: 1, endIndex: 3 }; 107 | const secondMatch = { termId: 1, matchLength: 3, startIndex: 1, endIndex: 3 }; 108 | expect(censor.applyTo('heretical', [firstMatch, secondMatch])).toBe('h...tical'); 109 | expect(strategy).toHaveBeenCalledTimes(1); 110 | expect(strategy).toHaveBeenLastCalledWith({ 111 | ...firstMatch, 112 | input: 'heretical', 113 | overlapsAtStart: false, 114 | overlapsAtEnd: false, 115 | }); 116 | }); 117 | }); 118 | -------------------------------------------------------------------------------- /test/jest.setup.ts: -------------------------------------------------------------------------------- 1 | expect.extend({ 2 
| toBePermutationOf(this: jest.MatcherContext, received: T[], expected: T[]) { 3 | const options = { 4 | isNot: this.isNot, 5 | promise: this.promise, 6 | }; 7 | 8 | if (received.length !== expected.length) { 9 | return { 10 | message: () => `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)} 11 | 12 | Expected: array of length ${expected.length} (${this.utils.printExpected(expected)}) 13 | Received: array of length ${received.length} (${this.utils.printReceived(received)})`, 14 | pass: false, 15 | }; 16 | } 17 | 18 | const copy = [...expected]; 19 | let maxIndex = expected.length - 1; 20 | 21 | for (const element of received) { 22 | // See if there's an element in expected that hasn't been used yet and is 23 | // deeply equal to the current value. 24 | let pass = false; 25 | for (let i = maxIndex; i >= 0; i--) { 26 | pass = this.equals(element, copy[i]); 27 | if (pass) { 28 | // Swap the current element with the one at the maximum index, 29 | // then mark the maximum index as unusable. 30 | // This ensures that we don't mark two values in received as equal 31 | // to the same value in expected. 32 | copy[i] = copy[maxIndex--]; 33 | break; 34 | } 35 | } 36 | 37 | if (!pass) { 38 | // No value in expected is deeply equal to the current value in received. 
39 | const message = () => { 40 | return `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)} 41 | 42 | Expected: a permutation of ${this.utils.printExpected(expected)} 43 | Received: ${this.utils.printReceived(received)}`; 44 | }; /* fix: the failure message previously leaked a stray '}' and a trailing blank line into the template literal; it now ends cleanly after the Received line, matching the pass-true message below. */ 45 | 46 | 47 | 48 | return { message, pass: false }; 49 | } 50 | } 51 | 52 | return { 53 | message: () => `${this.utils.matcherHint('toBePermutationOf', undefined, undefined, options)} 54 | 55 | Expected: not a permutation of ${this.utils.printExpected(expected)} 56 | Received: ${this.utils.printReceived(received)}`, 57 | pass: true, 58 | }; 59 | }, 60 | }); 61 | 62 | declare global { 63 | // eslint-disable-next-line @typescript-eslint/no-namespace 64 | namespace jest { 65 | interface Matchers { 66 | toBePermutationOf(expected: readonly any[]): R; 67 | } 68 | } 69 | } 70 | 71 | export {}; 72 | -------------------------------------------------------------------------------- /test/matcher/BlacklistedTerm.test.ts: -------------------------------------------------------------------------------- 1 | import { assignIncrementingIds } from '../../src/matcher/BlacklistedTerm'; 2 | import { pattern } from '../../src/pattern/Pattern'; 3 | 4 | describe('assignIncrementingIds()', () => { 5 | it('should assign incrementing, unique IDs to the input patterns', () => { 6 | const firstPattern = pattern`|world|`; 7 | const secondPattern = pattern`:D`; 8 | const thirdPattern = pattern`??`; 9 | const fourthPattern = pattern`hmm interesting`; 10 | expect(assignIncrementingIds([firstPattern, secondPattern, thirdPattern, fourthPattern])).toStrictEqual([ 11 | { id: 0, pattern: firstPattern }, 12 | { id: 1, pattern: secondPattern }, 13 | { id: 2, pattern: thirdPattern }, 14 | { id: 3, pattern: fourthPattern }, 15 | ]); 16 | }); 17 | }); 18 | -------------------------------------------------------------------------------- /test/matcher/IntervalCollection.test.ts: 
-------------------------------------------------------------------------------- 1 | import { IntervalCollection } from '../../src/matcher/IntervalCollection'; 2 | 3 | let coll: IntervalCollection; 4 | 5 | beforeEach(() => { 6 | coll = new IntervalCollection(); 7 | }); 8 | 9 | describe('IntervalCollection#insert()', () => { 10 | it('should add the interval to the collection', () => { 11 | coll.insert(5, 10); 12 | expect([...coll]).toBePermutationOf([[5, 10]]); 13 | coll.insert(12, 13); 14 | expect([...coll]).toBePermutationOf([ 15 | [12, 13], 16 | [5, 10], 17 | ]); 18 | }); 19 | }); 20 | 21 | describe('IntervalCollection#query()', () => { 22 | it('should return false if the input interval does not intersect any of the stored intervals', () => { 23 | coll.insert(5, 10); 24 | coll.insert(13, 14); 25 | coll.insert(17, 19); 26 | expect(coll.query(3, 4)).toBeFalsy(); 27 | }); 28 | 29 | it('should return false if the interval collection is empty', () => { 30 | expect(coll.query(0, 0)).toBeFalsy(); 31 | }); 32 | 33 | it('should return true if there is some interval stored that such that the input interval is a subset of it', () => { 34 | coll.insert(8, 9); 35 | coll.insert(10, 12); 36 | coll.insert(13, 17); 37 | expect(coll.query(14, 15)).toBeTruthy(); 38 | }); 39 | 40 | it('should return false if the input interval simply overlaps with some of the stored intervals', () => { 41 | coll.insert(17, 19); 42 | coll.insert(20, 24); 43 | coll.insert(25, 44); 44 | expect(coll.query(34, 45)).toBeFalsy(); 45 | }); 46 | }); 47 | 48 | it('should be iterable', () => { 49 | coll.insert(30, 35); 50 | coll.insert(47, 49); 51 | coll.insert(98, 99); 52 | expect([...coll]).toBePermutationOf([ 53 | [30, 35], 54 | [47, 49], 55 | [98, 99], 56 | ]); 57 | }); 58 | -------------------------------------------------------------------------------- /test/matcher/MatchPayload.test.ts: -------------------------------------------------------------------------------- 1 | import { 
compareMatchByPositionAndId } from '../../src/matcher/MatchPayload'; 2 | import { compareIntervals as _compareIntervals } from '../../src/util/Interval'; 3 | 4 | jest.mock('../../src/util/Interval', () => ({ compareIntervals: jest.fn().mockReturnValue(0) })); 5 | 6 | const compareIntervals = _compareIntervals as jest.MockedFunction; 7 | 8 | afterEach(() => { 9 | compareIntervals.mockClear(); 10 | }); 11 | 12 | describe('compareMatchByPositionAndId()', () => { 13 | const termIdAndMatchLen = { termId: -1, matchLength: 0 }; 14 | 15 | it('should call compareIntervals() and return its result if not zero', () => { 16 | compareIntervals.mockImplementationOnce(() => -1); 17 | expect( 18 | compareMatchByPositionAndId( 19 | { ...termIdAndMatchLen, startIndex: 5, endIndex: 7 }, 20 | { ...termIdAndMatchLen, startIndex: 6, endIndex: 8 }, 21 | ), 22 | ).toBe(-1); 23 | expect(compareIntervals).toHaveBeenCalledTimes(1); 24 | expect(compareIntervals).toHaveBeenLastCalledWith(5, 7, 6, 8); 25 | }); 26 | 27 | const startAndEndIdxAndMatchLen = { startIndex: 0, endIndex: 0, matchLength: 0 }; 28 | 29 | it("should return -1 if the first match payload's term ID is less than the second's and their positions are identical", () => { 30 | expect( 31 | compareMatchByPositionAndId( 32 | { ...startAndEndIdxAndMatchLen, termId: 0 }, 33 | { ...startAndEndIdxAndMatchLen, termId: 3 }, 34 | ), 35 | ).toBe(-1); 36 | }); 37 | 38 | it("should return 1 if the first match payload's term ID is less than the second's and their positions are identical", () => { 39 | expect( 40 | compareMatchByPositionAndId( 41 | { ...startAndEndIdxAndMatchLen, termId: 50 }, 42 | { ...startAndEndIdxAndMatchLen, termId: 30 }, 43 | ), 44 | ).toBe(1); 45 | }); 46 | 47 | it("should return 0 if the first match payload's term ID is equal to the first's and their positions are identical", () => { 48 | expect( 49 | compareMatchByPositionAndId( 50 | { ...startAndEndIdxAndMatchLen, termId: 34 }, 51 | { ...startAndEndIdxAndMatchLen, 
termId: 34 }, 52 | ), 53 | ).toBe(0); 54 | }); 55 | }); 56 | -------------------------------------------------------------------------------- /test/pattern/ParserError.test.ts: -------------------------------------------------------------------------------- 1 | import { ParserError } from '../../src/pattern/ParserError'; 2 | 3 | describe('ParserError#name', () => { 4 | it("should be equal to 'ParserError'", () => { 5 | const err = new ParserError('', 0, 0); 6 | expect(err.name).toBe('ParserError'); 7 | }); 8 | }); 9 | 10 | describe('ParserError#line', () => { 11 | it('should be equal to the value passed to the constructor', () => { 12 | const err = new ParserError('', 1, 0); 13 | expect(err.line).toBe(1); 14 | }); 15 | }); 16 | 17 | describe('ParserError#column', () => { 18 | it('should be equal to the value passed to the constructor', () => { 19 | const err = new ParserError('', 0, 500); 20 | expect(err.column).toBe(500); 21 | }); 22 | }); 23 | 24 | describe('ParserError#message', () => { 25 | it("should be in the format 'line:column: message'", () => { 26 | const err = new ParserError('hi', 1, 10); 27 | expect(err.message).toBe('1:10: hi'); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /test/pattern/Pattern.test.ts: -------------------------------------------------------------------------------- 1 | import { Parser } from '../../src/pattern/Parser'; 2 | import { parseRawPattern, pattern } from '../../src/pattern/Pattern'; 3 | 4 | const parser = new Parser(); 5 | 6 | describe('pattern template tag', () => { 7 | it('should parse the pattern given', () => { 8 | expect(pattern`hello world?`).toStrictEqual(parser.parse('hello world?')); 9 | }); 10 | 11 | it('should not require double-escaping backslashes', () => { 12 | expect(pattern`hello escaped \[ :D`).toStrictEqual(parser.parse('hello escaped \\[ :D')); 13 | }); 14 | 15 | it('should interpolate one expression appropriately', () => { 16 | const value = 123; 
17 | expect(pattern`value=${value}`).toStrictEqual(parser.parse('value=123')); 18 | }); 19 | 20 | it('should interpolate many expressions appropriately', () => { 21 | const value0 = 123; 22 | const value1 = 234; 23 | expect(pattern`value0=${value0} value1=${value1} something after :)`).toStrictEqual( 24 | parser.parse('value0=123 value1=234 something after :)'), 25 | ); 26 | }); 27 | 28 | it('should work with empty strings', () => { 29 | expect(pattern``).toStrictEqual(parser.parse('')); 30 | }); 31 | }); 32 | 33 | describe('parseRawPattern()', () => { 34 | it('should parse the string given', () => { 35 | expect(parseRawPattern('h[i] ?')).toStrictEqual(parser.parse('h[i] ?')); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /test/pattern/Util.test.ts: -------------------------------------------------------------------------------- 1 | import type { LiteralNode, OptionalNode } from '../../src/pattern/Nodes'; 2 | import { SyntaxKind } from '../../src/pattern/Nodes'; 3 | import { compilePatternToRegExp, getRegExpStringForNode, potentiallyMatchesEmptyString } from '../../src/pattern/Util'; 4 | import { CharacterIterator } from '../../src/util/CharacterIterator'; 5 | 6 | function toLiteralNode(str: string): LiteralNode { 7 | return { kind: SyntaxKind.Literal, chars: [...new CharacterIterator(str)] }; 8 | } 9 | 10 | describe('potentiallyMatchesEmptyString()', () => { 11 | it('should return false for patterns with wildcards', () => { 12 | expect( 13 | potentiallyMatchesEmptyString({ 14 | requireWordBoundaryAtStart: false, 15 | requireWordBoundaryAtEnd: false, 16 | nodes: [{ kind: SyntaxKind.Wildcard }], 17 | }), 18 | ).toBeFalsy(); 19 | }); 20 | 21 | it('should return false for literal patterns', () => { 22 | expect( 23 | potentiallyMatchesEmptyString({ 24 | requireWordBoundaryAtStart: false, 25 | requireWordBoundaryAtEnd: false, 26 | nodes: [toLiteralNode('foo')], 27 | }), 28 | ).toBeFalsy(); 29 | }); 30 | 31 | 
it('should return false for patterns composed of combo of literals and optionals', () => { 32 | expect( 33 | potentiallyMatchesEmptyString({ 34 | requireWordBoundaryAtStart: false, 35 | requireWordBoundaryAtEnd: false, 36 | nodes: [toLiteralNode('foo'), { kind: SyntaxKind.Optional, childNode: toLiteralNode('bar') }], 37 | }), 38 | ).toBeFalsy(); 39 | }); 40 | 41 | it('should return true for patterns solely composed of optionals', () => { 42 | expect( 43 | potentiallyMatchesEmptyString({ 44 | requireWordBoundaryAtStart: false, 45 | requireWordBoundaryAtEnd: false, 46 | nodes: [ 47 | { kind: SyntaxKind.Optional, childNode: { kind: SyntaxKind.Wildcard } }, 48 | { kind: SyntaxKind.Optional, childNode: toLiteralNode('bar') }, 49 | ], 50 | }), 51 | ).toBeTruthy(); 52 | }); 53 | 54 | it('should return true for empty patterns', () => { 55 | expect( 56 | potentiallyMatchesEmptyString({ requireWordBoundaryAtStart: false, requireWordBoundaryAtEnd: false, nodes: [] }), 57 | ).toBeTruthy(); 58 | }); 59 | }); 60 | 61 | describe('compilePatternToRegExp()', () => { 62 | it('should add \\b at the begin if requireWordBoundaryAtStart is true', () => { 63 | const regExp = compilePatternToRegExp({ 64 | requireWordBoundaryAtStart: true, 65 | requireWordBoundaryAtEnd: false, 66 | nodes: [toLiteralNode('bye')], 67 | }); 68 | expect(regExp.source).toBe('\\bbye'); 69 | }); 70 | 71 | it('should add a \\b at the end if requireWordBoundaryAtEnd is true', () => { 72 | const regExp = compilePatternToRegExp({ 73 | requireWordBoundaryAtStart: false, 74 | requireWordBoundaryAtEnd: true, 75 | nodes: [toLiteralNode('hi')], 76 | }); 77 | expect(regExp.source).toBe('hi\\b'); 78 | }); 79 | 80 | it('should return the regexp with dotall and global flags on', () => { 81 | const regExp = compilePatternToRegExp({ 82 | requireWordBoundaryAtStart: false, 83 | requireWordBoundaryAtEnd: true, 84 | nodes: [toLiteralNode('yo'), { kind: SyntaxKind.Wildcard }], 85 | }); 86 | expect(regExp.dotAll).toBeTruthy(); 87 | 
expect(regExp.global).toBeTruthy(); 88 | }); 89 | }); 90 | 91 | describe('getRegExpStringForNode()', () => { 92 | describe('literals', () => { 93 | it('should return the text of the string directly if it contains no special chars', () => { 94 | expect(getRegExpStringForNode(toLiteralNode('hi'))).toBe('hi'); 95 | expect(getRegExpStringForNode(toLiteralNode(':D'))).toBe(':D'); 96 | expect(getRegExpStringForNode(toLiteralNode('🌉'))).toBe('🌉'); 97 | }); 98 | 99 | it('should escape special characters with a backslash', () => { 100 | expect(getRegExpStringForNode(toLiteralNode('['))).toBe('\\['); 101 | expect(getRegExpStringForNode(toLiteralNode('.'))).toBe('\\.'); 102 | expect(getRegExpStringForNode(toLiteralNode('hi?'))).toBe('hi\\?'); 103 | }); 104 | }); 105 | 106 | describe('optionals', () => { 107 | it('should return (?:inner)?', () => { 108 | const optional: OptionalNode = { kind: SyntaxKind.Optional, childNode: toLiteralNode('hello') }; 109 | expect(getRegExpStringForNode(optional)).toBe('(?:hello)?'); 110 | }); 111 | }); 112 | 113 | describe('wildcards', () => { 114 | it('should return a dot', () => { 115 | expect(getRegExpStringForNode({ kind: SyntaxKind.Wildcard })).toBe('.'); 116 | }); 117 | }); 118 | }); 119 | -------------------------------------------------------------------------------- /test/transformer/TransformerSet.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerSet } from '../../src/transformer/TransformerSet'; 2 | import type { StatefulTransformer } from '../../src/transformer/Transformers'; 3 | import { createSimpleTransformer, createStatefulTransformer } from '../../src/transformer/Transformers'; 4 | 5 | it('should create multiple instances of stateful transformers', () => { 6 | const spy = jest.fn(); 7 | class MyTransformer implements StatefulTransformer { 8 | public constructor() { 9 | spy(); 10 | } 11 | 12 | public transform() { 13 | return 0; 14 | } 15 | 16 | public reset() { 17 
| // do nothing 18 | } 19 | } 20 | 21 | const transformer = createStatefulTransformer(() => new MyTransformer()); 22 | new TransformerSet([transformer]); 23 | expect(spy).toHaveBeenCalledTimes(1); 24 | new TransformerSet([transformer]); 25 | expect(spy).toHaveBeenCalledTimes(2); 26 | }); 27 | 28 | describe('TransformerSet#applyTo()', () => { 29 | it('should be a noop if no transformers were provided', () => { 30 | expect(new TransformerSet([]).applyTo(32)).toBe(32); 31 | }); 32 | 33 | it('should work with simple transformers', () => { 34 | const fn = jest.fn((c: number) => c + 1); 35 | expect(new TransformerSet([createSimpleTransformer(fn)]).applyTo(5)).toBe(6); 36 | expect(fn).toHaveBeenCalledTimes(1); 37 | expect(fn).toHaveBeenLastCalledWith(5); 38 | }); 39 | 40 | it('should work with stateful transformers', () => { 41 | const instance = { 42 | transform: jest.fn((c) => c + 1), 43 | reset: jest.fn(), 44 | }; 45 | expect(new TransformerSet([createStatefulTransformer(() => instance)]).applyTo(7)).toBe(8); 46 | expect(instance.transform).toHaveBeenCalledTimes(1); 47 | expect(instance.transform).toHaveBeenLastCalledWith(7); 48 | expect(instance.reset).not.toHaveBeenCalled(); 49 | }); 50 | 51 | it('should pass the transformed value to the next transformer', () => { 52 | const fn0 = jest.fn((c: number) => c + 1); 53 | const fn1 = jest.fn((c: number) => c + 2); 54 | expect(new TransformerSet([createSimpleTransformer(fn0), createSimpleTransformer(fn1)]).applyTo(5)).toBe(8); 55 | expect(fn0).toHaveBeenCalledTimes(1); 56 | expect(fn0).toHaveBeenLastCalledWith(5); 57 | expect(fn1).toHaveBeenCalledTimes(1); 58 | expect(fn1).toHaveBeenLastCalledWith(6); 59 | }); 60 | 61 | it('should short circuit if a transformer returns undefined', () => { 62 | const fn0 = jest.fn((c: number) => c + 1); 63 | const fn1 = jest.fn(() => undefined); 64 | const fn2 = jest.fn((c: number) => c + 3); 65 | expect( 66 | new TransformerSet([ 67 | createSimpleTransformer(fn0), 68 | 
createSimpleTransformer(fn1), 69 | createSimpleTransformer(fn2), 70 | ]).applyTo(6), 71 | ).toBeUndefined(); 72 | expect(fn0).toHaveBeenCalledTimes(1); 73 | expect(fn0).toHaveBeenLastCalledWith(6); 74 | expect(fn1).toHaveBeenCalledTimes(1); 75 | expect(fn1).toHaveBeenLastCalledWith(7); 76 | expect(fn2).not.toHaveBeenCalled(); 77 | }); 78 | 79 | it('should work with a mix of different types of transformers', () => { 80 | const instance = { 81 | transform: jest.fn((c) => c + 1), 82 | reset: jest.fn(), 83 | }; 84 | const fn0 = jest.fn((c: number) => c + 2); 85 | const fn1 = jest.fn((c: number) => c + 3); 86 | expect( 87 | new TransformerSet([ 88 | createStatefulTransformer(() => instance), 89 | createSimpleTransformer(fn0), 90 | createSimpleTransformer(fn1), 91 | ]).applyTo(5), 92 | ).toBe(11); 93 | expect(instance.transform).toHaveBeenCalledTimes(1); 94 | expect(instance.transform).toHaveBeenLastCalledWith(5); 95 | expect(fn0).toHaveBeenCalledTimes(1); 96 | expect(fn0).toHaveBeenLastCalledWith(6); 97 | expect(fn1).toHaveBeenCalledTimes(1); 98 | expect(fn1).toHaveBeenLastCalledWith(8); 99 | }); 100 | 101 | it('should apply transformers in order', () => { 102 | const calls: number[] = []; 103 | const fn0 = (c: number) => { 104 | calls.push(0); 105 | return c + 1; 106 | }; 107 | 108 | const fn1 = (c: number) => { 109 | calls.push(1); 110 | return c + 2; 111 | }; 112 | 113 | expect(new TransformerSet([createSimpleTransformer(fn0), createSimpleTransformer(fn1)]).applyTo(5)).toBe(8); 114 | expect(calls).toStrictEqual([0, 1]); 115 | }); 116 | }); 117 | 118 | describe('TransformerSet#resetAll()', () => { 119 | it('should call the reset() method of all stateful transformers once', () => { 120 | const instance0 = { 121 | transform: (c: number) => c + 1, 122 | reset: jest.fn(), 123 | }; 124 | const fn = (c: number) => c + 1; 125 | const instance1 = { 126 | transform: (c: number) => c + 2, 127 | reset: jest.fn(), 128 | }; 129 | const transformers = new TransformerSet([ 130 | 
createStatefulTransformer(() => instance0), 131 | createSimpleTransformer(fn), 132 | createStatefulTransformer(() => instance1), 133 | ]); 134 | transformers.resetAll(); 135 | expect(instance0.reset).toHaveBeenCalledTimes(1); 136 | expect(instance1.reset).toHaveBeenCalledTimes(1); 137 | }); 138 | }); 139 | -------------------------------------------------------------------------------- /test/transformer/Transformers.test.ts: -------------------------------------------------------------------------------- 1 | import type { StatefulTransformer } from '../../src/transformer/Transformers'; 2 | import { 3 | createSimpleTransformer, 4 | createStatefulTransformer, 5 | TransformerType, 6 | } from '../../src/transformer/Transformers'; 7 | 8 | describe('TransformerType', () => { 9 | describe('TransformerType.Simple', () => { 10 | it('should equal 0', () => { 11 | expect(TransformerType.Simple).toBe(0); 12 | }); 13 | }); 14 | 15 | describe('TransformerType.Stateful', () => { 16 | it('should equal 1', () => { 17 | expect(TransformerType.Stateful).toBe(1); 18 | }); 19 | }); 20 | }); 21 | 22 | describe('createSimpleTransformer', () => { 23 | it('should return a container holding the function given', () => { 24 | const transformer = (c: number) => c + 1; 25 | expect(createSimpleTransformer(transformer)).toStrictEqual({ 26 | type: TransformerType.Simple, 27 | transform: transformer, 28 | }); 29 | }); 30 | }); 31 | 32 | describe('createStatefulTransformer', () => { 33 | it('should return a container holding an instance produced by the factory given', () => { 34 | const statefulTransformer: StatefulTransformer = { 35 | transform: () => undefined, 36 | reset: () => { 37 | /* do nothing */ 38 | }, 39 | }; 40 | const factory = () => statefulTransformer; 41 | expect(createStatefulTransformer(factory)).toStrictEqual({ 42 | type: TransformerType.Stateful, 43 | factory, 44 | }); 45 | }); 46 | }); 47 | -------------------------------------------------------------------------------- 
/test/transformer/collapse-duplicates/index.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerType } from '../../../src/transformer/Transformers'; 2 | import type { CollapseDuplicatesTransformerOptions } from '../../../src/transformer/collapse-duplicates/index'; 3 | import { collapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/index'; 4 | import { CollapseDuplicatesTransformer as _CollapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/transformer'; 5 | import { CharacterCode } from '../../../src/util/Char'; 6 | 7 | jest.mock('../../../src/transformer/collapse-duplicates/transformer'); 8 | 9 | // eslint-disable-next-line @typescript-eslint/naming-convention 10 | const CollapseDuplicatesTransformer = _CollapseDuplicatesTransformer as jest.MockedClass< 11 | typeof _CollapseDuplicatesTransformer 12 | >; 13 | 14 | beforeEach(() => { 15 | CollapseDuplicatesTransformer.mockClear(); 16 | }); 17 | 18 | describe('collapseDuplicatesTransformer()', () => { 19 | describe('customThresholds processing', () => { 20 | it('should throw if any threshold was < 0', () => { 21 | expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['a', -1]]) })).toThrow(RangeError); 22 | }); 23 | 24 | it('should not throw for threshold=0', () => { 25 | expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['a', 0]]) })).not.toThrow(RangeError); 26 | }); 27 | 28 | it('should throw if the string corresponding to a threshold had length 0', () => { 29 | expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['', 1]]) })).toThrow(RangeError); 30 | }); 31 | 32 | it('should throw if the string corresponding to a threshold was comprised of more than 1 code point', () => { 33 | expect(() => collapseDuplicatesTransformer({ customThresholds: new Map([['ab', 1]]) })).toThrow(RangeError); 34 | }); 35 | 36 | it("should create a map of 
character code => threshold and pass that to CollapseDuplicateTransformer's constructor", () => { 37 | collapseDuplicatesTransformer({ 38 | customThresholds: new Map([ 39 | ['a', 2], 40 | ['z', 3], 41 | ]), 42 | }).factory(); 43 | expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1); 44 | expect(CollapseDuplicatesTransformer.mock.calls[0][0]).toMatchObject({ 45 | customThresholds: new Map([ 46 | [CharacterCode.LowerA, 2], 47 | [CharacterCode.LowerZ, 3], 48 | ]), 49 | }); 50 | }); 51 | }); 52 | 53 | it("should pass the options given to CollapseDuplicatesTransformer's constructor", () => { 54 | const options: CollapseDuplicatesTransformerOptions = { 55 | defaultThreshold: 5, 56 | customThresholds: new Map([ 57 | ['a', 2], 58 | ['z', 3], 59 | ]), 60 | }; 61 | collapseDuplicatesTransformer(options).factory(); 62 | expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1); 63 | expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({ 64 | defaultThreshold: 5, 65 | customThresholds: new Map([ 66 | [CharacterCode.LowerA, 2], 67 | [CharacterCode.LowerZ, 3], 68 | ]), 69 | }); 70 | }); 71 | 72 | it('should use 1 as the value for defaultThreshold if not provided', () => { 73 | const options: CollapseDuplicatesTransformerOptions = { 74 | customThresholds: new Map([ 75 | ['a', 2], 76 | ['z', 3], 77 | ]), 78 | }; 79 | collapseDuplicatesTransformer(options).factory(); 80 | expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1); 81 | expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({ 82 | defaultThreshold: 1, 83 | customThresholds: new Map([ 84 | [CharacterCode.LowerA, 2], 85 | [CharacterCode.LowerZ, 3], 86 | ]), 87 | }); 88 | }); 89 | 90 | it('should use an empty map as the value for customThresholds if not provided', () => { 91 | const options: CollapseDuplicatesTransformerOptions = { 92 | defaultThreshold: 1, 93 | }; 94 | collapseDuplicatesTransformer(options).factory(); 95 | 
expect(CollapseDuplicatesTransformer).toHaveBeenCalledTimes(1); 96 | expect(CollapseDuplicatesTransformer).toHaveBeenLastCalledWith({ ...options, customThresholds: new Map() }); 97 | }); 98 | 99 | it('should return a stateful transformer container', () => { 100 | const container = collapseDuplicatesTransformer(); 101 | expect(container.type).toBe(TransformerType.Stateful); 102 | expect(container.factory).toStrictEqual(expect.any(Function)); 103 | }); 104 | }); 105 | -------------------------------------------------------------------------------- /test/transformer/collapse-duplicates/transformer.test.ts: -------------------------------------------------------------------------------- 1 | import { CollapseDuplicatesTransformer } from '../../../src/transformer/collapse-duplicates/transformer'; 2 | 3 | describe('CollapseDuplicatesTransformer#transform()', () => { 4 | describe('threshold selection', () => { 5 | it('should use the default threshold if there is no corresponding custom threshold', () => { 6 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 1, customThresholds: new Map() }); 7 | expect(transformer.transform(1)).toBe(1); 8 | expect(transformer.transform(1)).toBeUndefined(); 9 | }); 10 | 11 | it('should use the custom threshold if one is provided', () => { 12 | const transformer = new CollapseDuplicatesTransformer({ 13 | defaultThreshold: 1, 14 | customThresholds: new Map([[1, 2]]), 15 | }); 16 | expect(transformer.transform(1)).toBe(1); 17 | expect(transformer.transform(1)).toBe(1); 18 | expect(transformer.transform(1)).toBeUndefined(); 19 | }); 20 | }); 21 | 22 | it('should return undefined for characters with a threshold <= 0', () => { 23 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 0, customThresholds: new Map() }); 24 | expect(transformer.transform(1)).toBeUndefined(); 25 | expect(transformer.transform(2)).toBeUndefined(); 26 | }); 27 | 28 | it('should be a noop until the threshold is hit', () => { 
29 | const transformer = new CollapseDuplicatesTransformer({ defaultThreshold: 5, customThresholds: new Map() }); 30 | expect(transformer.transform(1)).toBe(1); 31 | expect(transformer.transform(1)).toBe(1); 32 | expect(transformer.transform(1)).toBe(1); 33 | expect(transformer.transform(1)).toBe(1); 34 | expect(transformer.transform(1)).toBe(1); 35 | expect(transformer.transform(1)).toBeUndefined(); 36 | }); 37 | 38 | it('should reset the threshold once a different character is seen', () => { 39 | const transformer = new CollapseDuplicatesTransformer({ 40 | defaultThreshold: 1, 41 | customThresholds: new Map([ 42 | [1, 2], 43 | [2, 3], 44 | ]), 45 | }); 46 | expect(transformer.transform(1)).toBe(1); 47 | expect(transformer.transform(1)).toBe(1); 48 | expect(transformer.transform(1)).toBeUndefined(); 49 | expect(transformer.transform(2)).toBe(2); 50 | expect(transformer.transform(2)).toBe(2); 51 | expect(transformer.transform(2)).toBe(2); 52 | expect(transformer.transform(2)).toBeUndefined(); 53 | }); 54 | }); 55 | 56 | describe('CollapseDuplicatesTransformer#reset()', () => { 57 | it('should reset the threshold and current character', () => { 58 | const transformer = new CollapseDuplicatesTransformer({ 59 | defaultThreshold: 2, 60 | customThresholds: new Map(), 61 | }); 62 | expect(transformer.transform(1)).toBe(1); 63 | expect(transformer.transform(1)).toBe(1); 64 | expect(transformer.transform(1)).toBeUndefined(); 65 | transformer.reset(); 66 | expect(transformer.transform(1)).toBe(1); 67 | expect(transformer.transform(1)).toBe(1); 68 | expect(transformer.transform(1)).toBeUndefined(); 69 | }); 70 | }); 71 | -------------------------------------------------------------------------------- /test/transformer/remap-characters/index.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerType } from '../../../src/transformer/Transformers'; 2 | import { remapCharactersTransformer } from 
'../../../src/transformer/remap-characters'; 3 | import { CharacterCode } from '../../../src/util/Char'; 4 | 5 | describe('remapCharactersTransformer()', () => { 6 | it('should return a simple transformer container', () => { 7 | const container = remapCharactersTransformer({ a: 'b' }); 8 | expect(container.type).toBe(TransformerType.Simple); 9 | expect(typeof container.transform).toBe('function'); 10 | }); 11 | 12 | describe('options', () => { 13 | it('should throw if given an object where keys are comprised of more than one codepoint', () => { 14 | expect(() => remapCharactersTransformer({ ab: 'cd' })).toThrow(RangeError); 15 | }); 16 | 17 | it('should throw if given an object where keys are empty strings', () => { 18 | // eslint-disable-next-line @typescript-eslint/naming-convention 19 | expect(() => remapCharactersTransformer({ '': 'cd' })).toThrow(RangeError); 20 | }); 21 | 22 | it('should throw if given an map where keys are comprised of more than one codepoint', () => { 23 | expect(() => remapCharactersTransformer(new Map([['ab', 'cd']]))).toThrow(RangeError); 24 | }); 25 | 26 | it('should throw if given an map where keys are empty strings', () => { 27 | expect(() => remapCharactersTransformer(new Map([['', 'cd']]))).toThrow(RangeError); 28 | }); 29 | }); 30 | 31 | describe('character remapping', () => { 32 | it('should map any of the equivalent characters to the transformed character (object version)', () => { 33 | const transformer = remapCharactersTransformer({ a: 'bc' }); 34 | expect(transformer.transform('b'.charCodeAt(0))).toBe(CharacterCode.LowerA); 35 | expect(transformer.transform('c'.charCodeAt(0))).toBe(CharacterCode.LowerA); 36 | }); 37 | 38 | it('should map any of the equivalent characters to the transformed character (map version)', () => { 39 | const transformer = remapCharactersTransformer(new Map([['a', 'bc']])); 40 | expect(transformer.transform('b'.charCodeAt(0))).toBe(CharacterCode.LowerA); 41 | 
expect(transformer.transform('c'.charCodeAt(0))).toBe(CharacterCode.LowerA); 42 | }); 43 | 44 | it('should leave other characters unchanged', () => { 45 | const transformer = remapCharactersTransformer({ a: 'bc' }); 46 | expect(transformer.transform('e'.charCodeAt(0))).toBe('e'.charCodeAt(0)); 47 | expect(transformer.transform('z'.charCodeAt(0))).toBe('z'.charCodeAt(0)); 48 | }); 49 | }); 50 | }); 51 | -------------------------------------------------------------------------------- /test/transformer/resolve-confusables/index.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerType } from '../../../src/transformer/Transformers'; 2 | import { resolveConfusablesTransformer } from '../../../src/transformer/resolve-confusables'; 3 | import { CharacterCode } from '../../../src/util/Char'; 4 | 5 | describe('resolveConfusablesTransformer()', () => { 6 | it('should return a simple transformer container', () => { 7 | const container = resolveConfusablesTransformer(); 8 | expect(container.type).toBe(TransformerType.Simple); 9 | expect(typeof container.transform).toBe('function'); 10 | }); 11 | 12 | describe('character remapping', () => { 13 | it('should remap relevant characters to their normalized equivalent', () => { 14 | const transformer = resolveConfusablesTransformer(); 15 | expect(transformer.transform('⓵'.codePointAt(0)!)).toBe('1'.charCodeAt(0)); 16 | expect(transformer.transform('❌'.codePointAt(0)!)).toBe('X'.codePointAt(0)); 17 | }); 18 | 19 | it('should leave other characters unchanged', () => { 20 | const transformer = resolveConfusablesTransformer(); 21 | expect(transformer.transform(CharacterCode.LowerA)).toBe(CharacterCode.LowerA); 22 | }); 23 | }); 24 | }); 25 | -------------------------------------------------------------------------------- /test/transformer/resolve-leetspeak/index.test.ts: -------------------------------------------------------------------------------- 1 | import { 
TransformerType } from '../../../src/transformer/Transformers'; 2 | import { resolveLeetSpeakTransformer } from '../../../src/transformer/resolve-leetspeak'; 3 | import { CharacterCode } from '../../../src/util/Char'; 4 | 5 | describe('resolveLeetSpeakTransformer()', () => { 6 | it('should return a simple transformer container', () => { 7 | const container = resolveLeetSpeakTransformer(); 8 | expect(container.type).toBe(TransformerType.Simple); 9 | expect(typeof container.transform).toBe('function'); 10 | }); 11 | 12 | describe('character remapping', () => { 13 | it('should remap relevant characters to their normalized equivalent', () => { 14 | const transformer = resolveLeetSpeakTransformer(); 15 | expect(transformer.transform('@'.charCodeAt(0))).toBe(CharacterCode.LowerA); 16 | expect(transformer.transform('4'.charCodeAt(0))).toBe(CharacterCode.LowerA); 17 | expect(transformer.transform('('.charCodeAt(0))).toBe('c'.charCodeAt(0)); 18 | expect(transformer.transform('3'.charCodeAt(0))).toBe('e'.charCodeAt(0)); 19 | expect(transformer.transform('1'.charCodeAt(0))).toBe('i'.charCodeAt(0)); 20 | expect(transformer.transform('!'.charCodeAt(0))).toBe('i'.charCodeAt(0)); 21 | expect(transformer.transform('|'.charCodeAt(0))).toBe('i'.charCodeAt(0)); 22 | expect(transformer.transform('6'.charCodeAt(0))).toBe('g'.charCodeAt(0)); 23 | expect(transformer.transform('0'.charCodeAt(0))).toBe('o'.charCodeAt(0)); 24 | expect(transformer.transform('$'.charCodeAt(0))).toBe('s'.charCodeAt(0)); 25 | expect(transformer.transform('5'.charCodeAt(0))).toBe('s'.charCodeAt(0)); 26 | expect(transformer.transform('7'.charCodeAt(0))).toBe('t'.charCodeAt(0)); 27 | expect(transformer.transform('2'.charCodeAt(0))).toBe(CharacterCode.LowerZ); 28 | }); 29 | 30 | it('should leave other characters as is', () => { 31 | const transformer = resolveLeetSpeakTransformer(); 32 | expect(transformer.transform('f'.charCodeAt(0))).toBe('f'.charCodeAt(0)); 33 | 
expect(transformer.transform(CharacterCode.Backslash)).toBe(CharacterCode.Backslash); 34 | }); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /test/transformer/skip-non-alphabetic/index.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerType } from '../../../src/transformer/Transformers'; 2 | import { skipNonAlphabeticTransformer } from '../../../src/transformer/skip-non-alphabetic'; 3 | import { CharacterCode } from '../../../src/util/Char'; 4 | 5 | describe('skipNonAlphabeticTransformer()', () => { 6 | it('should return a simple transformer container', () => { 7 | const container = skipNonAlphabeticTransformer(); 8 | expect(container.type).toBe(TransformerType.Simple); 9 | expect(typeof container.transform).toBe('function'); 10 | }); 11 | 12 | describe('character skipping', () => { 13 | it('should leave lowercase alphabet characters as is', () => { 14 | const transformer = skipNonAlphabeticTransformer(); 15 | expect(transformer.transform('c'.charCodeAt(0))).toBe('c'.charCodeAt(0)); 16 | expect(transformer.transform(CharacterCode.LowerZ)).toBe(CharacterCode.LowerZ); 17 | }); 18 | 19 | it('should skip uppercase alphabet characters', () => { 20 | const transformer = skipNonAlphabeticTransformer(); 21 | expect(transformer.transform('D'.charCodeAt(0))).toBe('D'.charCodeAt(0)); 22 | expect(transformer.transform(CharacterCode.UpperA)).toBe(CharacterCode.UpperA); 23 | }); 24 | 25 | it('should return undefined (skip) for all other characters', () => { 26 | const transformer = skipNonAlphabeticTransformer(); 27 | expect(transformer.transform(CharacterCode.Backslash)).toBeUndefined(); 28 | expect(transformer.transform(32)).toBeUndefined(); 29 | expect(transformer.transform(CharacterCode.QuestionMark)).toBeUndefined(); 30 | expect(transformer.transform(CharacterCode.Zero)).toBeUndefined(); 31 | }); 32 | }); 33 | }); 34 | 
-------------------------------------------------------------------------------- /test/transformer/to-ascii-lowercase/index.test.ts: -------------------------------------------------------------------------------- 1 | import { TransformerType } from '../../../src/transformer/Transformers'; 2 | import { toAsciiLowerCaseTransformer } from '../../../src/transformer/to-ascii-lowercase'; 3 | import { CharacterCode } from '../../../src/util/Char'; 4 | 5 | describe('toAsciiLowerCaseTransformer()', () => { 6 | it('should return a simple transformer container', () => { 7 | const container = toAsciiLowerCaseTransformer(); 8 | expect(container.type).toBe(TransformerType.Simple); 9 | expect(typeof container.transform).toBe('function'); 10 | }); 11 | 12 | describe('case folding', () => { 13 | it('should change uppercase ascii characters to lowercase', () => { 14 | const container = toAsciiLowerCaseTransformer(); 15 | expect(container.transform(CharacterCode.UpperA)).toBe(CharacterCode.LowerA); 16 | }); 17 | 18 | it('should leave lowercase chars unchanged', () => { 19 | const container = toAsciiLowerCaseTransformer(); 20 | expect(container.transform(CharacterCode.LowerA)).toBe(CharacterCode.LowerA); 21 | }); 22 | 23 | it('should leave all other characters unchanged', () => { 24 | const container = toAsciiLowerCaseTransformer(); 25 | expect(container.transform(3)).toBe(3); 26 | expect(container.transform(CharacterCode.Backslash)).toBe(CharacterCode.Backslash); 27 | }); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.base.json", 3 | "include": ["."], 4 | "compilerOptions": { "noEmit": true }, 5 | "references": [{ "path": "../src" }] 6 | } 7 | -------------------------------------------------------------------------------- /test/util/CharacterIterator.fuzz.test.ts: 
-------------------------------------------------------------------------------- 1 | import * as fc from 'fast-check'; 2 | import { CharacterIterator } from '../../src/util/CharacterIterator'; 3 | 4 | test('the result of the character iterator over a string s should be equal to spreading s and mapping each value into its codepoint', () => { 5 | fc.assert( 6 | fc.property(fc.string16bits(), (str) => { 7 | const expected = [...str].map((s) => s.codePointAt(0)!); 8 | expect([...new CharacterIterator(str)]).toStrictEqual(expected); 9 | }), 10 | ); 11 | }); 12 | -------------------------------------------------------------------------------- /test/util/CharacterIterator.test.ts: -------------------------------------------------------------------------------- 1 | import { CharacterCode } from '../../src/util/Char'; 2 | import { CharacterIterator } from '../../src/util/CharacterIterator'; 3 | 4 | describe('constructor', () => { 5 | it('should default the input to an empty string if not provided', () => { 6 | expect(new CharacterIterator(undefined).input).toBe(''); 7 | }); 8 | 9 | it('should set the input to that provided if not undefined', () => { 10 | expect(new CharacterIterator('hello').input).toBe('hello'); 11 | }); 12 | 13 | it('should default the position to -1', () => { 14 | expect(new CharacterIterator('world').position).toBe(-1); 15 | }); 16 | }); 17 | 18 | describe('CharacterIterator#setInput()', () => { 19 | it('should reset the position', () => { 20 | const iter = new CharacterIterator('world'); 21 | iter.next(); 22 | expect(iter.setInput('hello').position).toBe(-1); 23 | }); 24 | 25 | it('should set the input', () => { 26 | const iter = new CharacterIterator('world'); 27 | iter.next(); 28 | expect(iter.setInput('hello').input).toBe('hello'); 29 | }); 30 | }); 31 | 32 | describe('CharacterIterator#reset()', () => { 33 | it('should reset the position', () => { 34 | const iter = new CharacterIterator('world'); 35 | iter.next(); 36 | iter.reset(); 37 | 
expect(iter.position).toBe(-1); 38 | expect(iter.next()).toStrictEqual({ done: false, value: 'w'.charCodeAt(0) }); 39 | }); 40 | 41 | it('should not reset the input', () => { 42 | const iter = new CharacterIterator('hello'); 43 | iter.next(); 44 | iter.reset(); 45 | expect(iter.input).toBe('hello'); 46 | }); 47 | }); 48 | 49 | describe('CharacterIterator#next()', () => { 50 | it('should return done: true when done', () => { 51 | const iter = new CharacterIterator(); 52 | expect(iter.next()).toStrictEqual({ done: true, value: undefined }); 53 | }); 54 | 55 | it('should return the next character code unmodified if it does not form a surrogate pair', () => { 56 | const iter = new CharacterIterator('h'); 57 | expect(iter.next()).toStrictEqual({ done: false, value: 'h'.charCodeAt(0) }); 58 | }); 59 | 60 | it('should return the next character despite it being a high surrogate if it is the last character', () => { 61 | const highSurrogate = '🌉'.charCodeAt(0); 62 | const iter = new CharacterIterator(String.fromCharCode(highSurrogate)); 63 | expect(iter.next()).toStrictEqual({ done: false, value: highSurrogate }); 64 | }); 65 | 66 | it('should return the next character despite it being a high surrogate if the character after it is not a low surrogate', () => { 67 | const highSurrogate = '🌉'.charCodeAt(0); 68 | const iter = new CharacterIterator(String.fromCharCode(highSurrogate, CharacterCode.LowerA)); 69 | expect(iter.next()).toStrictEqual({ done: false, value: highSurrogate }); 70 | expect(iter.next()).toStrictEqual({ done: false, value: CharacterCode.LowerA }); 71 | }); 72 | 73 | it('should combine valid surrogate pairs into its corresponding code point', () => { 74 | const iter = new CharacterIterator('🌉abc'); 75 | expect(iter.next()).toStrictEqual({ done: false, value: '🌉abc'.codePointAt(0) }); 76 | expect(iter.next()).toStrictEqual({ done: false, value: CharacterCode.LowerA }); 77 | }); 78 | }); 79 | 80 | describe('CharacterIterator#position', () => { 81 | it('should 
start as -1', () => { 82 | expect(new CharacterIterator().position).toBe(-1); 83 | }); 84 | 85 | it('should be the start position of the last character read (no surrogate pairs)', () => { 86 | const iter = new CharacterIterator('test'); 87 | iter.next(); 88 | expect(iter.position).toBe(0); 89 | }); 90 | 91 | it('should be the start position of the last character read (with surrogate pairs)', () => { 92 | const iter = new CharacterIterator('🌉abc'); 93 | iter.next(); 94 | expect(iter.position).toBe(0); 95 | iter.next(); 96 | expect(iter.position).toBe(2); // surrogate pair takes up 2 chars 97 | }); 98 | 99 | it('should revert to -1 after resetting', () => { 100 | const iter = new CharacterIterator('hello'); 101 | iter.next(); 102 | iter.reset(); 103 | expect(iter.position).toBe(-1); 104 | }); 105 | }); 106 | 107 | describe('CharacterIterator#lastWidth', () => { 108 | it('should start as 0', () => { 109 | expect(new CharacterIterator().lastWidth).toBe(0); 110 | }); 111 | 112 | it('should be 2 if the last character consumed was a surrogate pair', () => { 113 | const iter = new CharacterIterator('🌉abc'); 114 | iter.next(); 115 | expect(iter.lastWidth).toBe(2); 116 | }); 117 | 118 | it('should be 1 if the last character consumed was not a surrogate pair', () => { 119 | const iter = new CharacterIterator('hello'); 120 | iter.next(); 121 | expect(iter.lastWidth).toBe(1); 122 | }); 123 | 124 | it('should revert to 0 after resetting', () => { 125 | const iter = new CharacterIterator('asdf'); 126 | iter.next(); 127 | iter.reset(); 128 | expect(iter.lastWidth).toBe(0); 129 | }); 130 | }); 131 | 132 | describe('CharacterIterator#done', () => { 133 | it('should be true for empty strings', () => { 134 | expect(new CharacterIterator().done).toBeTruthy(); 135 | }); 136 | 137 | it('should be false if the input has not been completely consumed', () => { 138 | expect(new CharacterIterator('hh').done).toBeFalsy(); 139 | }); 140 | 141 | it('should be true if all input has been 
consumed', () => { 142 | const iter = new CharacterIterator('hello'); 143 | for (let i = 0; i < 5; i++) iter.next(); 144 | expect(iter.done).toBeTruthy(); 145 | }); 146 | 147 | it('should be false after resetting', () => { 148 | const iter = new CharacterIterator('hello'); 149 | for (let i = 0; i < 5; i++) iter.next(); 150 | expect(iter.done).toBeTruthy(); 151 | iter.reset(); 152 | expect(iter.done).toBeFalsy(); 153 | }); 154 | }); 155 | 156 | describe('iterating over it', () => { 157 | it('should be iterable', () => { 158 | const iter = new CharacterIterator('hello!'); 159 | const chars: number[] = []; 160 | for (const char of iter) { 161 | chars.push(char); 162 | } 163 | 164 | expect(String.fromCodePoint(...chars)).toBe('hello!'); 165 | }); 166 | }); 167 | -------------------------------------------------------------------------------- /test/util/Interval.test.ts: -------------------------------------------------------------------------------- 1 | import { compareIntervals } from '../../src/util/Interval'; 2 | 3 | describe('compareIntervals()', () => { 4 | it("should return -1 if the first interval's lower bound is less than the second's", () => { 5 | expect(compareIntervals(1, 5, 2, 3)).toBe(-1); 6 | }); 7 | 8 | it("should return 1 if the second interval's lower bound is less than the first's", () => { 9 | expect(compareIntervals(2, 3, 1, 5)).toBe(1); 10 | }); 11 | 12 | it("should return -1 if the first interval's upper bound is less than the second's", () => { 13 | expect(compareIntervals(2, 3, 2, 5)).toBe(-1); 14 | }); 15 | 16 | it("should return 1 if the second interval's upper bound is less than the first's", () => { 17 | expect(compareIntervals(2, 5, 2, 3)).toBe(1); 18 | }); 19 | 20 | it('should return 0 if the first and second intervals are equal', () => { 21 | expect(compareIntervals(1, 5, 1, 5)).toBe(0); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /tsconfig.base.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "strict": true, 4 | "noUnusedLocals": true, 5 | "noImplicitAny": true, 6 | "alwaysStrict": true, 7 | "pretty": true, 8 | "module": "CommonJS", 9 | "moduleResolution": "Node", 10 | "target": "ES2020", 11 | "lib": ["ES2020"] 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.base.json", 3 | "include": ["src", "test", "examples", "jest.config.ts"] 4 | } 5 | -------------------------------------------------------------------------------- /typedoc.json: -------------------------------------------------------------------------------- 1 | { 2 | "readme": "none", 3 | "entryPoints": ["src/index.ts"], 4 | "out": "docs/reference", 5 | "tsconfig": "src/tsconfig.json", 6 | "excludePrivate": true 7 | } 8 | --------------------------------------------------------------------------------