├── .devcontainer └── devcontainer.json ├── .github ├── CODEOWNERS └── workflows │ └── ci-validation.yml ├── .gitignore ├── LICENSE ├── README.md ├── validate.js └── well-known-bots.json /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/javascript-node 3 | { 4 | "name": "arcjet-well-known-bots", 5 | "image": "mcr.microsoft.com/devcontainers/javascript-node:1-20-bullseye", 6 | 7 | // Features to add to the dev container. More info: https://containers.dev/features. 8 | "features": { 9 | "ghcr.io/devcontainers/features/common-utils:2.3.2": {} 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @arcjet/engineering-team 2 | -------------------------------------------------------------------------------- /.github/workflows/ci-validation.yml: -------------------------------------------------------------------------------- 1 | name: CI validation 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - uses: actions/setup-node@v4 11 | with: 12 | node-version: 20 13 | - run: node validate.js --check 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | .pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for 
instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | .vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox 
cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* 131 | 132 | # One-off scripts 133 | *.sh 134 | 135 | # macOS 136 | .DS_Store 137 | 138 | # turborepo 139 | .turbo/ 140 | 141 | # trunk 142 | .trunk/ 143 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Martin Monperrus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Well Known Bots 2 | 3 | This repository contains a list of Well Known Bots, including robots, crawlers, 4 | validators, monitors, and spiders, in a single JSON file. Each bot is identified 5 | and provided a RegExp `pattern` to match against an HTTP `User-Agent` header. 6 | Additional metadata is available on each item. 7 | 8 | ## Install 9 | 10 | ### Direct download 11 | 12 | Download the [`well-known-bots.json` file][raw-json-url] directly. 13 | 14 | ## Realities 15 | 16 | It's impossible to create a system that can detect all bots. Well-behaving bots 17 | identify themselves in a consistent manner, usually via the User-Agent patterns 18 | this project provides. It is straightforward to identify these well-behaving 19 | bots, but misbehaving bots pretend to be real clients and use various mechanisms 20 | to evade detection. 21 | 22 | For more details, see [Non-Technical Notes in the 23 | browser-fingerprinting][non-tech-notes-url] project. 24 | 25 | ## Structure 26 | 27 | Each entry in the JSON represents a specific bot or crawler and includes the following fields: 28 | 29 | - id: A unique identifier for the bot 30 | - categories: An array of categories the bot belongs to (e.g., "search-engine", "advertising"); at least one is required 31 | - pattern: An object with `accepted` and `forbidden` arrays of regular expression patterns; a user agent string identifies the bot when it matches every `accepted` pattern and none of the `forbidden` patterns 32 | - url: (optional) A URL with more information about the bot 33 | - verification: A list of supported methods for verifying the bot's identity (if the bot is not verifiable it should be empty). 
34 | - instances: (optional) An object with `accepted` and `rejected` arrays of example user agent strings for the bot 35 | - aliases: (optional) Extra unique identifiers for the bot that can be used to identify it across other data sources 36 | 37 | ### Verification 38 | 39 | Each verification entry contains the following fields: 40 | 41 | - type: The method of verification (`dns`, `cidr`, and `ip` are supported) 42 | 43 | If you specify `dns` verification then these fields are expected: 44 | 45 | - masks: An array of mask patterns used for verification 46 | 47 | If you specify `cidr` or `ip` verification then these fields are expected: 48 | 49 | - sources: An array of sources to pull IP range data from (at least one is required) 50 | 51 | ### Verification mask patterns 52 | 53 | The mask patterns use the following special characters: 54 | 55 | - `*`: Represents 0 or 1 of any character 56 | - `@`: Acts as a wildcard, matching any number of characters 57 | 58 | All other characters in the mask require an exact match. 59 | 60 | ### Cidr verification sources 61 | 62 | Each `cidr` or `ip` source requires the following fields: 63 | 64 | - type: The type of source (currently only `http-json` is supported) 65 | - url: The URL that hosts the IP ranges 66 | - selector: A JsonPath selector that selects all of the IP ranges in the source 67 | 68 | ## License 69 | 70 | The project is a hard-fork of [crawler-user-agents][forked-repo-url] at commit 71 | `46831767324e10c69c9ac6e538c9847853a0feb9`, which is distributed under the [MIT 72 | License][mit-license]. 
73 | 74 | [raw-json-url]: https://raw.githubusercontent.com/arcjet/well-known-bots/main/well-known-bots.json 75 | [forked-repo-url]: https://github.com/monperrus/crawler-user-agents/commit/46831767324e10c69c9ac6e538c9847853a0feb9 76 | [non-tech-notes-url]: https://github.com/niespodd/browser-fingerprinting/blob/baecc60821cefd06eb89a54d18be39d87dd16f2e/README.md#non-technical-notes 77 | [mit-license]: https://opensource.org/licenses/MIT 78 | -------------------------------------------------------------------------------- /validate.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is used for checking and updating the format of the JSON file. 3 | * 4 | * You can check the format via `node format.js --check` and regenerate the 5 | * file with the correct formatting using `node format.js --generate`. 6 | * 7 | * The formatting logic uses `JSON.stringify` with 2 spaces, which will keep 8 | * separating commas on the same line as any closing character. This technique 9 | * was chosen for simplicity and to align with common default JSON formatters, 10 | * such as VSCode. 11 | */ 12 | 13 | const fs = require("fs"); 14 | const path = require("path"); 15 | 16 | const jsonFilePath = path.join(__dirname, "well-known-bots.json"); 17 | 18 | const original = fs.readFileSync(jsonFilePath, "utf-8"); 19 | 20 | const updated = JSON.stringify(JSON.parse(original), null, 2) + '\n'; 21 | 22 | if (process.argv[2] === "--generate") { 23 | fs.writeFileSync(jsonFilePath, updated); 24 | process.exit(0); 25 | } else if (process.argv[2] === "--check") { 26 | if (updated !== original) { 27 | console.error("JSON file format is wrong. 
Run `node format.js --generate` to update."); 28 | console.error("Format must be 2 spaces, with newlines for objects and arrays, and separating commas on the line with the previous closing character."); 29 | process.exit(1); 30 | } 31 | 32 | for (const item of JSON.parse(original)) { 33 | if (typeof item.id !== "string") { 34 | console.error("Item is missing required `id` string field:", item); 35 | process.exit(1); 36 | } 37 | if (typeof item.pattern !== "object" || item.pattern === null || Array.isArray(item.pattern)) { 38 | console.error("Item is missing required pattern object with accepted and forbidden arrays:", item); 39 | process.exit(1); 40 | } 41 | if (!Array.isArray(item.pattern.accepted)) { 42 | console.error("Item pattern.accepted is missing or is not an array:", item); 43 | process.exit(1); 44 | } 45 | for (const pat of item.pattern.accepted) { 46 | if (typeof pat !== "string") { 47 | console.error("Pattern (accepted) entry was not a string:", item, pat); 48 | process.exit(1); 49 | } 50 | } 51 | if (!Array.isArray(item.pattern.forbidden)) { 52 | console.error("Item pattern.forbidden is missing or is not an array:", item); 53 | process.exit(1); 54 | } 55 | for (const pat of item.pattern.forbidden) { 56 | if (typeof pat !== "string") { 57 | console.error("Pattern (forbidden) entry was not a string:", item, pat); 58 | process.exit(1); 59 | } 60 | } 61 | if (!Array.isArray(item.categories)) { 62 | console.error("Item is missing required `categories` array field:", item); 63 | process.exit(1); 64 | } 65 | if (item.categories.length < 1) { 66 | console.error("The `categories` field must contain at least one:", item); 67 | process.exit(1); 68 | } 69 | // TODO: Validate urls are still accessible 70 | if (typeof item.url !== "undefined" && typeof item.url !== "string") { 71 | console.error("Item has wrong type specified for `url` string field:", item); 72 | process.exit(1); 73 | } 74 | if (!Array.isArray(item.verification)) { 75 | console.error("Item is 
missing required `verification` array field:", item); 76 | process.exit(1); 77 | } 78 | for (const verify of item.verification) { 79 | if (verify.type === "cidr") { 80 | if (!Array.isArray(verify.sources)) { 81 | console.error("Item cidr validation entry is missing required `sources` array field:", item, verify); 82 | process.exit(1); 83 | } 84 | for (const source of verify.sources) { 85 | if (source.type !== "http-json") { 86 | console.error("Cidr source `type` must be a valid type (currently only `http-json` is supported)", item, verify, source); 87 | process.exit(1); 88 | } 89 | 90 | if (typeof source.url !== "string") { 91 | console.error("Cidr source `url` must be a string", item, verify, source); 92 | process.exit(1); 93 | } 94 | 95 | if (typeof source.selector !== "string") { 96 | console.error("Cidr source `selector` must be a string", item, verify, source); 97 | process.exit(1); 98 | } 99 | } 100 | } else if (verify.type === "dns") { 101 | if (!Array.isArray(verify.masks)) { 102 | console.error("Item dns validation entry is missing required `masks` array field:", item, verify); 103 | process.exit(1); 104 | } 105 | for (const mask of verify.masks) { 106 | if (typeof mask !== "string") { 107 | console.error("Mask was not a string:", item, verify, mask); 108 | process.exit(1); 109 | } 110 | } 111 | } else if (verify.type === "ip") { 112 | if (!Array.isArray(verify.sources)) { 113 | console.error("Item IP validation entry is missing required `sources` array field:", item, verify); 114 | process.exit(1); 115 | } 116 | for (const source of verify.sources) { 117 | if (source.type !== "http-json") { 118 | console.error("IP source `type` must be a valid type (currently only `http-json` is supported)", item, verify, source); 119 | process.exit(1); 120 | } 121 | 122 | if (typeof source.url !== "string") { 123 | console.error("IP source `url` must be a string", item, verify, source); 124 | process.exit(1); 125 | } 126 | 127 | if (typeof source.selector !== "string") { 
128 | console.error("IP source `selector` must be a string", item, verify, source); 129 | process.exit(1); 130 | } 131 | } 132 | } else { 133 | console.error("Item validation entry is incorrect, only `ip`, `dns`, and `cidr` are supported:", item, verify); 134 | process.exit(1); 135 | } 136 | } 137 | if (typeof item.aliases !== "undefined") { 138 | if (!Array.isArray(item.aliases)) { 139 | console.error("Item has wrong type specified for `aliases` array field:", item); 140 | process.exit(1); 141 | } 142 | for (const alias of item.aliases) { 143 | if (typeof alias !== "string") { 144 | console.error("Alias was not a string:", item, alias); 145 | process.exit(1); 146 | } 147 | } 148 | } 149 | // TODO: Check `addition_date` is defined properly 150 | // TODO: Check or remove `depends_on` field 151 | if (typeof item.instances !== "undefined") { 152 | if (typeof item.instances !== "object" || item.instances === null || Array.isArray(item.instances)) { 153 | console.error( 154 | "Item has wrong type specified for instances, it must be an object with accepted and rejected arrays:", 155 | item 156 | ); 157 | process.exit(1); 158 | } 159 | if (!Array.isArray(item.instances.accepted)) { 160 | console.error("Item instances.accepted is missing or is not an array:", item); 161 | process.exit(1); 162 | } 163 | if (!Array.isArray(item.instances.rejected)) { 164 | console.error("Item instances.rejected is missing or is not an array:", item); 165 | process.exit(1); 166 | } 167 | for (const instance of item.instances.accepted) { 168 | if (typeof instance !== "string") { 169 | console.error("Instance was not a string:", item, instance); 170 | process.exit(1); 171 | } 172 | for (const pat of item.pattern.accepted) { 173 | let re; 174 | try { 175 | re = new RegExp(pat); 176 | } catch (e) { 177 | console.error("Invalid regex pattern in pattern.accepted:", pat, item); 178 | process.exit(1); 179 | } 180 | if (!re.test(instance)) { 181 | console.error("Instance in instances.accepted does not 
match the required accepted pattern:"); 182 | console.error(" pattern.accepted: ", pat); 183 | console.error(" instance: ", instance); 184 | process.exit(1); 185 | } 186 | } 187 | for (const pat of item.pattern.forbidden) { 188 | let re; 189 | try { 190 | re = new RegExp(pat); 191 | } catch (e) { 192 | console.error("Invalid regex pattern in pattern.forbidden:", pat, item); 193 | process.exit(1); 194 | } 195 | if (re.test(instance)) { 196 | console.error("Instance in instances.accepted should not match the forbidden pattern:"); 197 | console.error(" pattern.forbidden: ", pat); 198 | console.error(" instance: ", instance); 199 | process.exit(1); 200 | } 201 | } 202 | } 203 | // We are testing that the instances would be accepted if not for the `forbidden` array. 204 | // This ensures that the forbidden regex works correctly and its not just failing 205 | // because it doesn't match the patterns in the `accepted` array. 206 | for (const instance of item.instances.rejected) { 207 | if (typeof instance !== "string") { 208 | console.error("Rejected instance was not a string:", item, instance); 209 | process.exit(1); 210 | } 211 | let matchesAllAccepted = true; 212 | for (const pat of item.pattern.accepted) { 213 | let re; 214 | try { 215 | re = new RegExp(pat); 216 | } catch (e) { 217 | console.error("Invalid regex pattern in pattern.accepted:", pat, item); 218 | process.exit(1); 219 | } 220 | if (!re.test(instance)) { 221 | matchesAllAccepted = false; 222 | break; 223 | } 224 | } 225 | let matchesAnyForbidden = false; 226 | for (const pat of item.pattern.forbidden) { 227 | let re; 228 | try { 229 | re = new RegExp(pat); 230 | } catch (e) { 231 | console.error("Invalid regex pattern in pattern.forbidden:", pat, item); 232 | process.exit(1); 233 | } 234 | if (re.test(instance)) { 235 | matchesAnyForbidden = true; 236 | break; 237 | } 238 | } 239 | // If the instance matches all accepted regexes and none of the forbidden, 240 | // then it qualifies as a match. 
This is not allowed for a rejected instance. 241 | if (matchesAllAccepted && !matchesAnyForbidden) { 242 | console.error("Rejected instance in instances.rejected unexpectedly matches all accepted patterns and none of the forbidden patterns:"); 243 | console.error(" pattern.accepted: ", item.pattern.accepted); 244 | console.error(" pattern.forbidden: ", item.pattern.forbidden); 245 | console.error(" instance: ", instance); 246 | process.exit(1); 247 | } 248 | } 249 | } 250 | } 251 | } else { 252 | console.error("Valid subcommands are `--generate` or `--check`") 253 | process.exit(1); 254 | } 255 | --------------------------------------------------------------------------------