├── .gitignore ├── .prettierrc ├── LICENSE ├── README.md ├── bun.lockb ├── package.json ├── rolldown.config.js ├── shims.d.ts ├── src ├── cli.ts ├── index.ts ├── logger.ts ├── to-markdown.ts ├── types.ts └── utils.ts └── tsconfig.json /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | foo.txt 3 | *.log 4 | .DS_Store 5 | dist/ -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": false 3 | } 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 EGOIST 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sitefetch 2 | 3 | Fetch an entire site and save it as a text file (to be used with AI models). 4 | 5 | ![image](https://github.com/user-attachments/assets/e6877428-0e1c-444a-b7af-2fb21ded8814) 6 | 7 | ## Install 8 | 9 | One-off usage (choose one of the following): 10 | 11 | ```bash 12 | bunx sitefetch 13 | npx sitefetch 14 | pnpx sitefetch 15 | ``` 16 | 17 | Install globally (choose one of the following): 18 | 19 | ```bash 20 | bun i -g sitefetch 21 | npm i -g sitefetch 22 | pnpm i -g sitefetch 23 | ``` 24 | 25 | ## Usage 26 | 27 | ```bash 28 | sitefetch https://egoist.dev -o site.txt 29 | 30 | # or with higher concurrency 31 | sitefetch https://egoist.dev -o site.txt --concurrency 10 32 | ``` 33 | 34 | ### Match specific pages 35 | 36 | Use the `-m, --match` flag to specify the pages you want to fetch: 37 | 38 | ```bash 39 | sitefetch https://vite.dev -m "/blog/**" -m "/guide/**" 40 | ``` 41 | 42 | The match pattern is tested against the pathname of target pages using micromatch; see all the supported [matching features](https://github.com/micromatch/micromatch#matching-features).
43 | 44 | ### Content selector 45 | 46 | We use [mozilla/readability](https://github.com/mozilla/readability) to extract readable content from the web page, but on some pages it might return irrelevant content. In that case, you can specify a CSS selector so we know where to find the readable content: 47 | 48 | ```bash 49 | sitefetch https://vite.dev --content-selector ".content" 50 | ``` 51 | 52 | ## Plug 53 | 54 | If you like this, please check out my LLM chat app: https://chatwise.app 55 | 56 | ## API 57 | 58 | ```ts 59 | import { fetchSite } from "sitefetch" 60 | 61 | await fetchSite("https://egoist.dev", { 62 | //...options 63 | }) 64 | ``` 65 | 66 | Check out the available options in [types.ts](./src/types.ts). 67 | 68 | ## License 69 | 70 | MIT. 71 | -------------------------------------------------------------------------------- /bun.lockb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/egoist/sitefetch/9f570a1216403a30b202bbbcbcef03b7ef4787fd/bun.lockb -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sitefetch", 3 | "version": "0.0.17", 4 | "description": "Fetch an entire site and save it as a text file", 5 | "bin": "./dist/cli.js", 6 | "main": "./dist/index.js", 7 | "types": "./dist/index.d.ts", 8 | "files": [ 9 | "dist" 10 | ], 11 | "type": "module", 12 | "scripts": { 13 | "test": "echo \"Error: no test specified\" && exit 1", 14 | "build": "rm -rf dist && rolldown -c", 15 | "prepublishOnly": "bun run build" 16 | }, 17 | "keywords": [], 18 | "author": "EGOIST ", 19 | "license": "MIT", 20 | "dependencies": { 21 | "happy-dom": "^16.5.3", 22 | "cheerio": "^1.0.0", 23 | "gpt-tokenizer": "^2.8.1", 24 | "turndown": "^7.2.0", 25 | "turndown-plugin-gfm": "^1.0.2", 26 | "micromatch": "^4.0.8" 27 | }, 28 | "devDependencies": { 29 | "@mozilla/readability": 
"^0.5.0",
    "@types/bun": "^1.1.15",
    "@types/micromatch": "^4.0.9",
    "@types/turndown": "^5.0.5",
    "cac": "^6.7.14",
    "p-queue": "^8.0.1",
    "picocolors": "^1.1.1",
    "rolldown": "^1.0.0-beta.1",
    "typescript": "^5.7.3",
    "unplugin-isolated-decl": "^0.10.4"
  }
}

--------------------------------------------------------------------------------
/rolldown.config.js:
--------------------------------------------------------------------------------
// @ts-check
import fs from "node:fs"
import { defineConfig } from "rolldown"
import { isBuiltin } from "node:module"
import UnpluginIsolatedDecl from "unplugin-isolated-decl/rolldown"

const pkg = JSON.parse(fs.readFileSync("./package.json", "utf8"))

export default defineConfig({
  input: ["src/cli.ts", "src/index.ts"],
  output: {
    dir: "dist",
    format: "esm",
    // Only the CLI entry gets a shebang; every other chunk gets no banner.
    banner(chunk) {
      if (chunk.fileName === "cli.js") {
        return `#!/usr/bin/env node`
      }
      return ""
    },
  },
  platform: "node",
  // Externalize every package listed under "dependencies", both the bare
  // name and any subpath import ("name/...").
  external: Object.keys(pkg.dependencies).flatMap((name) => [
    name,
    new RegExp(`^${name}/`),
  ]),
  plugins: [
    // Declaration output is skipped when NO_DTS is set.
    process.env.NO_DTS
      ? undefined
      : UnpluginIsolatedDecl({ transformer: "typescript" }),
    {
      // make sure every node builtin module is prefixed with node:
      name: "add-node-prefix",
      // Rewrites `import x from "fs"` to `import x from "node:fs"` in output.
      renderChunk(code) {
        return code.replace(/import (.+) from "(.+)"/g, (m, m1, m2) => {
          if (isBuiltin(m2) && !m2.startsWith("node:")) {
            return `import ${m1} from "node:${m2}"`
          }
          return m
        })
      },
      // Resolves un-prefixed builtins to their external `node:` form.
      resolveId(id) {
        if (isBuiltin(id) && !id.startsWith("node:")) {
          return {
            id: `node:${id}`,
            external: true,
          }
        }
      },
    },
  ],
})

--------------------------------------------------------------------------------
/shims.d.ts:
--------------------------------------------------------------------------------
declare module "turndown-plugin-gfm"

--------------------------------------------------------------------------------
/src/cli.ts:
--------------------------------------------------------------------------------
import path from "node:path"
import fs from "node:fs"
import { cac } from "cac"
import { encode } from "gpt-tokenizer/model/gpt-4o"
import { fetchSite, serializePages } from "./index.ts"
import { logger } from "./logger.ts"
import { ensureArray, formatNumber } from "./utils.ts"
import { version } from "../package.json"

const cli = cac("sitefetch")

// cac only treats an option as value-taking when its name carries a
// `<value>` placeholder; without one the flag parses as a boolean and the
// string operations on `flags.outfile` below would throw.
cli
  .command("[url]", "Fetch a site")
  .option("-o, --outfile <path>", "Write the fetched site to a text file")
  .option("--concurrency <number>", "Number of concurrent requests", {
    default: 3,
  })
  .option("-m, --match <pattern>", "Only fetch matched pages")
  .option("--content-selector <selector>", "The CSS selector to find content")
  .option("--limit <limit>", "Limit the result to this amount of pages")
  .option("--silent", "Do not print any logs")
  .action(async (url, flags) => {
    if (!url) {
      cli.outputHelp()
      return
    }

    if (flags.silent) {
      logger.setLevel("silent")
    }

    const pages = await fetchSite(url, {
      concurrency: flags.concurrency,
      // `-m` may be passed once (string) or several times (array).
      match: flags.match && ensureArray(flags.match),
      contentSelector: flags.contentSelector,
      limit: flags.limit,
    })

    if (pages.size === 0) {
      logger.warn("No pages found")
      return
    }

    const pagesArr = [...pages.values()]

    const totalTokenCount = pagesArr.reduce(
      (acc, page) => acc + encode(page.content).length,
      0
    )

    logger.info(
      `Total token count for ${pages.size} pages: ${formatNumber(
        totalTokenCount
      )}`
    )

    if (flags.outfile) {
      // The output format follows the file extension: .json => JSON, else text.
      const output = serializePages(
        pages,
        flags.outfile.endsWith(".json") ? "json" : "text"
      )
      fs.mkdirSync(path.dirname(flags.outfile), { recursive: true })
      fs.writeFileSync(flags.outfile, output, "utf8")
    } else {
      console.log(serializePages(pages, "text"))
    }
  })

cli.version(version)
cli.help()
cli.parse()

--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
import Queue from "p-queue"
import { Window } from "happy-dom"
import { Readability } from "@mozilla/readability"
import c from "picocolors"
import { toMarkdown } from "./to-markdown.ts"
import { logger } from "./logger.ts"
import { load } from "cheerio"
import { matchPath } from "./utils.ts"
import type { Options, FetchSiteResult } from "./types.ts"

/**
 * Crawl a site starting from `url` and collect the readable content of the
 * pages reachable through same-host links.
 *
 * @param url Absolute URL of the entry page. It is always fetched, even when
 * it does not satisfy `options.match`.
 * @param options Crawl options (concurrency, match patterns, content
 * selector, page limit, custom fetch).
 * @returns A map from page pathname to the extracted page.
 */
export async function fetchSite(
  url: string,
  options: Options
): Promise<FetchSiteResult> {
  const fetcher = new Fetcher(options)

  return fetcher.fetchSite(url)
}

class Fetcher {
  /** Extracted pages, keyed by pathname. */
  #pages: FetchSiteResult = new Map()
  /** Pathnames that were already visited (or rejected by `match`). */
  #fetched: Set<string> = new Set()
  #queue: Queue

  constructor(public options: Options) {
    const concurrency = options.concurrency || 3
    this.#queue = new Queue({ concurrency })
  }

  /** Whether `options.limit` is set (non-zero) and has been reached. */
  #limitReached(): boolean {
    const { limit } = this.options
    return limit ? this.#pages.size >= limit : false
  }

  /** Resolve the content selector for a page, calling it when it is a function. */
  #getContentSelector(pathname: string) {
    if (typeof this.options.contentSelector === "function")
      return this.options.contentSelector({ pathname })

    return this.options.contentSelector
  }

  /** Fetch the entry page, wait for every queued page, and return the result. */
  async fetchSite(url: string) {
    logger.info(
      `Started fetching ${c.green(url)} with a concurrency of ${
        this.#queue.concurrency
      }`
    )

    await this.#fetchPage(url, {
      skipMatch: true,
    })

    await this.#queue.onIdle()

    return this.#pages
  }

  /**
   * Parse `html` in a throwaway happy-dom window and run Readability on it.
   * The window is always closed, even when parsing throws.
   */
  async #parseArticle(html: string, url: string) {
    const window = new Window({
      url,
      settings: {
        disableJavaScriptFileLoading: true,
        disableJavaScriptEvaluation: true,
        disableCSSFileLoading: true,
      },
    })

    try {
      window.document.write(html)

      await window.happyDOM.waitUntilComplete()

      return new Readability(window.document as any).parse()
    } finally {
      await window.happyDOM.close()
    }
  }

  /**
   * Fetch one page, queue its same-host links, and store its readable
   * content under its pathname.
   *
   * @param url Absolute URL of the page.
   * @param options `skipMatch` bypasses the `options.match` filter (used for
   * the entry page).
   */
  async #fetchPage(
    url: string,
    options: {
      skipMatch?: boolean
    }
  ): Promise<void> {
    const { host, pathname } = new URL(url)

    if (this.#fetched.has(pathname) || this.#limitReached()) {
      return
    }

    this.#fetched.add(pathname)

    // return if not matched
    // we don't need to extract content for this page
    if (
      !options.skipMatch &&
      this.options.match &&
      !matchPath(pathname, this.options.match)
    ) {
      return
    }

    logger.info(`Fetching ${c.green(url)}`)

    const res = await (this.options.fetch || fetch)(url, {
      headers: {
        "user-agent": "Sitefetch (https://github.com/egoist/sitefetch)",
      },
    })

    if (!res.ok) {
      logger.warn(`Failed to fetch ${url}: ${res.statusText}`)
      return
    }

    if (this.#limitReached()) {
      return
    }

    const contentType = res.headers.get("content-type")

    if (!contentType?.includes("text/html")) {
      logger.warn(`Not a HTML page: ${url}`)
      return
    }

    // A custom `fetch` may return a response with an empty `url`; fall back
    // to the requested URL instead of throwing on `new URL("")`.
    const resUrl = new URL(res.url || url)

    // redirected to other site, ignore
    if (resUrl.host !== host) {
      logger.warn(`Redirected from ${host} to ${resUrl.host}`)
      return
    }

    const $ = load(await res.text())
    $("script,style,link,img,video").remove()

    // Same-host links found on this page, keyed by pathname so each
    // pathname is queued at most once per page.
    const links = new Map<string, string>()

    $("a").each((_, el) => {
      const href = $(el).attr("href")

      if (!href) {
        return
      }

      try {
        const thisUrl = new URL(href, url)
        if (thisUrl.host !== host) {
          return
        }

        if (!this.#fetched.has(thisUrl.pathname) && !links.has(thisUrl.pathname)) {
          links.set(thisUrl.pathname, thisUrl.href)
        }
      } catch {
        logger.warn(`Failed to parse URL: ${href}`)
      }
    })

    for (const link of links.values()) {
      // The promise returned by `queue.add` rejects when the task throws.
      // Handle the failure inside the task so one broken page is reported as
      // a warning instead of becoming an unhandled rejection.
      void this.#queue.add(async () => {
        try {
          await this.#fetchPage(link, { ...options, skipMatch: false })
        } catch (error) {
          logger.warn(`Failed to fetch ${link}:`, error)
        }
      })
    }

    const pageTitle = $("title").text()
    const contentSelector = this.#getContentSelector(pathname)
    const html = contentSelector
      ? $(contentSelector).prop("outerHTML")
      : $.html()

    if (!html) {
      logger.warn(`No readable content on ${pathname}`)
      return
    }

    const article = await this.#parseArticle(html, url)

    if (!article) {
      return
    }

    // Other pages may have been stored while this one was in flight;
    // re-check so the result never exceeds `limit`.
    if (this.#limitReached()) {
      return
    }

    const content = toMarkdown(article.content)

    this.#pages.set(pathname, {
      title: article.title || pageTitle,
      url,
      content,
    })
  }
}

/**
 * Serialize fetched pages.
 *
 * @param pages Result of `fetchSite`.
 * @param format `"json"` produces a JSON array of pages; `"text"` produces
 * one title / url / content section per page, separated by a blank line.
 */
export function serializePages(
  pages: FetchSiteResult,
  format: "json" | "text"
): string {
  if (format === "json") {
    return JSON.stringify([...pages.values()])
  }

  return [...pages.values()]
    .map((page) =>
      `
${page.title}
${page.url}
${page.content}
`.trim()
    )
    .join("\n\n")
}

--------------------------------------------------------------------------------
/src/logger.ts:
--------------------------------------------------------------------------------
import c from "picocolors"

type LoggerLevel = "silent" | "warn"

/**
 * Minimal console logger.
 *
 * Levels: unset prints everything, `"warn"` prints warnings only,
 * `"silent"` prints nothing.
 */
class Logger {
  private level?: LoggerLevel

  setLevel(level: LoggerLevel): void {
    this.level = level
  }

  /** Print an informational message; suppressed at "warn" and "silent". */
  info(...args: any[]): void {
    if (this.level === "silent" || this.level === "warn") return
    console.log(c.cyan("INFO"), ...args)
  }

  /** Print a warning; suppressed only at "silent". */
  warn(...args: any[]): void {
    if (this.level === "silent") return
    console.warn(c.yellow("WARN"), ...args)
  }
}

export const logger: Logger = new Logger()

--------------------------------------------------------------------------------
/src/to-markdown.ts:
--------------------------------------------------------------------------------
import Turndown from "turndown"
import { gfm } from "turndown-plugin-gfm"

// One shared converter with the GFM plugin enabled.
const turndown = new Turndown()
turndown.use(gfm)

/** Convert an HTML string to Markdown. */
export function toMarkdown(html: string): string {
  return turndown.turndown(html)
}

--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
export type Options = {
  /** How many requests can be made at the same time */
  concurrency?: number

  /**
   * Match pathname by specific patterns, powered by micromatch
   * Only pages matched by this will be fetched
   */
  match?: string[]

  /**
   * The CSS selector to find content
   */
  contentSelector?:
    | string
    | ((ctx: { pathname: string }) => string | void | undefined)

  /**
   * Limit the result to this amount of pages
   */
  limit?: number

  /**
   * A custom function to fetch URL
   */
  fetch?: (url: string, init: RequestInit) => Promise<Response>
}

export type Page = {
  title: string
  url: string
  content: string
}

/** Fetched pages, keyed by pathname. */
export type FetchSiteResult = Map<string, Page>

--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
import micromatch from "micromatch"

/**
 * Format a number compactly: `1234` => "1.2K", `2500000` => "2.5M".
 * Numbers below 1000 are returned unchanged.
 */
export function formatNumber(num: number): string {
  // `>=` so that exactly 1000 / 1000000 use the K / M form as well
  // ("1.0K" / "1.0M" rather than "1000" / "1000.0K").
  if (num >= 1000000) {
    return `${(num / 1000000).toFixed(1)}M`
  }
  if (num >= 1000) {
    return `${(num / 1000).toFixed(1)}K`
  }
  return num.toString()
}

/** Test a path against one or more micromatch patterns. */
export function matchPath(path: string, pattern: string | string[]): boolean {
  return micromatch.isMatch(path, pattern)
}

/** Wrap a single value in an array; arrays are returned as-is. */
export function ensureArray<T>(input: T | T[]): T[] {
  return Array.isArray(input) ? input : [input]
}

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    // Enable latest features
    "lib": ["ESNext"],
    "target": "ESNext",
    "module": "ESNext",
    "moduleDetection": "force",
    "jsx": "react-jsx",
    "isolatedDeclarations": true,
    "declaration": true,

    // Bundler mode
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "verbatimModuleSyntax": true,
    "noEmit": true,

    // Best practices
    "strict": true,
    "skipLibCheck": true,
    "noFallthroughCasesInSwitch": true,

    // Some stricter flags
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noPropertyAccessFromIndexSignature": true
  }
}
--------------------------------------------------------------------------------