├── .gitignore
├── .prettierrc
├── LICENSE
├── README.md
├── bun.lockb
├── package.json
├── rolldown.config.js
├── shims.d.ts
├── src
    ├── cli.ts
    ├── index.ts
    ├── logger.ts
    ├── to-markdown.ts
    ├── types.ts
    └── utils.ts
└── tsconfig.json


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | foo.txt
3 | *.log
4 | .DS_Store
5 | dist/


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "semi": false
3 | }
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 EGOIST <hi@egoist.dev>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # sitefetch
 2 | 
 3 | Fetch an entire site and save it as a text file (to be used with AI models).
 4 | 
 5 | ![image](https://github.com/user-attachments/assets/e6877428-0e1c-444a-b7af-2fb21ded8814)
 6 | 
 7 | ## Install
 8 | 
 9 | One-off usage (choose one of the following):
10 | 
11 | ```bash
12 | bunx sitefetch
13 | npx sitefetch
14 | pnpx sitefetch
15 | ```
16 | 
17 | Install globally (choose one of the following):
18 | 
19 | ```bash
20 | bun i -g sitefetch
21 | npm i -g sitefetch
22 | pnpm i -g sitefetch
23 | ```
24 | 
25 | ## Usage
26 | 
27 | ```bash
28 | sitefetch https://egoist.dev -o site.txt
29 | 
30 | # or with a higher concurrency
31 | sitefetch https://egoist.dev -o site.txt --concurrency 10
32 | ```
33 | 
34 | ### Match specific pages
35 | 
36 | Use the `-m, --match` flag to specify the pages you want to fetch:
37 | 
38 | ```bash
39 | sitefetch https://vite.dev -m "/blog/**" -m "/guide/**"
40 | ```
41 | 
42 | The match pattern is tested against the pathname of target pages and is powered by micromatch. You can check out all the supported [matching features](https://github.com/micromatch/micromatch#matching-features).
43 | 
44 | ### Content selector
45 | 
46 | We use [mozilla/readability](https://github.com/mozilla/readability) to extract readable content from the web page, but on some pages it might return irrelevant content. In that case, you can specify a CSS selector so we know where to find the readable content:
47 | 
48 | ```bash
49 | sitefetch https://vite.dev --content-selector ".content"
50 | ```
51 | 
52 | ## Plug
53 | 
54 | If you like this, please check out my LLM chat app: https://chatwise.app
55 | 
56 | ## API
57 | 
58 | ```ts
59 | import { fetchSite } from "sitefetch"
60 | 
61 | await fetchSite("https://egoist.dev", {
62 |   //...options
63 | })
64 | ```
65 | 
66 | Check out options in [types.ts](./src/types.ts).
67 | 
68 | ## License
69 | 
70 | MIT.
71 | 


--------------------------------------------------------------------------------
/bun.lockb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/egoist/sitefetch/9f570a1216403a30b202bbbcbcef03b7ef4787fd/bun.lockb


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "sitefetch",
 3 |   "version": "0.0.17",
 4 |   "description": "Fetch an entire site and save it as a text file",
 5 |   "bin": "./dist/cli.js",
 6 |   "main": "./dist/index.js",
 7 |   "types": "./dist/index.d.ts",
 8 |   "files": [
 9 |     "dist"
10 |   ],
11 |   "type": "module",
12 |   "scripts": {
13 |     "test": "echo \"Error: no test specified\" && exit 1",
14 |     "build": "rm -rf dist && rolldown -c",
15 |     "prepublishOnly": "bun run build"
16 |   },
17 |   "keywords": [],
18 |   "author": "EGOIST <hi@egoist.dev>",
19 |   "license": "MIT",
20 |   "dependencies": {
21 |     "happy-dom": "^16.5.3",
22 |     "cheerio": "^1.0.0",
23 |     "gpt-tokenizer": "^2.8.1",
24 |     "turndown": "^7.2.0",
25 |     "turndown-plugin-gfm": "^1.0.2",
26 |     "micromatch": "^4.0.8"
27 |   },
28 |   "devDependencies": {
29 |     "@mozilla/readability": "^0.5.0",
30 |     "@types/bun": "^1.1.15",
31 |     "@types/micromatch": "^4.0.9",
32 |     "@types/turndown": "^5.0.5",
33 |     "cac": "^6.7.14",
34 |     "p-queue": "^8.0.1",
35 |     "picocolors": "^1.1.1",
36 |     "rolldown": "^1.0.0-beta.1",
37 |     "typescript": "^5.7.3",
38 |     "unplugin-isolated-decl": "^0.10.4"
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/rolldown.config.js:
--------------------------------------------------------------------------------
 1 | // @ts-check
 2 | import fs from "node:fs"
 3 | import { defineConfig } from "rolldown"
 4 | import { isBuiltin } from "node:module"
 5 | import UnpluginIsolatedDecl from "unplugin-isolated-decl/rolldown"
 6 | 
 7 | const pkg = JSON.parse(fs.readFileSync("./package.json", "utf8"))
 8 | 
 9 | export default defineConfig({
10 |   input: ["src/cli.ts", "src/index.ts"],
11 |   output: {
12 |     dir: "dist",
13 |     format: "esm",
14 |     banner(chunk) {
15 |       if (chunk.fileName === "cli.js") {
16 |         return `#!/usr/bin/env node`
17 |       }
18 |       return ""
19 |     },
20 |   },
21 |   platform: "node",
22 |   external: Object.keys(pkg.dependencies)
23 |     .map((name) => [name, new RegExp(`^${name}/`)])
24 |     .flat(),
25 |   plugins: [
26 |     process.env.NO_DTS
27 |       ? undefined
28 |       : UnpluginIsolatedDecl({ transformer: "typescript" }),
29 |     {
30 |       // make sure every node builtin module is prefixed with node:
31 |       name: "add-node-prefix",
32 |       renderChunk(code) {
33 |         return code.replace(/import (.+) from "(.+)"/g, (m, m1, m2) => {
34 |           if (isBuiltin(m2) && !m2.startsWith("node:")) {
35 |             return `import ${m1} from "node:${m2}"`
36 |           }
37 |           return m
38 |         })
39 |       },
40 |       resolveId(id) {
41 |         if (isBuiltin(id) && !id.startsWith("node:")) {
42 |           return {
43 |             id: `node:${id}`,
44 |             external: true,
45 |           }
46 |         }
47 |       },
48 |     },
49 |   ],
50 | })
51 | 


--------------------------------------------------------------------------------
/shims.d.ts:
--------------------------------------------------------------------------------
1 | declare module "turndown-plugin-gfm"
2 | 


--------------------------------------------------------------------------------
/src/cli.ts:
--------------------------------------------------------------------------------
 1 | import path from "node:path"
 2 | import fs from "node:fs"
 3 | import { cac } from "cac"
 4 | import { encode } from "gpt-tokenizer/model/gpt-4o"
 5 | import { fetchSite, serializePages } from "./index.ts"
 6 | import { logger } from "./logger.ts"
 7 | import { ensureArray, formatNumber } from "./utils.ts"
 8 | import { version } from "../package.json"
 9 | 
const cli = cac("sitefetch")

// Default command: crawl the given URL, report the total token count, then
// write the result to a file or print it to stdout.
cli
  .command("[url]", "Fetch a site")
  .option("-o, --outfile <path>", "Write the fetched site to a text file")
  .option("--concurrency <number>", "Number of concurrent requests", {
    default: 3,
  })
  .option("-m, --match <pattern>", "Only fetch matched pages")
  .option("--content-selector <selector>", "The CSS selector to find content")
  .option("--limit <limit>", "Limit the result to this amount of pages")
  .option("--silent", "Do not print any logs")
  .action(async (url, flags) => {
    // Without a URL there is nothing to crawl: show the usage text instead.
    if (!url) {
      cli.outputHelp()
      return
    }

    if (flags.silent) {
      logger.setLevel("silent")
    }

    const pages = await fetchSite(url, {
      concurrency: flags.concurrency,
      // `-m` may be given once or several times; always hand over an array.
      match: flags.match && ensureArray(flags.match),
      contentSelector: flags.contentSelector,
      limit: flags.limit,
    })

    if (pages.size === 0) {
      logger.warn("No pages found")
      return
    }

    // Sum the token count of every page's content.
    let totalTokenCount = 0
    for (const page of pages.values()) {
      totalTokenCount += encode(page.content).length
    }

    logger.info(
      `Total token count for ${pages.size} pages: ${formatNumber(
        totalTokenCount
      )}`
    )

    const outfile = flags.outfile
    if (!outfile) {
      console.log(serializePages(pages, "text"))
      return
    }

    // A `.json` extension selects JSON output; anything else is written as text.
    const format = outfile.endsWith(".json") ? "json" : "text"
    const output = serializePages(pages, format)
    fs.mkdirSync(path.dirname(outfile), { recursive: true })
    fs.writeFileSync(outfile, output, "utf8")
  })

cli.version(version)
cli.help()
cli.parse()
72 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
  1 | import Queue from "p-queue"
  2 | import { Window } from "happy-dom"
  3 | import { Readability } from "@mozilla/readability"
  4 | import c from "picocolors"
  5 | import { toMarkdown } from "./to-markdown.ts"
  6 | import { logger } from "./logger.ts"
  7 | import { load } from "cheerio"
  8 | import { matchPath } from "./utils.ts"
  9 | import type { Options, FetchSiteResult } from "./types.ts"
 10 | 
 11 | export async function fetchSite(
 12 |   url: string,
 13 |   options: Options
 14 | ): Promise<FetchSiteResult> {
 15 |   const fetcher = new Fetcher(options)
 16 | 
 17 |   return fetcher.fetchSite(url)
 18 | }
 19 | 
 20 | class Fetcher {
 21 |   #pages: FetchSiteResult = new Map()
 22 |   #fetched: Set<string> = new Set()
 23 |   #queue: Queue
 24 | 
 25 |   constructor(public options: Options) {
 26 |     const concurrency = options.concurrency || 3
 27 |     this.#queue = new Queue({ concurrency })
 28 |   }
 29 | 
 30 |   #limitReached() {
 31 |     return this.options.limit && this.#pages.size >= this.options.limit
 32 |   }
 33 | 
 34 |   #getContentSelector(pathname: string) {
 35 |     if (typeof this.options.contentSelector === "function")
 36 |       return this.options.contentSelector({ pathname })
 37 | 
 38 |     return this.options.contentSelector
 39 |   }
 40 | 
 41 |   async fetchSite(url: string) {
 42 |     logger.info(
 43 |       `Started fetching ${c.green(url)} with a concurrency of ${
 44 |         this.#queue.concurrency
 45 |       }`
 46 |     )
 47 | 
 48 |     await this.#fetchPage(url, {
 49 |       skipMatch: true,
 50 |     })
 51 | 
 52 |     await this.#queue.onIdle()
 53 | 
 54 |     return this.#pages
 55 |   }
 56 | 
 57 |   async #fetchPage(
 58 |     url: string,
 59 |     options: {
 60 |       skipMatch?: boolean
 61 |     }
 62 |   ) {
 63 |     const { host, pathname } = new URL(url)
 64 | 
 65 |     if (this.#fetched.has(pathname) || this.#limitReached()) {
 66 |       return
 67 |     }
 68 | 
 69 |     this.#fetched.add(pathname)
 70 | 
 71 |     // return if not matched
 72 |     // we don't need to extract content for this page
 73 |     if (
 74 |       !options.skipMatch &&
 75 |       this.options.match &&
 76 |       !matchPath(pathname, this.options.match)
 77 |     ) {
 78 |       return
 79 |     }
 80 | 
 81 |     logger.info(`Fetching ${c.green(url)}`)
 82 | 
 83 |     const res = await (this.options.fetch || fetch)(url, {
 84 |       headers: {
 85 |         "user-agent": "Sitefetch (https://github.com/egoist/sitefetch)",
 86 |       },
 87 |     })
 88 | 
 89 |     if (!res.ok) {
 90 |       logger.warn(`Failed to fetch ${url}: ${res.statusText}`)
 91 |       return
 92 |     }
 93 | 
 94 |     if (this.#limitReached()) {
 95 |       return
 96 |     }
 97 | 
 98 |     const contentType = res.headers.get("content-type")
 99 | 
100 |     if (!contentType?.includes("text/html")) {
101 |       logger.warn(`Not a HTML page: ${url}`)
102 |       return
103 |     }
104 | 
105 |     const resUrl = new URL(res.url)
106 | 
107 |     // redirected to other site, ignore
108 |     if (resUrl.host !== host) {
109 |       logger.warn(`Redirected from ${host} to ${resUrl.host}`)
110 |       return
111 |     }
112 |     const extraUrls: string[] = []
113 | 
114 |     const $ = load(await res.text())
115 |     $("script,style,link,img,video").remove()
116 | 
117 |     $("a").each((_, el) => {
118 |       const href = $(el).attr("href")
119 | 
120 |       if (!href) {
121 |         return
122 |       }
123 | 
124 |       try {
125 |         const thisUrl = new URL(href, url)
126 |         if (thisUrl.host !== host) {
127 |           return
128 |         }
129 | 
130 |         extraUrls.push(thisUrl.href)
131 |       } catch {
132 |         logger.warn(`Failed to parse URL: ${href}`)
133 |       }
134 |     })
135 | 
136 |     if (extraUrls.length > 0) {
137 |       for (const url of extraUrls) {
138 |         this.#queue.add(() =>
139 |           this.#fetchPage(url, { ...options, skipMatch: false })
140 |         )
141 |       }
142 |     }
143 | 
144 |     const window = new Window({
145 |       url,
146 |       settings: {
147 |         disableJavaScriptFileLoading: true,
148 |         disableJavaScriptEvaluation: true,
149 |         disableCSSFileLoading: true,
150 |       },
151 |     })
152 | 
153 |     const pageTitle = $("title").text()
154 |     const contentSelector = this.#getContentSelector(pathname)
155 |     const html = contentSelector
156 |       ? $(contentSelector).prop("outerHTML")
157 |       : $.html()
158 | 
159 |     if (!html) {
160 |       logger.warn(`No readable content on ${pathname}`)
161 |       return
162 |     }
163 | 
164 |     window.document.write(html)
165 | 
166 |     await window.happyDOM.waitUntilComplete()
167 | 
168 |     const article = new Readability(window.document as any).parse()
169 | 
170 |     await window.happyDOM.close()
171 | 
172 |     if (!article) {
173 |       return
174 |     }
175 | 
176 |     const content = toMarkdown(article.content)
177 | 
178 |     this.#pages.set(pathname, {
179 |       title: article.title || pageTitle,
180 |       url,
181 |       content,
182 |     })
183 |   }
184 | }
185 | 
/**
 * Turn the collected pages into a single string.
 *
 * @param pages Pages to serialize, in map iteration order.
 * @param format `"json"` produces a JSON array of the pages; `"text"` produces
 *   one `<page>` block per page, separated by a blank line.
 */
export function serializePages(
  pages: FetchSiteResult,
  format: "json" | "text"
): string {
  const list = Array.from(pages.values())

  if (format === "json") {
    return JSON.stringify(list)
  }

  const blocks: string[] = []
  for (const page of list) {
    const lines = [
      "<page>",
      `  <title>${page.title}</title>`,
      `  <url>${page.url}</url>`,
      `  <content>${page.content}</content>`,
      "</page>",
    ]
    blocks.push(lines.join("\n").trim())
  }

  return blocks.join("\n\n")
}
204 | 


--------------------------------------------------------------------------------
/src/logger.ts:
--------------------------------------------------------------------------------
 1 | import c from "picocolors"
 2 | 
 3 | type LoggerLevel = "silent" | "warn"
 4 | 
 5 | class Logger {
 6 |   private level?: LoggerLevel
 7 | 
 8 |   setLevel(level: LoggerLevel): void {
 9 |     this.level = level
10 |   }
11 | 
12 |   info(...args: any[]): void {
13 |     if (this.level === "silent") return
14 |     console.log(c.cyan("INFO"), ...args)
15 |   }
16 | 
17 |   warn(...args: any[]): void {
18 |     if (this.level === "silent") return
19 |     console.warn(c.yellow("WARN"), ...args)
20 |   }
21 | }
22 | 
23 | export const logger: Logger = new Logger()
24 | 


--------------------------------------------------------------------------------
/src/to-markdown.ts:
--------------------------------------------------------------------------------
 1 | import Turndown from "turndown"
 2 | import { gfm } from "turndown-plugin-gfm"
 3 | 
 4 | const turndown = new Turndown()
 5 | turndown.use(gfm)
 6 | 
 7 | export function toMarkdown(html: string): string {
 8 |   return turndown.turndown(html)
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
 1 | export type Options = {
 2 |   /** How many requests can be made at the same time */
 3 |   concurrency?: number
 4 | 
 5 |   /**
 6 |    * Match pathname by specific patterns, powered by micromatch
 7 |    * Only pages matched by this will be fetched
 8 |    */
 9 |   match?: string[]
10 | 
11 |   /**
12 |    * The CSS selector to find content
13 |    */
14 |   contentSelector?:
15 |     | string
16 |     | ((ctx: { pathname: string }) => string | void | undefined)
17 | 
18 |   /**
19 |    * Limit the result to this amount of pages
20 |    */
21 |   limit?: number
22 | 
23 |   /**
24 |    * A custom function to fetch URL
25 |    */
26 |   fetch?: (url: string, init: RequestInit) => Promise<Response>
27 | }
28 | 
/** One fetched page. */
export type Page = {
  /** Title of the page */
  title: string
  /** URL the page was requested with */
  url: string
  /** Extracted page content */
  content: string
}

/** Fetched pages, stored in a map with string keys. */
export type FetchSiteResult = Map<string, Page>
36 | 


--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
 1 | import micromatch from "micromatch"
 2 | 
 3 | // xK or xM
 4 | export function formatNumber(num: number): string {
 5 |   return num > 1000000
 6 |     ? `${(num / 1000000).toFixed(1)}M`
 7 |     : num > 1000
 8 |     ? `${(num / 1000).toFixed(1)}K`
 9 |     : num.toString()
10 | }
11 | 
/**
 * Test a path against one or more glob patterns using `micromatch.isMatch`.
 *
 * @param path The path to test.
 * @param pattern A single pattern or a list of patterns.
 */
export function matchPath(path: string, pattern: string | string[]): boolean {
  const matched = micromatch.isMatch(path, pattern)
  return matched
}
15 | 
/**
 * Normalize a value that may or may not be an array into an array.
 *
 * @param input A single item or an array of items.
 * @returns `input` itself when it is already an array, otherwise a new
 *   one-element array containing it.
 */
export function ensureArray<T>(input: T | T[]): T[] {
  if (Array.isArray(input)) {
    return input
  }
  return [input]
}
19 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "compilerOptions": {
 3 |       // Enable latest features
 4 |       "lib": ["ESNext"],
 5 |       "target": "ESNext",
 6 |       "module": "ESNext",
 7 |       "moduleDetection": "force",
 8 |       "jsx": "react-jsx",
 9 |       "isolatedDeclarations": true,
10 |       "declaration": true,
11 |   
12 |       // Bundler mode
13 |       "moduleResolution": "bundler",
14 |       "allowImportingTsExtensions": true,
15 |       "verbatimModuleSyntax": true,
16 |       "noEmit": true,
17 |   
18 |       // Best practices
19 |       "strict": true,
20 |       "skipLibCheck": true,
21 |       "noFallthroughCasesInSwitch": true,
22 |   
23 |       // Some stricter flags
24 |       "noUnusedLocals": true,
25 |       "noUnusedParameters": true,
26 |       "noPropertyAccessFromIndexSignature": true
27 |     }
28 |   }


--------------------------------------------------------------------------------