├── .gitignore
├── LICENSE
├── README.md
├── llmstxt.gif
├── package-lock.json
├── package.json
└── src
├── cli
├── actions
│ └── gen.js
└── llmstxt.js
└── lib
└── helpers
└── packageJson.js
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .nyc_output
3 | .tap
4 | coverage/
5 | node_modules/
6 | dist/*
7 | .env*
8 | llms.txt
9 | llms-full.txt
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2024, Scott Motte
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | 3. Neither the name of the copyright holder nor the names of its
16 | contributors may be used to endorse or promote products derived from
17 | this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # llmstxt
2 |
3 | > *generate `llms.txt`*–using your `sitemap.xml`. An `llms.txt` file is a curated list of your website's pages in markdown format, perfect for training or fine-tuning language models with your content.
4 |
5 |

6 |
7 |
8 |
9 | ### Quickstart [](https://www.npmjs.com/package/llmstxt)
10 |
11 | ```sh
12 | $ npx -y llmstxt gen https://vercel.com/sitemap.xml
13 | ```
14 |
15 | * expand example
16 |
17 | ```
18 | $ npx -y llmstxt gen https://vercel.com/sitemap.xml
19 | - [Vercel Documentation](https://vercel.com/docs): Vercel's Frontend Cloud gives developers frameworks, workflows, and infrastructure to build a faster, more personalized web
20 | - [Accounts on Vercel](https://vercel.com/docs/accounts): Learn how to manage your Vercel account and team members.
21 | - [Create a Team](https://vercel.com/docs/accounts/create-a-team): Teams on Vercel allow you to collaborate with members on projects, and grant you access to additional resources. Learn how to create or join a team on Vercel.
22 | - [Create an Account](https://vercel.com/docs/accounts/create-an-account): Learn how to create a Hobby team on Vercel and manage your login connections through your dashboard.
23 | - [Manage Emails](https://vercel.com/docs/accounts/manage-emails): Learn how to manage your email addresses on Vercel.
24 | - [Account Plans on Vercel](https://vercel.com/docs/accounts/plans): Learn about the different plans available on Vercel.
25 | - [Vercel Enterprise Plan](https://vercel.com/docs/accounts/plans/enterprise): Learn about the Enterprise plan for Vercel, including features, pricing, and more.
26 | ...
27 | ```
28 |
29 |
30 |
31 |
32 |
33 | ## Basics
34 |
35 | > Basic usage
36 | >
37 |
38 | * `gen https://yoursite.com/sitemap.xml`
39 |
40 | Outputs to stdout.
41 |
42 | ```sh
43 | $ llmstxt gen https://vercel.com/sitemap.xml
44 | - [Vercel Documentation](https://vercel.com/docs): Vercel's Frontend Cloud gives developers frameworks, workflows, and infrastructure to build a faster, more personalized web
45 | - [Accounts on Vercel](https://vercel.com/docs/accounts): Learn how to manage your Vercel account and team members.
46 | - [Create a Team](https://vercel.com/docs/accounts/create-a-team): Teams on Vercel allow you to collaborate with members on projects, and grant you access to additional resources. Learn how to create or join a team on Vercel.
47 | - [Create an Account](https://vercel.com/docs/accounts/create-an-account): Learn how to create a Hobby team on Vercel and manage your login connections through your dashboard.
48 | - [Manage Emails](https://vercel.com/docs/accounts/manage-emails): Learn how to manage your email addresses on Vercel.
49 | - [Account Plans on Vercel](https://vercel.com/docs/accounts/plans): Learn about the different plans available on Vercel.
50 | - [Vercel Enterprise Plan](https://vercel.com/docs/accounts/plans/enterprise): Learn about the Enterprise plan for Vercel, including features, pricing, and more.
51 | ...
52 | ```
53 |
54 |
55 | * `gen https://yoursite.com/sitemap.xml > llms.txt`
56 |
57 | Write to file.
58 |
59 | ```sh
60 | $ llmstxt gen https://vercel.com/sitemap.xml > llms.txt
61 | ```
62 |
63 |
64 |
65 |
66 |
67 | ## Advanced
68 |
69 | > Advanced options
70 | >
71 |
72 | * `gen --exclude-path` - Exclude path(s)
73 |
74 | Exclude paths from generation.
75 |
76 | ```sh
77 | # exclude all blog posts
78 | $ llmstxt gen https://vercel.com/sitemap.xml --exclude-path "**/blog/**"
79 |
80 | # exclude all docs
81 | $ llmstxt gen https://vercel.com/sitemap.xml --exclude-path "**/docs/**"
82 | ```
83 |
84 |
85 | * `gen --include-path` - Include path(s)
86 |
87 | Include paths for generation.
88 |
89 | ```sh
90 | # include all docs only
91 | $ llmstxt gen https://vercel.com/sitemap.xml --include-path "**/docs/**"
92 |
93 | # include all blogs only
94 | $ llmstxt gen https://vercel.com/sitemap.xml -ip "**/blog/**"
95 | ```
96 |
97 |
98 | * `gen --replace-title s/pattern/replacement/` - Replace string(s) from title
99 |
100 | Use `--replace-title` to remove redundant text from your page titles. For example, dotenvx's titles all end with `| dotenvx`, so the command below replaces that suffix with an empty string.
101 |
102 | ```sh
103 | $ llmstxt gen https://vercel.com/sitemap.xml --replace-title 's/\| dotenvx//'
104 | ```
105 |
106 |
107 | * `gen --title 'Your Heading'` - set title
108 |
109 | Set your website's heading 1 title.
110 |
111 | ```sh
112 | $ llmstxt gen https://vercel.com/sitemap.xml --title 'dotenvx'
113 | ```
114 |
115 |
116 | * `gen --description 'Some description'` - set description
117 |
118 | Set your website's description.
119 |
120 | ```sh
121 | $ llmstxt gen https://vercel.com/sitemap.xml --description 'This is a description'
122 | ```
123 |
124 |
125 |
126 |
127 |
128 | ## FAQ
129 |
130 | #### Can you give me a real world example?
131 |
132 | I'm using it to generate [dotenvx.com/llms.txt](https://dotenvx.com/llms.txt) with the following command:
133 |
134 | ```sh
135 | npx -y llmstxt@latest gen https://example.com/sitemap.xml -ep "**/privacy**" -ep "**/terms**" -ep "**/blog/**" -ep "**/stats/**" -ep "**/support/**" -rt 's/\| dotenvx//' -t 'dotenvx' > llms.txt
136 | ```
137 |
--------------------------------------------------------------------------------
/llmstxt.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dotenvx/llmstxt/73632e0caeeb18412efe8df4a5a320356ac064e7/llmstxt.gif
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.7.0",
3 | "name": "llmstxt",
4 | "description": "convert `sitemap.xml` to `llms.txt`",
5 | "author": "@motdotla",
6 | "keywords": [
7 | "llms.txt",
8 | "llms",
9 | "txt"
10 | ],
11 | "homepage": "https://github.com/dotenvx/llmstxt",
12 | "repository": {
13 | "type": "git",
14 | "url": "git+https://github.com/dotenvx/llmstxt.git"
15 | },
16 | "license": "BSD-3-Clause",
17 | "files": [
18 | "src/**/*",
19 | "CHANGELOG.md"
20 | ],
21 | "bin": {
22 | "llmstxt": "./src/cli/llmstxt.js"
23 | },
24 | "scripts": {
25 | "standard": "standard",
26 | "standard:fix": "standard --fix"
27 | },
28 | "funding": "https://dotenvx.com",
29 | "dependencies": {
30 | "cheerio": "^1.0.0",
31 | "commander": "^11.1.0",
32 | "ora": "^5.4.1",
33 | "picomatch": "^4.0.2",
34 | "replace-in-file": "^8.2.0",
35 | "sitemapper": "^3.2.18",
36 | "turndown": "^7.2.0",
37 | "undici": "^6.21.0"
38 | },
39 | "devDependencies": {
40 | "standard": "^17.1.2"
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/cli/actions/gen.js:
--------------------------------------------------------------------------------
1 | const { URL } = require('url')
2 | const cheerio = require('cheerio')
3 | const picomatch = require('picomatch')
4 | const { request } = require('undici')
5 | const Sitemapper = require('sitemapper')
6 | const sitemap = new Sitemapper()
7 | const ora = require('ora')
8 | const TurndownService = require('turndown')
9 |
/**
 * Download a page and return its body as text.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @returns {Promise<string|null>} The response body, or `null` when the
 *   request fails or the server answers with a non-2xx status.
 */
async function fetchHtml (url) {
  try {
    const { statusCode, body } = await request(url)

    // Error pages and redirects are not page content: listing a
    // "404 Not Found" or "301 Moved Permanently" title would pollute the output.
    if (statusCode < 200 || statusCode >= 300) {
      // Discard the unread body before giving up on this URL.
      await body.dump()
      return null
    }

    return await body.text()
  } catch (_error) {
    // Best effort: callers treat `null` as "skip this URL".
    return null
  }
}
19 |
/**
 * Read the text of the `head > title` element of an HTML document.
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {Promise<string|null>} The trimmed title text (possibly empty),
 *   or `null` when parsing throws.
 */
async function getTitle (html) {
  let title
  try {
    const titleNode = cheerio.load(html)('head > title')
    title = titleNode.text().trim()
  } catch (_error) {
    title = null
  }
  return title
}
28 |
/**
 * Read a page description from the document's meta tags.
 *
 * The selectors are tried in order and the first non-empty `content`
 * attribute wins:
 *   1. meta[name="description"]
 *   2. meta[property="og:description"]
 *   3. meta[name="twitter:description"]
 *
 * @param {string} html - Raw HTML of the page.
 * @returns {Promise<string|undefined|null>} The description, the last
 *   (empty or missing) lookup result when no tag has content, or `null`
 *   when parsing throws.
 */
async function getDescription (html) {
  const selectors = [
    'head > meta[name="description"]',
    'head > meta[property="og:description"]',
    'head > meta[name="twitter:description"]'
  ]

  try {
    const $ = cheerio.load(html)

    let description
    for (const selector of selectors) {
      description = $(selector).attr('content')
      if (description) break
    }

    return description
  } catch (_error) {
    return null
  }
}
51 |
/**
 * Parse a sed-style substitution command (`s/pattern/replacement/flags`).
 *
 * A literal `/` can be written as `\/` in both the pattern and the
 * replacement. Any other backslash sequence in the pattern is handed to
 * `RegExp` untouched.
 *
 * @param {string} command - The command, e.g. `s/\| dotenvx//` or `s/a\/b/c/g`.
 * @returns {{ pattern: RegExp, replacement: string }} Compiled pattern and
 *   the replacement string to pass to `String.prototype.replace`.
 * @throws {Error} When the command does not have the `s/…/…/flags` shape.
 * @throws {SyntaxError} When the pattern or flags are not a valid RegExp.
 */
function parseSubstitutionCommand (command) {
  // Pattern part: a run of escaped characters (`\x`) or characters that are
  // neither `/` nor `\`, so an escaped delimiter no longer ends the pattern.
  // The replacement stays lazy so previously accepted commands still parse.
  const match = /^s\/((?:\\.|[^/\\])*)\/(.*?)\/([gimsuy]*)$/.exec(command)

  if (!match) {
    throw new Error('Invalid substitution command format')
  }

  const [, pattern, rawReplacement, flags] = match

  // `\/` is only an escape for the delimiter; the replacement wants a plain `/`.
  const replacement = rawReplacement.replace(/\\\//g, '/')

  return { pattern: new RegExp(pattern, flags), replacement }
}
64 |
/**
 * Derive the section name of a URL from its first non-empty path segment.
 *
 * @param {string} uri - Absolute URL.
 * @returns {string} The first path segment, or `'ROOT'` when the path has
 *   no segments or the URL cannot be parsed.
 */
function parseSection (uri) {
  let pathname
  try {
    pathname = new URL(uri).pathname
  } catch (_error) {
    return 'ROOT'
  }

  const firstSegment = pathname.split('/').find((segment) => segment !== '')
  return firstSegment || 'ROOT'
}
74 |
/**
 * Apply one `s/pattern/replacement/flags` command to a title.
 *
 * @param {string} title - Title to rewrite.
 * @param {string} command - Substitution command; anything empty or not
 *   starting with `s/` leaves the title untouched.
 * @returns {string} The rewritten title.
 */
function substituteTitle (title, command) {
  const isSubstitution = Boolean(command) && !(command.length < 1) && command.startsWith('s/')
  if (!isSubstitution) {
    return title
  }

  const parsed = parseSubstitutionCommand(command)
  return title.replace(parsed.pattern, parsed.replacement)
}
84 |
/**
 * Tell whether a URL points at the site root (pathname `/`).
 *
 * @param {string} uri - Absolute URL.
 * @returns {boolean} `true` for a root URL, `false` otherwise or when the
 *   URL cannot be parsed.
 */
function isRootUrl (uri) {
  let parsed
  try {
    parsed = new URL(uri)
  } catch (_error) {
    return false
  }
  return parsed.pathname === '/'
}
93 |
/**
 * Upper-case the first character of a string and lower-case the rest.
 *
 * @param {*} str - Value to capitalize.
 * @returns {string} The capitalized string, or `''` for empty or
 *   non-string input.
 */
function capitalizeString (str) {
  if (typeof str !== 'string' || str === '') {
    return ''
  }

  const head = str.charAt(0)
  const tail = str.slice(1)
  return `${head.toUpperCase()}${tail.toLowerCase()}`
}
101 |
/**
 * Strip a leading `|` (and the whitespace after it) from a title, then
 * trim surrounding whitespace.
 *
 * @param {string} title - Title to clean.
 * @returns {string} The cleaned title, or `''` for a falsy title.
 */
function cleanTitle (title) {
  if (title) {
    const withoutLeadingPipe = title.replace(/^\|\s*/, '')
    return withoutLeadingPipe.trim()
  }
  return ''
}
107 |
/**
 * Process items in consecutive batches, running at most `concurrency`
 * processor calls at the same time. A batch must finish completely before
 * the next one starts.
 *
 * @param {Array} items - Items to process.
 * @param {Function} processor - Async function called as
 *   `processor(item, index)`, where `index` is the item's position in `items`.
 * @param {number} [concurrency=10] - Batch size. Values that are not a
 *   number of at least 1 (0, negatives, NaN) fall back to 1.
 * @returns {Promise<Array>} Results in input order with falsy results
 *   (e.g. `null` for skipped items) removed.
 */
async function processInBatches (items, processor, concurrency = 10) {
  // The batch size is the loop step: 0 or a negative step would never
  // terminate and NaN would silently skip every item.
  const batchSize = Math.max(1, Math.floor(concurrency) || 1)
  const results = []

  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize)

    // Wait for the whole batch; Promise.all keeps the results in batch order.
    const batchResults = await Promise.all(
      batch.map(async (item, offset) => processor(item, start + offset))
    )
    results.push(...batchResults)
  }

  return results.filter(Boolean)
}
136 |
/**
 * `gen` command action: build an llms.txt index from a sitemap and print
 * it to stdout.
 *
 * Every sitemap URL that passes the include/exclude filters is fetched.
 * Its title and description are extracted, and the entry is grouped under
 * the first path segment of its URL. Root pages (section `ROOT`) are not
 * listed; the first one supplies the default heading and description.
 *
 * Must be invoked with the commander command as `this` (options are read
 * via `this.opts()`).
 *
 * @param {string} sitemapUrl - URL of the sitemap to read.
 * @returns {Promise<void>}
 */
async function gen (sitemapUrl) {
  const options = this.opts()

  const spinner = ora('generating').start()

  // include/exclude logic
  const excludePaths = options.excludePath || []
  const includePaths = options.includePath || []
  const isExcluded = picomatch(excludePaths)
  const isIncluded = picomatch(includePaths, { ignore: excludePaths })

  // replaceTitle logic
  const replaceTitle = options.replaceTitle || []

  // Section names come from URL path segments. A prototype-less object
  // keeps a segment such as `constructor` from hitting an inherited property.
  const sections = Object.create(null)
  const concurrency = options.concurrency || 5

  try {
    spinner.text = sitemapUrl
    const sites = await sitemap.fetch(sitemapUrl)

    // Turn one sitemap URL into an entry, or `null` when it must be skipped.
    const processUrl = async (url, index) => {
      spinner.text = `Processing [${index + 1}/${sites.sites.length}]: ${url}`

      // path excluded - don't process it
      if (isExcluded(url)) {
        return null
      }

      // path effectively excluded (by not being in the list of includes) - don't process it
      if (includePaths.length > 0 && !isIncluded(url)) {
        return null
      }

      // html
      const html = await fetchHtml(url)
      if (!html) {
        return null
      }

      // title
      let title = await getTitle(html)
      if (!title) {
        return null
      }
      // `const` matters here: a bare `command` would be an implicit global.
      for (const command of replaceTitle) {
        title = substituteTitle(title, command)
      }
      title = cleanTitle(title)

      // description
      const description = await getDescription(html)

      // section
      const section = parseSection(url)

      return { title, url, description, section }
    }

    // Process URLs concurrently
    const results = await processInBatches(sites.sites, processUrl, concurrency)

    // Organize results into sections
    for (const result of results) {
      if (!result) continue

      const { title, url, description, section } = result

      // set up section
      sections[section] ||= []

      // add line
      sections[section].push({ title, url, description })
    }
  } catch (error) {
    // Best effort: report the failure and still print whatever was collected.
    console.error('Error processing sitemap:', error.message)
  }

  let output = ''

  // handle root
  const root = sections.ROOT || []
  delete sections.ROOT

  // Explicit option first, then the first root page, then a generic fallback.
  // Chaining with `||` also covers a root page with no title or description,
  // which would otherwise be printed as an empty heading or "> undefined".
  const heading = options.title || root[0]?.title || 'Documentation'
  const summary = options.description || root[0]?.description || 'Generated documentation'

  output += `# ${heading}`
  output += '\n'
  output += '\n'
  output += `> ${summary}`
  output += '\n'
  output += '\n'

  spinner.text = heading

  // handle sections
  for (const section in sections) {
    output += `## ${capitalizeString(section)}`
    output += '\n'
    for (const line of sections[section]) {
      const { title, url, description } = line
      output += '\n'
      output += `- [${title}](${url})${description ? ': ' + description : ''}`

      spinner.text = title
    }
    output += '\n'
    output += '\n'
  }
  spinner.succeed('generated')

  console.log(output)
}
253 |
/**
 * `gen-full` command action: print one markdown document containing the
 * converted content of every page in a sitemap.
 *
 * The output has a title, an optional description, a table of contents,
 * one section per page (URL, description, optional last-modified date,
 * body converted to markdown) and a list of skipped URLs with the reason.
 *
 * Options are read via `this.opts()` when available.
 *
 * @param {string} sitemapUrl - URL of the sitemap to read.
 * @returns {Promise<void>}
 */
async function genFull (sitemapUrl) {
  const options = this.opts ? this.opts() : {}
  const spinner = ora('generating full content').start()
  const excludePaths = options.excludePath || []
  const includePaths = options.includePath || []
  const isExcluded = picomatch(excludePaths)
  const isIncluded = picomatch(includePaths, { ignore: excludePaths })
  const replaceTitle = options.replaceTitle || []
  const concurrency = options.concurrency || 5

  // Configure Turndown for better markdown
  const turndownService = new TurndownService({
    codeBlockStyle: 'fenced',
    headingStyle: 'atx',
    bulletListMarker: '-',
    emDelimiter: '*',
    hr: '---'
  })
  // Keep tables as raw HTML. A custom `table` rule that calls
  // `turndown(node.outerHTML)` would match the same table again and recurse
  // until the stack overflows.
  turndownService.keep(['table'])

  const skipped = []

  try {
    spinner.text = sitemapUrl
    const sites = await sitemap.fetch(sitemapUrl)

    // Try to get lastmod from sitemap if available
    // NOTE(review): this only has an effect if the fetch result exposes a
    // `urls` array of `{ loc, lastmod }` entries - confirm against the sitemap client.
    const urlToLastMod = {}
    if (sites.urls && Array.isArray(sites.urls)) {
      for (const entry of sites.urls) {
        if (entry.loc && entry.lastmod) urlToLastMod[entry.loc] = entry.lastmod
      }
    }

    // Turn one sitemap URL into a page record, or `null` (with a `skipped`
    // entry) when the page cannot be used.
    const processUrl = async (url, index) => {
      spinner.text = `Processing [${index + 1}/${sites.sites.length}]: ${url}`

      if (isExcluded(url)) {
        skipped.push({ url, reason: 'excluded' })
        return null
      }
      if (includePaths.length > 0 && !isIncluded(url)) {
        skipped.push({ url, reason: 'not included' })
        return null
      }

      const html = await fetchHtml(url)
      if (!html) {
        skipped.push({ url, reason: 'fetch failed' })
        return null
      }

      let title = await getTitle(html)
      if (!title) {
        skipped.push({ url, reason: 'no title' })
        return null
      }
      for (const command of replaceTitle) {
        title = substituteTitle(title, command)
      }
      title = cleanTitle(title)

      // Use the most specific content container that exists; fall back to
      // the whole body, then to the raw HTML.
      const $ = cheerio.load(html)
      const mainHtml =
        $('main').html() ||
        $('[role=main]').html() ||
        $('.content, #content, .post, .docs, .article').first().html() ||
        $('article').html() ||
        $('body').html() ||
        html
      const markdown = turndownService.turndown(mainHtml)

      // Anchor used by the table of contents to link to the page heading
      const anchor = title.toLowerCase().replace(/[^a-z0-9]+/g, '-')
      const description = await getDescription(html)

      return { title, url, description, markdown, anchor, lastmod: urlToLastMod[url] }
    }

    // Use the returned records instead of pushing from inside the processor,
    // so pages appear in sitemap order and not in fetch-completion order.
    const pageInfos = await processInBatches(sites.sites, processUrl, concurrency)

    // Build TOC
    let toc = '# Table of Contents\n'
    for (const page of pageInfos) {
      toc += `- [${page.title}](#${page.anchor})\n`
    }

    // Build output
    let output = `# ${options.title || 'Full Documentation'}\n\n`
    // The command advertises --description, so print it when given.
    if (options.description) output += `> ${options.description}\n\n`
    output += toc + '\n'
    for (const page of pageInfos) {
      output += '\n\n---\n\n'
      output += `## ${page.title}\n\n`
      output += `[${page.url}](${page.url})\n\n`
      if (page.description) output += `> ${page.description}\n\n`
      if (page.lastmod) output += `*Last modified: ${page.lastmod}*\n\n`
      output += page.markdown + '\n'
    }
    if (skipped.length > 0) {
      output += '\n\n---\n\n## Skipped Pages\n'
      for (const s of skipped) {
        output += `- ${s.url} (${s.reason})\n`
      }
    }

    spinner.succeed('full content generated')
    console.log(output)
  } catch (error) {
    spinner.fail('Error processing sitemap: ' + error.message)
  }
}
348 |
// `gen` stays the module's export; `genFull` is attached to it as a
// property so the CLI can reach both actions from a single require.
gen.genFull = genFull
module.exports = gen
350 |
--------------------------------------------------------------------------------
/src/cli/llmstxt.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | const { Command } = require('commander')
4 | const program = new Command()
5 |
6 | const packageJson = require('./../lib/helpers/packageJson')
7 |
// cli
program
  .name('llmstxt')
  .description(packageJson.description)
  .version(packageJson.version)

const genAction = require('./actions/gen')

// commander calls option parsers as (value, previousValue). Passing
// `parseInt` directly would feed the previous value in as the radix.
const parseConcurrency = (value) => Number.parseInt(value, 10)

/**
 * Register the argument and options shared by `gen` and `gen-full`.
 *
 * Options need a value placeholder, otherwise commander parses them as
 * boolean flags. The actions expect arrays for the path and title
 * patterns (variadic `<name...>`) and single values for the rest.
 *
 * @param {import('commander').Command} command - Command to extend.
 * @returns {import('commander').Command} The same command, for chaining.
 */
function addSharedOptions (command) {
  return command
    .argument('[url]', 'sitemap url', 'https://vercel.com/sitemap.xml')
    .option('-ep, --exclude-path <excludePath...>', 'path(s) to exclude from generation (default: none)')
    .option('-ip, --include-path <includePath...>', 'path(s) to include from generation (default: all)')
    .option('-rt, --replace-title <replaceTitle...>', 'replace string(s) from title (default: none)')
    .option('-t, --title <title>', 'set title (default: root page title)')
    .option('-d, --description <description>', 'set description (default: root page description)')
    .option('-c, --concurrency <concurrency>', 'maximum number of concurrent connections (default: 5)', parseConcurrency)
}

// llmstxt gen
addSharedOptions(
  program.command('gen')
    .description('generate llms.txt')
).action(genAction)

// llmstxt gen-full
addSharedOptions(
  program.command('gen-full')
    .description('generate llms-full.txt (full markdown content for each page)')
).action(genAction.genFull)

program.parse()
40 |
--------------------------------------------------------------------------------
/src/lib/helpers/packageJson.js:
--------------------------------------------------------------------------------
1 | const { name, version, description } = require('../../../package.json')
2 |
3 | module.exports = { name, version, description }
4 |
--------------------------------------------------------------------------------