├── .gitignore ├── LICENSE ├── README.md ├── llmstxt.gif ├── package-lock.json ├── package.json └── src ├── cli ├── actions │ └── gen.js └── llmstxt.js └── lib └── helpers └── packageJson.js /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .nyc_output 3 | .tap 4 | coverage/ 5 | node_modules/ 6 | dist/* 7 | .env* 8 | llms.txt 9 | llms-full.txt 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, Scott Motte 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llmstxt 2 | 3 | > *generate `llms.txt`*–using your `sitemap.xml`. A `llms.txt` file is a curated list of your website's pages in markdown format, perfect for training or fine-tuning language models with your content. 4 | 5 |



6 | 7 |   8 | 9 | ### Quickstart [![npm version](https://img.shields.io/npm/v/llmstxt.svg)](https://www.npmjs.com/package/llmstxt) 10 | 11 | ```sh 12 | $ npx -y llmstxt gen https://vercel.com/sitemap.xml 13 | ``` 14 | 15 | *
expand example
16 | 17 | ``` 18 | $ npx -y llmstxt gen https://vercel.com/sitemap.xml 19 | - [Vercel Documentation](https://vercel.com/docs): Vercel's Frontend Cloud gives developers frameworks, workflows, and infrastructure to build a faster, more personalized web 20 | - [Accounts on Vercel](https://vercel.com/docs/accounts): Learn how to manage your Vercel account and team members. 21 | - [Create a Team](https://vercel.com/docs/accounts/create-a-team): Teams on Vercel allow you to collaborate with members on projects, and grant you access to additional resources. Learn how to create or join a team on Vercel. 22 | - [Create an Account](https://vercel.com/docs/accounts/create-an-account): Learn how to create a Hobby team on Vercel and manage your login connections through your dashboard. 23 | - [Manage Emails](https://vercel.com/docs/accounts/manage-emails): Learn how to manage your email addresses on Vercel. 24 | - [Account Plans on Vercel](https://vercel.com/docs/accounts/plans): Learn about the different plans available on Vercel. 25 | - [Vercel Enterprise Plan](https://vercel.com/docs/accounts/plans/enterprise): Learn about the Enterprise plan for Vercel, including features, pricing, and more. 26 | ... 27 | ``` 28 | 29 |
30 | 31 |   32 | 33 | ## Basics 34 | 35 | > Basic usage 36 | > 37 | 38 | *
`gen https://yoursite.com/sitemap.xml`
39 | 40 | Outputs to stdout. 41 | 42 | ```sh 43 | $ llmstxt gen https://vercel.com/sitemap.xml 44 | - [Vercel Documentation](https://vercel.com/docs): Vercel's Frontend Cloud gives developers frameworks, workflows, and infrastructure to build a faster, more personalized web 45 | - [Accounts on Vercel](https://vercel.com/docs/accounts): Learn how to manage your Vercel account and team members. 46 | - [Create a Team](https://vercel.com/docs/accounts/create-a-team): Teams on Vercel allow you to collaborate with members on projects, and grant you access to additional resources. Learn how to create or join a team on Vercel. 47 | - [Create an Account](https://vercel.com/docs/accounts/create-an-account): Learn how to create a Hobby team on Vercel and manage your login connections through your dashboard. 48 | - [Manage Emails](https://vercel.com/docs/accounts/manage-emails): Learn how to manage your email addresses on Vercel. 49 | - [Account Plans on Vercel](https://vercel.com/docs/accounts/plans): Learn about the different plans available on Vercel. 50 | - [Vercel Enterprise Plan](https://vercel.com/docs/accounts/plans/enterprise): Learn about the Enterprise plan for Vercel, including features, pricing, and more. 51 | ... 52 | ``` 53 | 54 |
55 | *
`gen https://yoursite.com/sitemap.xml > llms.txt`
56 | 57 | Write to file. 58 | 59 | ```sh 60 | $ llmstxt gen https://vercel.com/sitemap.xml > llms.txt 61 | ``` 62 | 63 |
64 | 65 |   66 | 67 | ## Advanced 68 | 69 | > Advanced options 70 | > 71 | 72 | *
`gen --exclude-path` - Exclude path(s)
73 | 74 | Exclude paths from generation. 75 | 76 | ```sh 77 | # exclude all blog posts 78 | $ llmstxt gen https://vercel.com/sitemap.xml --exclude-path "**/blog/**" 79 | 80 | # exclude all docs 81 | $ llmstxt gen https://vercel.com/sitemap.xml --exclude-path "**/docs/**" 82 | ``` 83 | 84 |
85 | *
`gen --include-path` - Include path(s)
86 | 87 | Include paths for generation. 88 | 89 | ```sh 90 | # include all docs only 91 | $ llmstxt gen https://vercel.com/sitemap.xml --include-path "**/docs/**" 92 | 93 | # include all blogs only 94 | $ llmstxt gen https://vercel.com/sitemap.xml -ip "**/blog/**" 95 | ``` 96 | 97 |
98 | *
`gen --replace-title s/pattern/replacement/` - Replace string(s) from title
99 | 100 | Use `--replace-title` to remove redundant text from your page titles. For example, dotenvx's titles all end with `| dotenvx`. To strip that suffix, replace it with an empty string. 101 | 102 | ```sh 103 | $ llmstxt gen https://vercel.com/sitemap.xml --replace-title 's/\| dotenvx//' 104 | ``` 105 | 106 | 
107 | *
`gen --title 'Your Heading'` - set title
108 | 109 | Set your website's heading 1 title. 110 | 111 | ```sh 112 | $ llmstxt gen https://vercel.com/sitemap.xml --title 'dotenvx' 113 | ``` 114 | 115 |
116 | *
`gen --description 'Some description'` - set description
117 | 118 | Set your website's description. 119 | 120 | ```sh 121 | $ llmstxt gen https://vercel.com/sitemap.xml --description 'This is a description' 122 | ``` 123 | 124 |
125 | 126 |   127 | 128 | ## FAQ 129 | 130 | #### Can you give me a real world example? 131 | 132 | I'm using it to generate [dotenvx.com/llms.txt](https://dotenvx.com/llms.txt) with the following command: 133 | 134 | ```sh 135 | npx -y llmstxt@latest gen https://example.com/sitemap.xml -ep "**/privacy**" -ep "**/terms**" -ep "**/blog/**" -ep "**/stats/**" -ep "**/support/**" -rt 's/\| dotenvx//' -t 'dotenvx' > llms.txt 136 | ``` 137 | -------------------------------------------------------------------------------- /llmstxt.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dotenvx/llmstxt/73632e0caeeb18412efe8df4a5a320356ac064e7/llmstxt.gif -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.7.0", 3 | "name": "llmstxt", 4 | "description": "convert `sitemap.xml` to `llms.txt`", 5 | "author": "@motdotla", 6 | "keywords": [ 7 | "llms.txt", 8 | "llms", 9 | "txt" 10 | ], 11 | "homepage": "https://github.com/dotenvx/llmstxt", 12 | "repository": { 13 | "type": "git", 14 | "url": "git+https://github.com/dotenvx/llmstxt.git" 15 | }, 16 | "license": "BSD-3-Clause", 17 | "files": [ 18 | "src/**/*", 19 | "CHANGELOG.md" 20 | ], 21 | "bin": { 22 | "llmstxt": "./src/cli/llmstxt.js" 23 | }, 24 | "scripts": { 25 | "standard": "standard", 26 | "standard:fix": "standard --fix" 27 | }, 28 | "funding": "https://dotenvx.com", 29 | "dependencies": { 30 | "cheerio": "^1.0.0", 31 | "commander": "^11.1.0", 32 | "ora": "^5.4.1", 33 | "picomatch": "^4.0.2", 34 | "replace-in-file": "^8.2.0", 35 | "sitemapper": "^3.2.18", 36 | "turndown": "^7.2.0", 37 | "undici": "^6.21.0" 38 | }, 39 | "devDependencies": { 40 | "standard": "^17.1.2" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/cli/actions/gen.js: 
// --------------------------------------------------------------------------------
const { URL } = require('url')
const cheerio = require('cheerio')
const picomatch = require('picomatch')
const { request } = require('undici')
const Sitemapper = require('sitemapper')
const sitemap = new Sitemapper()
const ora = require('ora')
const TurndownService = require('turndown')

// Used when --concurrency is not given.
const DEFAULT_CONCURRENCY = 5

// Section name given to pages that live at the site root (no path segment).
const ROOT_SECTION = 'ROOT'

/**
 * Download a page and return its body as text.
 *
 * @param {string} url - page to fetch
 * @returns {Promise<string|null>} the raw HTML, or null when the request
 *   fails or the server answers with an error status
 */
async function fetchHtml (url) {
  try {
    const { statusCode, body } = await request(url)

    // Read the body even for error responses so it never stays unconsumed.
    const rawHtml = await body.text()

    // 4xx/5xx error pages must not be listed as if they were real content.
    if (statusCode >= 400) {
      return null
    }

    return rawHtml
  } catch (_error) {
    return null
  }
}

/**
 * Extract the trimmed text of `<head><title>`.
 *
 * @param {string} html - raw page HTML
 * @returns {Promise<string|null>} the title ('' when absent), or null when parsing throws
 */
async function getTitle (html) {
  try {
    const $ = cheerio.load(html)
    return $('head > title').text().trim()
  } catch (_error) {
    return null
  }
}

/**
 * Extract the page description from the usual meta tags.
 *
 * @param {string} html - raw page HTML
 * @returns {Promise<string|undefined|null>} the first description found,
 *   undefined when none of the tags is present, or null when parsing throws
 */
async function getDescription (html) {
  try {
    const $ = cheerio.load(html)

    // Check for <meta name="description">
    let description = $('head > meta[name="description"]').attr('content')

    // Fallback to <meta property="og:description">
    if (!description) {
      description = $('head > meta[property="og:description"]').attr('content')
    }

    // Fallback to <meta name="twitter:description">
    if (!description) {
      description = $('head > meta[name="twitter:description"]').attr('content')
    }

    return description
  } catch (_error) {
    return null
  }
}

/**
 * Parse a sed-like `s/pattern/replacement/flags` command.
 *
 * @param {string} command - substitution command
 * @returns {{pattern: RegExp, replacement: string}}
 * @throws {Error} when the command does not have the `s/…/…/` shape
 */
function parseSubstitutionCommand (command) {
  const match = command.match(/^s\/(.*?)\/(.*?)\/([gimsuy]*)$/) // trailing group captures optional flags

  if (!match) {
    throw new Error('Invalid substitution command format')
  }

  const [, pattern, replacement, flags = ''] = match
  return { pattern: new RegExp(pattern, flags), replacement }
}

/**
 * Name the section a URL belongs to: its first path segment.
 *
 * @param {string} uri - absolute URL
 * @returns {string} first path segment, or 'ROOT' for the site root and for unparsable URLs
 */
function parseSection (uri) {
  try {
    const url = new URL(uri)
    const segments = url.pathname.split('/').filter(Boolean)
    return segments[0] || ROOT_SECTION
  } catch (_error) {
    return ROOT_SECTION
  }
}

/**
 * Apply one substitution command to a title.
 *
 * @param {string} title - page title
 * @param {string} command - `s/pattern/replacement/flags`; anything that does
 *   not start with `s/` is ignored and the title is returned unchanged
 * @returns {string} the rewritten title
 * @throws {Error} when the command starts with `s/` but is malformed
 */
function substituteTitle (title, command) {
  if (typeof command !== 'string' || !command.startsWith('s/')) {
    return title
  }

  const { pattern, replacement } = parseSubstitutionCommand(command)

  return title.replace(pattern, replacement)
}

/**
 * Tell whether a URL points at the site root. Not referenced anywhere in this
 * file at the moment and not exported.
 *
 * @param {string} uri - absolute URL
 * @returns {boolean} true when the path is exactly '/', false otherwise or when unparsable
 */
function isRootUrl (uri) { // eslint-disable-line no-unused-vars
  try {
    const url = new URL(uri)
    return url.pathname === '/'
  } catch (_error) {
    return false
  }
}

/**
 * Upper-case the first character and lower-case the rest.
 *
 * @param {string} str - text to capitalize
 * @returns {string} capitalized text, or '' for empty / non-string input
 */
function capitalizeString (str) {
  if (!str || typeof str !== 'string') {
    return ''
  }

  return str.charAt(0).toUpperCase() + str.slice(1).toLowerCase()
}

/**
 * Strip a leading '|' separator (and surrounding whitespace) from a title.
 *
 * @param {string} title - page title
 * @returns {string} cleaned title, or '' for empty input
 */
function cleanTitle (title) {
  if (!title) return ''
  return title.replace(/^\|\s*/, '').trim()
}

/**
 * Run every `--replace-title` command over a title, then clean it up.
 *
 * @param {string} title - raw page title
 * @param {string[]} replaceTitle - substitution commands, applied in order
 * @returns {string} the final title
 */
function normalizeTitle (title, replaceTitle) {
  const replaced = replaceTitle.reduce(
    (current, command) => substituteTitle(current, command),
    title
  )
  return cleanTitle(replaced)
}

/**
 * Build the include/exclude check shared by `gen` and `gen-full`.
 *
 * @param {{excludePath?: string[], includePath?: string[]}} options - parsed CLI options
 * @returns {function(string): (string|null)} returns the reason a URL must be
 *   skipped ('excluded' / 'not included'), or null when it should be processed
 */
function createPathFilter (options) {
  const excludePaths = options.excludePath || []
  const includePaths = options.includePath || []
  const isExcluded = picomatch(excludePaths)
  const isIncluded = picomatch(includePaths, { ignore: excludePaths })

  return function skipReason (url) {
    if (isExcluded(url)) return 'excluded'

    // An include list implicitly excludes everything it does not match.
    if (includePaths.length > 0 && !isIncluded(url)) return 'not included'

    return null
  }
}

/**
 * Process items in consecutive batches so that at most `concurrency`
 * processor calls are in flight at once.
 *
 * @param {Array} items - items to process
 * @param {Function} processor - async `(item, index) => result`
 * @param {number} concurrency - maximum number of concurrent operations
 * @returns {Promise<Array>} truthy results, in the same order as `items`
 */
async function processInBatches (items, processor, concurrency = 10) {
  // 0, negative or NaN would make the loop below never advance (or walk backwards).
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1
  const results = []

  for (let start = 0; start < items.length; start += batchSize) {
    const batch = items.slice(start, start + batchSize)

    // Wait for the whole batch before starting the next one.
    const batchResults = await Promise.all(
      batch.map((item, offset) => processor(item, start + offset))
    )
    results.push(...batchResults)
  }

  return results.filter(Boolean) // drop null/undefined results
}

/**
 * Pick the HTML of the main content area, falling back to broader containers.
 *
 * @param {string} html - raw page HTML
 * @returns {string} HTML of the first non-empty candidate, or the whole document
 */
function extractMainHtml (html) {
  const $ = cheerio.load(html)

  return (
    $('main').html() ||
    $('[role=main]').html() ||
    $('.content, #content, .post, .docs, .article').first().html() ||
    $('article').html() ||
    $('body').html() ||
    html
  )
}

/**
 * Turn a title into the anchor used by the table of contents.
 *
 * @param {string} title - page title
 * @returns {string} lower-case, dash-separated slug without leading/trailing dashes
 */
function toAnchor (title) {
  return title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
}

/**
 * Create the HTML-to-markdown converter used by `gen-full`.
 *
 * @returns {TurndownService}
 */
function createTurndownService () {
  const service = new TurndownService({
    codeBlockStyle: 'fenced',
    headingStyle: 'atx',
    bulletListMarker: '-',
    emDelimiter: '*',
    hr: '---'
  })

  // Keep each table as its own block. `content` already holds the converted
  // table text; converting `node.outerHTML` again from inside this rule would
  // hit the same rule and recurse until the stack overflows.
  service.addRule('table', {
    filter: 'table',
    replacement: (content) => `\n\n${content.trim()}\n\n`
  })

  return service
}

/**
 * `llmstxt gen` action: print an llms.txt index (one link per page, grouped by
 * first path segment) to stdout. Invoked by commander with `this` bound to the
 * command.
 *
 * @param {string} sitemapUrl - URL of the sitemap.xml to crawl
 * @returns {Promise<void>}
 */
async function gen (sitemapUrl) {
  const options = this.opts()

  const spinner = ora('generating').start()

  const skipReason = createPathFilter(options)
  const replaceTitle = options.replaceTitle || []
  const concurrency = options.concurrency || DEFAULT_CONCURRENCY

  // A Map (not a plain object) so that path segments such as `constructor`
  // or `__proto__` are ordinary section names.
  const sections = new Map()

  try {
    spinner.text = sitemapUrl
    const { sites } = await sitemap.fetch(sitemapUrl)

    const processUrl = async (url, index) => {
      spinner.text = `Processing [${index + 1}/${sites.length}]: ${url}`

      if (skipReason(url)) return null

      const html = await fetchHtml(url)
      if (!html) return null

      const rawTitle = await getTitle(html)
      if (!rawTitle) return null

      return {
        title: normalizeTitle(rawTitle, replaceTitle),
        url,
        description: await getDescription(html),
        section: parseSection(url)
      }
    }

    const pages = await processInBatches(sites, processUrl, concurrency)

    for (const { title, url, description, section } of pages) {
      if (!sections.has(section)) {
        sections.set(section, [])
      }
      sections.get(section).push({ title, url, description })
    }
  } catch (error) {
    // Do not print a placeholder document: stdout is usually redirected
    // straight into llms.txt.
    spinner.fail(`Error processing sitemap: ${error.message}`)
    process.exitCode = 1
    return
  }

  // The root page supplies the default heading and description.
  const rootPage = (sections.get(ROOT_SECTION) || [])[0]
  sections.delete(ROOT_SECTION)

  const heading = options.title || rootPage?.title || 'Documentation'
  const summary = options.description || rootPage?.description || 'Generated documentation'

  let output = `# ${heading}\n\n> ${summary}\n\n`

  for (const [section, lines] of sections) {
    output += `## ${capitalizeString(section)}\n`
    for (const { title, url, description } of lines) {
      output += `\n- [${title}](${url})${description ? ': ' + description : ''}`
    }
    output += '\n\n'
  }

  spinner.succeed('generated')

  console.log(output)
}

/**
 * `llmstxt gen-full` action: print the full markdown content of every page,
 * preceded by a table of contents, to stdout.
 *
 * @param {string} sitemapUrl - URL of the sitemap.xml to crawl
 * @returns {Promise<void>}
 */
async function genFull (sitemapUrl) {
  const options = this.opts ? this.opts() : {}
  const spinner = ora('generating full content').start()

  const skipReason = createPathFilter(options)
  const replaceTitle = options.replaceTitle || []
  const concurrency = options.concurrency || DEFAULT_CONCURRENCY
  const turndownService = createTurndownService()

  try {
    spinner.text = sitemapUrl
    const sites = await sitemap.fetch(sitemapUrl)

    // Use lastmod from the sitemap result when it exposes per-URL entries.
    const urlToLastMod = new Map()
    if (Array.isArray(sites.urls)) {
      for (const entry of sites.urls) {
        if (entry.loc && entry.lastmod) urlToLastMod.set(entry.loc, entry.lastmod)
      }
    }

    // Every URL yields either `{ page }` or `{ skipped }`; returning them
    // (instead of pushing to shared arrays) keeps the output in sitemap order.
    const processUrl = async (url, index) => {
      spinner.text = `Processing [${index + 1}/${sites.sites.length}]: ${url}`

      const reason = skipReason(url)
      if (reason) return { skipped: { url, reason } }

      const html = await fetchHtml(url)
      if (!html) return { skipped: { url, reason: 'fetch failed' } }

      const rawTitle = await getTitle(html)
      if (!rawTitle) return { skipped: { url, reason: 'no title' } }

      const title = normalizeTitle(rawTitle, replaceTitle)

      return {
        page: {
          title,
          url,
          description: await getDescription(html),
          markdown: turndownService.turndown(extractMainHtml(html)),
          anchor: toAnchor(title),
          lastmod: urlToLastMod.get(url)
        }
      }
    }

    const outcomes = await processInBatches(sites.sites, processUrl, concurrency)
    const pages = outcomes.filter((outcome) => outcome.page).map((outcome) => outcome.page)
    const skipped = outcomes.filter((outcome) => outcome.skipped).map((outcome) => outcome.skipped)

    // Table of contents
    let toc = '# Table of Contents\n'
    for (const page of pages) {
      toc += `- [${page.title}](#${page.anchor})\n`
    }

    // Document
    let output = `# ${options.title || 'Full Documentation'}\n\n`
    output += toc + '\n'
    for (const page of pages) {
      output += '\n\n---\n\n'
      output += `## ${page.title}\n\n`
      output += `[${page.url}](${page.url})\n\n`
      if (page.description) output += `> ${page.description}\n\n`
      if (page.lastmod) output += `*Last modified: ${page.lastmod}*\n\n`
      output += page.markdown + '\n'
    }

    if (skipped.length > 0) {
      output += '\n\n---\n\n## Skipped Pages\n'
      for (const { url, reason } of skipped) {
        output += `- ${url} (${reason})\n`
      }
    }

    spinner.succeed('full content generated')
    console.log(output)
  } catch (error) {
    spinner.fail('Error processing sitemap: ' + error.message)
    process.exitCode = 1
  }
}

module.exports = Object.assign(gen, { genFull })

// --------------------------------------------------------------------------------
// /src/cli/llmstxt.js:
// --------------------------------------------------------------------------------
// #!/usr/bin/env node   <- shebang; it is the very first line of the real file

const { Command, InvalidArgumentError } = require('commander')
const program = new Command()

const packageJson = require('./../lib/helpers/packageJson')

/**
 * Option parser for --concurrency. `parseInt` must not be handed to commander
 * directly: commander passes the previous value as second argument, which
 * `parseInt` would interpret as the radix.
 *
 * @param {string} value - raw command-line value
 * @returns {number} a positive integer
 * @throws {InvalidArgumentError} when the value is not a positive integer
 */
function parseConcurrency (value) {
  const parsed = Number.parseInt(value, 10)
  if (!Number.isInteger(parsed) || parsed < 1) {
    throw new InvalidArgumentError('must be a positive integer')
  }
  return parsed
}

/**
 * Attach the argument and options shared by `gen` and `gen-full`, so the two
 * commands cannot drift apart.
 *
 * @param {Command} command - sub-command to configure
 * @returns {Command} the same command, for chaining
 */
function withGenerationOptions (command) {
  return command
    .argument('[url]', 'sitemap url', 'https://vercel.com/sitemap.xml')
    .option('-ep, --exclude-path <excludePath...>', 'path(s) to exclude from generation (default: none)')
    .option('-ip, --include-path <includePath...>', 'path(s) to include from generation (default: all)')
    .option('-rt, --replace-title <replaceTitle...>', 'replace string(s) from title (default: none)')
    .option('-t, --title <title>', 'set title (default: root page title)')
    .option('-d, --description <description>', 'set description (default: root page description)')
    .option('-c, --concurrency <concurrency>', 'maximum number of concurrent connections (default: 5)', parseConcurrency)
}

// cli
program
  .name('llmstxt')
  .description(packageJson.description)
  .version(packageJson.version)

// llmstxt gen
const genAction = require('./actions/gen')
withGenerationOptions(
  program.command('gen')
    .description('generate llms.txt')
).action(genAction)

// llmstxt gen-full
withGenerationOptions(
  program.command('gen-full')
    .description('generate llms-full.txt (full markdown content for each page)')
).action(genAction.genFull)

program.parse()
-------------------------------------------------------------------------------- /src/lib/helpers/packageJson.js: -------------------------------------------------------------------------------- 1 | const { name, version, description } = require('../../../package.json') 2 | 3 | module.exports = { name, version, description } 4 | --------------------------------------------------------------------------------