├── .DS_Store ├── tsconfig.json ├── src ├── tokenCounter.ts ├── index.ts ├── clijson.ts ├── clitoon.ts ├── clitext.ts └── scraper.ts ├── .github └── workflows │ ├── npm-publish.yml │ └── node.js.yaml ├── package.json ├── LICENSE ├── example.js ├── benchmark ├── results.md ├── results.json └── benchmark.js ├── .gitignore ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arnab2001/git-repo-parser/HEAD/.DS_Store -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "commonjs", 5 | "moduleResolution": "node", 6 | "declaration": true, 7 | "outDir": "./dist", 8 | "strict": true, 9 | "skipLibCheck": true 10 | }, 11 | "include": ["src/**/*.ts", "node_modules/git-repo-to-json/src/**/*.ts"], 12 | "exclude": ["node_modules"] 13 | } 14 | -------------------------------------------------------------------------------- /src/tokenCounter.ts: -------------------------------------------------------------------------------- 1 | import { encode } from 'gpt-tokenizer'; 2 | 3 | export interface TokenCountOptions { 4 | /** 5 | * Optional delimiter to join multiple segments prior to tokenising. 6 | * Defaults to a single newline. 7 | */ 8 | joinWith?: string; 9 | } 10 | 11 | export function countTokens( 12 | segments: string | string[], 13 | { joinWith = '\n' }: TokenCountOptions = {} 14 | ): number { 15 | const text = Array.isArray(segments) ? 
segments.join(joinWith) : segments; 16 | return encode(text).length; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /.github/workflows/npm-publish.yml: -------------------------------------------------------------------------------- 1 | name: Node.js Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-node@v3 13 | with: 14 | node-version: 22 15 | 16 | 17 | publish-npm: 18 | needs: build 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - uses: actions/setup-node@v3 23 | with: 24 | node-version: 22 25 | registry-url: https://registry.npmjs.org/ 26 | - run: npm install 27 | - run: npm run build 28 | - run: npm publish 29 | env: 30 | NODE_AUTH_TOKEN: ${{secrets.npm_token}} -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "git-repo-parser", 3 | "version": "3.0.0", 4 | "description": "A tool to scrape all files from a GitHub repository and turn it into a JSON file", 5 | "bin": { 6 | "git-repo-to-json": "dist/clijson.js", 7 | "git-repo-to-text": "dist/clitext.js", 8 | "git-repo-to-toon": "dist/clitoon.js" 9 | }, 10 | "files": [ 11 | "dist" 12 | ], 13 | "main": "dist/index.js", 14 | "scripts": { 15 | "build": "tsc", 16 | "start": "node dist/cli.js", 17 | "benchmark": "node benchmark/benchmark.js" 18 | }, 19 | "keywords": [ 20 | "github", 21 | "scraper", 22 | "json" 23 | ], 24 | "author": "arnab2001", 25 | "license": "MIT", 26 | "dependencies": { 27 | "@toon-format/toon": "^0.8.0", 28 | "gpt-tokenizer": "^3.4.0", 29 | "simple-git": "^2.41.0" 30 | }, 31 | "devDependencies": { 32 | "@types/node": "^18.11.19", 33 | "typescript": "^5.4.5" 34 | } 35 | } 36 | 
-------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { 2 | scrapeRepositoryToJson, 3 | scrapeRepositoryToPlainText, 4 | scrapeRepositoryToTranscript, 5 | scrapeRepositoryToToon, 6 | scrapeRepositoryToToonWithTokenCount, 7 | scrapeRepositoryToJsonWithTokenCount, 8 | scrapeRepositoryToPlainTextWithTokenCount, 9 | type ToonScrapeResult, 10 | type JsonScrapeResult, 11 | type TranscriptScrapeResult, 12 | type PlainTextScrapeResult, 13 | type TranscriptFormatOptions, 14 | } from './scraper'; 15 | import { countTokens, type TokenCountOptions } from './tokenCounter'; 16 | 17 | export { 18 | scrapeRepositoryToJson, 19 | scrapeRepositoryToPlainText, 20 | scrapeRepositoryToTranscript, 21 | scrapeRepositoryToToon, 22 | scrapeRepositoryToToonWithTokenCount, 23 | scrapeRepositoryToJsonWithTokenCount, 24 | scrapeRepositoryToPlainTextWithTokenCount, 25 | type ToonScrapeResult, 26 | type JsonScrapeResult, 27 | type TranscriptScrapeResult, 28 | type PlainTextScrapeResult, 29 | type TranscriptFormatOptions, 30 | countTokens, 31 | type TokenCountOptions, 32 | }; 33 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [ 22.x] 20 | # See supported Node.js release schedule at 
https://nodejs.org/en/about/releases/ 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'npm' 29 | - run: npm install 30 | - run: npm run build 31 | - run: npm link 32 | - run: git-repo-to-json https://github.com/arnab2001/css-modules-transformer 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Arnab Chatterjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/clijson.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { scrapeRepositoryToJsonWithTokenCount } from './scraper'; 4 | import { promises as fs } from 'fs'; 5 | 6 | async function main() { 7 | const args = process.argv.slice(2); 8 | 9 | let repoUrl: string | undefined; 10 | const flags = new Set(); 11 | 12 | for (const arg of args) { 13 | if (arg.startsWith('-')) { 14 | flags.add(arg); 15 | } else if (!repoUrl) { 16 | repoUrl = arg; 17 | } else { 18 | console.warn(`Ignoring unexpected argument: ${arg}`); 19 | } 20 | } 21 | 22 | if (!repoUrl) { 23 | console.error('Please provide a GitHub repository URL.'); 24 | process.exit(1); 25 | } 26 | 27 | const showTokenCount = 28 | flags.has('--tokens') || 29 | flags.has('--token-count') || 30 | flags.has('--token') || 31 | flags.has('-t'); 32 | 33 | const { json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 34 | await fs.writeFile('files.json', `${json}\n`, { encoding: 'utf-8' }); 35 | console.log('File list has been saved to files.json'); 36 | 37 | if (showTokenCount) { 38 | console.log(`Token count (cl100k_base): ${tokenCount}`); 39 | } 40 | } 41 | 42 | main().catch(err => { 43 | console.error(err); 44 | process.exitCode = 1; 45 | }); -------------------------------------------------------------------------------- /src/clitoon.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { scrapeRepositoryToToonWithTokenCount } from './scraper'; 4 | import { promises as fs } from 'fs'; 5 | 6 | async function main() { 7 | const args = process.argv.slice(2); 8 | 9 | let repoUrl: string | undefined; 10 | const flags = new Set(); 11 | 12 | for (const arg of args) { 13 | if (arg.startsWith('-')) { 14 | flags.add(arg); 15 | } else if (!repoUrl) { 16 | repoUrl = arg; 17 | } else { 18 
| console.warn(`Ignoring unexpected argument: ${arg}`); 19 | } 20 | } 21 | 22 | if (!repoUrl) { 23 | console.error('Please provide a GitHub repository URL.'); 24 | process.exit(1); 25 | } 26 | 27 | const showTokenCount = 28 | flags.has('--tokens') || 29 | flags.has('--token-count') || 30 | flags.has('--token') || 31 | flags.has('-t'); 32 | 33 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 34 | await fs.writeFile('files.toon', `${toon}\n`, { encoding: 'utf-8' }); 35 | console.log('File list has been saved to files.toon'); 36 | 37 | if (showTokenCount) { 38 | console.log(`Token count (cl100k_base): ${tokenCount}`); 39 | } 40 | } 41 | 42 | main().catch(err => { 43 | console.error(err); 44 | process.exitCode = 1; 45 | }); 46 | 47 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs/promises'); 2 | const path = require('path'); 3 | const { 4 | scrapeRepositoryToJsonWithTokenCount, 5 | scrapeRepositoryToToonWithTokenCount, 6 | scrapeRepositoryToPlainTextWithTokenCount, 7 | } = require('./dist'); 8 | 9 | const OUTPUT_FILE = path.join(process.cwd(), 'output.txt'); 10 | const MAX_PREVIEW_LINES = 50; 11 | 12 | function previewLines(content, maxLines = MAX_PREVIEW_LINES) { 13 | return content.split('\n').slice(0, maxLines).join('\n'); 14 | } 15 | 16 | async function main() { 17 | const repoUrl = process.argv[2] ?? 
'https://github.com/kitops-ml/gh-kit-setup'; 18 | 19 | const sections = []; 20 | sections.push(`# git-repo-parser sample export 21 | Repository: ${repoUrl} 22 | Generated: ${new Date().toISOString()} 23 | `); 24 | 25 | const { json, tokenCount: jsonTokens } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 26 | sections.push(`## JSON Export 27 | Token usage (cl100k_base): ${jsonTokens} 28 | Preview (first ${MAX_PREVIEW_LINES} lines): 29 | 30 | ${previewLines(json)} 31 | `); 32 | 33 | const { toon, tokenCount: toonTokens } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 34 | sections.push(`## TOON Export 35 | Token usage (cl100k_base): ${toonTokens} 36 | Preview (first ${MAX_PREVIEW_LINES} lines): 37 | 38 | ${previewLines(toon)} 39 | `); 40 | 41 | const { text, tokenCount: textTokens } = await scrapeRepositoryToPlainTextWithTokenCount(repoUrl); 42 | sections.push(`## Plain Text Export 43 | Token usage (cl100k_base): ${textTokens} 44 | Preview (first ${MAX_PREVIEW_LINES} lines): 45 | 46 | ${previewLines(text)} 47 | `); 48 | 49 | const body = sections.join('\n'); 50 | await fs.writeFile(OUTPUT_FILE, `${body.trimEnd()}\n`, { encoding: 'utf-8' }); 51 | console.log(`Output written to ${OUTPUT_FILE}`); 52 | } 53 | 54 | main().catch((error) => { 55 | console.error(error); 56 | process.exitCode = 1; 57 | }); -------------------------------------------------------------------------------- /benchmark/results.md: -------------------------------------------------------------------------------- 1 | # git-repo-parser Benchmark 2 | 3 | Generated: 2025-11-09T21:18:49.257Z 4 | 5 | ## octocat-hello-world 6 | 7 | Repository: https://github.com/octocat/Hello-World 8 | 9 | | Format | Duration | Token Count | Output Bytes | Extra | 10 | | --- | ---: | ---: | ---: | --- | 11 | | JSON | 936.95ms | 37 | 107 | files: 1 | 12 | | TOON | 880.82ms | 22 | 71 | | 13 | | Plain Text | 919.81ms | 15 | 52 | | 14 | 15 | ## octocat-spoon-knife 16 | 17 | Repository: 
https://github.com/octocat/Spoon-Knife 18 | 19 | | Format | Duration | Token Count | Output Bytes | Extra | 20 | | --- | ---: | ---: | ---: | --- | 21 | | JSON | 904.20ms | 523 | 1,759 | files: 3 | 22 | | TOON | 906.01ms | 460 | 1,589 | | 23 | | Plain Text | 895.69ms | 414 | 1,530 | | 24 | 25 | ## axios-axios 26 | 27 | Repository: https://github.com/axios/axios 28 | 29 | | Format | Duration | Token Count | Output Bytes | Extra | 30 | | --- | ---: | ---: | ---: | --- | 31 | | JSON | 3263.69ms | 261,071 | 922,326 | files: 30 | 32 | | TOON | 3151.31ms | 255,992 | 898,912 | | 33 | | Plain Text | 3269.70ms | 222,580 | 864,449 | | 34 | 35 | ## sindresorhus-slugify 36 | 37 | Repository: https://github.com/sindresorhus/slugify 38 | 39 | | Format | Duration | Token Count | Output Bytes | Extra | 40 | | --- | ---: | ---: | ---: | --- | 41 | | JSON | 941.90ms | 10,610 | 31,520 | files: 12 | 42 | | TOON | 924.59ms | 10,462 | 31,053 | | 43 | | Plain Text | 936.36ms | 8,599 | 28,697 | | 44 | 45 | ## lodash-lodash 46 | 47 | Repository: https://github.com/lodash/lodash 48 | 49 | | Format | Duration | Token Count | Output Bytes | Extra | 50 | | --- | ---: | ---: | ---: | --- | 51 | | JSON | 7290.95ms | 1,367,886 | 4,652,891 | files: 25 | 52 | | TOON | 6984.51ms | 1,365,925 | 4,643,896 | | 53 | | Plain Text | 7281.70ms | 1,184,440 | 4,469,746 | | 54 | 55 | ## octokit-request 56 | 57 | Repository: https://github.com/octokit/request.js 58 | 59 | | Format | Duration | Token Count | Output Bytes | Extra | 60 | | --- | ---: | ---: | ---: | --- | 61 | | JSON | 1493.24ms | 44,982 | 154,107 | files: 11 | 62 | | TOON | 1471.80ms | 44,256 | 151,406 | | 63 | | Plain Text | 1470.12ms | 38,016 | 143,952 | | 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 
.pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | 
.vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to git-repo-parser 2 | 3 | ## Table of Contents 4 | 5 | * [Introduction](#introduction) 6 | * [Getting Started](#getting-started) 7 | * [Development Process](#development-process) 8 | * [Code Contributions](#code-contributions) 9 | * [Reporting Issues](#reporting-issues) 10 | * [Open Source Community](open-source-community) 11 | * [Conduct](conduct) 12 | * [Contributor License Agreement](contributor-license-agreement) 13 | * [Documentation](documentation) 14 | 15 | ## Introduction 16 | 17 | Thank you for your interest in contributing to git-repo-parser! This document outlines the guidelines for contributing to the project. By adhering to these guidelines, you can help us maintain a high-quality and sustainable codebase. 18 | 19 | ## Getting Started 20 | 21 | Before you start contributing, please take the following steps: 22 | 23 | 1. **Fork the Repository:** Fork the git-repo-parser repository on GitHub. This will create a copy of the repository in your own GitHub account. 24 | 2. **Clone the Forked Repository:** Clone your forked repository to your local machine using the following command: 25 | 26 | ``` 27 | git clone https://github.com//css-modules-transformer.git 28 | ``` 29 | 30 | 3. 
**Set Up a Development Environment:** Set up a development environment on your local machine. This may involve installing necessary dependencies, configuring a code editor, and setting up a local development server. 31 | 32 | ## Development Process 33 | 34 | 1. **Create a Feature Branch:** When working on a new feature or fixing a bug, create a new feature branch from the `main` branch: 35 | 36 | ``` 37 | git checkout -b 38 | ``` 39 | 40 | 2. **Make Changes:** Make your changes to the codebase in your feature branch. Follow the coding conventions and best practices to ensure code quality. 41 | 42 | 3. **Commit Changes:** Commit your changes regularly using meaningful commit messages: 43 | 44 | ``` 45 | git add 46 | git commit -m "" 47 | ``` 48 | 49 | 4. **Push Changes:** Push your changes to your forked repository on GitHub: 50 | 51 | ``` 52 | git push origin 53 | ``` 54 | 55 | 5. **Create a Pull Request:** Create a pull request to merge your feature branch into the `main` branch of the original repository. Provide a clear and concise description of your changes in the pull request. 56 | 57 | 6. **Review and Feedback:** The project maintainers will review your pull request and provide feedback. Be prepared to address any feedback or suggestions to improve your contribution. 58 | 59 | ## Code Contributions 60 | 61 | When contributing code to git-repo-parser, please adhere to the following guidelines: 62 | 63 | * Use concise and descriptive variable and function names. 64 | * Follow the coding conventions and best practices used in the existing codebase. 65 | * Write well-commented code to help other contributors understand your changes. 66 | * Ensure that your code is tested and passes all the existing tests. 67 | * Keep your changes focused and avoid introducing unrelated changes. 68 | 69 | ## Reporting Issues 70 | 71 | If you encounter any bugs or issues with git-repo-parser, please report them using the GitHub issue tracker. 
When reporting an issue, please provide the following information: 72 | 73 | * A concise and descriptive title that summarizes the issue. 74 | * A detailed description of the issue, including steps to reproduce the issue and any relevant error messages or logs. 75 | * The version of git-repo-parser you are using. 76 | * If possible, provide a minimal reproducible example that demonstrates the issue. 77 | 78 | ## Open Source Community 79 | 80 | ### Conduct 81 | We are committed to fostering a welcoming and inclusive open-source community. We expect all contributors to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md) to create a respectful and collaborative environment. 82 | 83 | ### Contributor License Agreement 84 | By contributing to git-repo-parser, you agree to the terms of our [Contributor License Agreement (CLA)](CLA.md). The CLA ensures that we can use your contributions in accordance with the project's license. 85 | 86 | ### Documentation 87 | If you have any questions or need help using or contributing to git-repo-parser, don't hesitate to ask for help on the project's GitHub page or through the project's communication channels (e.g., Discord, Slack, etc.). Additionally, our [documentation](DOCUMENTATION.md) provides comprehensive information about using and contributing to the project. 
88 | -------------------------------------------------------------------------------- /src/clitext.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { promises as fs } from 'fs'; 4 | import { 5 | scrapeRepositoryToJsonWithTokenCount, 6 | scrapeRepositoryToToonWithTokenCount, 7 | scrapeRepositoryToPlainTextWithTokenCount, 8 | type TranscriptFormatOptions 9 | } from './scraper'; 10 | 11 | type OutputFormat = 'json' | 'toon' | 'transcript'; 12 | 13 | interface ParsedOptions { 14 | repoUrl?: string; 15 | format: OutputFormat; 16 | includeMeta: boolean; 17 | showTokenCount: boolean; 18 | } 19 | 20 | function parseArgs(args: string[]): ParsedOptions { 21 | let repoUrl: string | undefined; 22 | let format: OutputFormat = 'transcript'; 23 | let includeMeta = false; 24 | let showTokenCount = false; 25 | 26 | for (let i = 0; i < args.length; i += 1) { 27 | const arg = args[i]; 28 | 29 | if (arg === '--format') { 30 | const value = args[i + 1]; 31 | if (!value) { 32 | throw new Error('Expected a value after --format'); 33 | } 34 | format = parseFormat(value); 35 | i += 1; 36 | continue; 37 | } 38 | 39 | if (arg.startsWith('--format=')) { 40 | const value = arg.split('=', 2)[1] ?? 
''; 41 | format = parseFormat(value); 42 | continue; 43 | } 44 | 45 | if (arg === '--meta') { 46 | includeMeta = true; 47 | continue; 48 | } 49 | 50 | if (arg === '--no-meta') { 51 | includeMeta = false; 52 | continue; 53 | } 54 | 55 | if (arg === '--tokens' || arg === '--token-count' || arg === '--token' || arg === '-t') { 56 | showTokenCount = true; 57 | continue; 58 | } 59 | 60 | if (arg.startsWith('-')) { 61 | console.warn(`Ignoring unrecognised flag: ${arg}`); 62 | continue; 63 | } 64 | 65 | if (!repoUrl) { 66 | repoUrl = arg; 67 | } else { 68 | console.warn(`Ignoring unexpected argument: ${arg}`); 69 | } 70 | } 71 | 72 | return { repoUrl, format, includeMeta, showTokenCount }; 73 | } 74 | 75 | function parseFormat(value: string): OutputFormat { 76 | const normalised = value.trim().toLowerCase(); 77 | if (normalised === 'json' || normalised === 'toon' || normalised === 'transcript') { 78 | return normalised; 79 | } 80 | throw new Error(`Unsupported format "${value}". Expected one of: json, toon, transcript.`); 81 | } 82 | 83 | async function main() { 84 | let options: ParsedOptions; 85 | try { 86 | options = parseArgs(process.argv.slice(2)); 87 | } catch (error) { 88 | console.error(error instanceof Error ? 
error.message : error); 89 | process.exit(1); 90 | return; 91 | } 92 | 93 | const { repoUrl, format, includeMeta, showTokenCount } = options; 94 | 95 | if (!repoUrl) { 96 | console.error('Please provide a GitHub repository URL.'); 97 | process.exit(1); 98 | return; 99 | } 100 | 101 | switch (format) { 102 | case 'json': 103 | await handleJson(repoUrl, showTokenCount); 104 | break; 105 | case 'toon': 106 | await handleToon(repoUrl, showTokenCount); 107 | break; 108 | case 'transcript': 109 | default: 110 | await handleTranscript(repoUrl, includeMeta, showTokenCount); 111 | break; 112 | } 113 | } 114 | 115 | async function handleJson(repoUrl: string, showTokenCount: boolean) { 116 | const { json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 117 | await fs.writeFile('files.json', `${json}\n`, { encoding: 'utf-8' }); 118 | console.log('File list has been saved to files.json'); 119 | if (showTokenCount) { 120 | console.log(`Token count (cl100k_base): ${tokenCount}`); 121 | } 122 | } 123 | 124 | async function handleToon(repoUrl: string, showTokenCount: boolean) { 125 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 126 | await fs.writeFile('files.toon', `${toon}\n`, { encoding: 'utf-8' }); 127 | console.log('File list has been saved to files.toon'); 128 | if (showTokenCount) { 129 | console.log(`Token count (cl100k_base): ${tokenCount}`); 130 | } 131 | } 132 | 133 | async function handleTranscript(repoUrl: string, includeMeta: boolean, showTokenCount: boolean) { 134 | const transcriptOptions: TranscriptFormatOptions = { includeMeta }; 135 | const { text, tokenCount } = await scrapeRepositoryToPlainTextWithTokenCount( 136 | repoUrl, 137 | undefined, 138 | transcriptOptions 139 | ); 140 | 141 | await fs.writeFile('files.txt', text, { encoding: 'utf-8' }); 142 | console.log('RepoScript transcript has been saved to files.txt'); 143 | 144 | if (showTokenCount) { 145 | console.log(`Token count (cl100k_base): 
${tokenCount}`); 146 | } 147 | } 148 | 149 | main().catch(err => { 150 | console.error(err); 151 | process.exitCode = 1; 152 | }); -------------------------------------------------------------------------------- /benchmark/results.json: -------------------------------------------------------------------------------- 1 | { 2 | "generatedAt": "2025-11-09T21:18:49.257Z", 3 | "repositories": [ 4 | { 5 | "name": "octocat-hello-world", 6 | "url": "https://github.com/octocat/Hello-World", 7 | "results": [ 8 | { 9 | "format": "json", 10 | "formatLabel": "JSON", 11 | "durationMs": 936.953458, 12 | "tokenCount": 37, 13 | "outputBytes": 107, 14 | "extra": "files: 1" 15 | }, 16 | { 17 | "format": "toon", 18 | "formatLabel": "TOON", 19 | "durationMs": 880.8243749999999, 20 | "tokenCount": 22, 21 | "outputBytes": 71 22 | }, 23 | { 24 | "format": "text", 25 | "formatLabel": "Plain Text", 26 | "durationMs": 919.8080830000001, 27 | "tokenCount": 15, 28 | "outputBytes": 52 29 | } 30 | ], 31 | "error": null 32 | }, 33 | { 34 | "name": "octocat-spoon-knife", 35 | "url": "https://github.com/octocat/Spoon-Knife", 36 | "results": [ 37 | { 38 | "format": "json", 39 | "formatLabel": "JSON", 40 | "durationMs": 904.1952919999999, 41 | "tokenCount": 523, 42 | "outputBytes": 1759, 43 | "extra": "files: 3" 44 | }, 45 | { 46 | "format": "toon", 47 | "formatLabel": "TOON", 48 | "durationMs": 906.0097909999995, 49 | "tokenCount": 460, 50 | "outputBytes": 1589 51 | }, 52 | { 53 | "format": "text", 54 | "formatLabel": "Plain Text", 55 | "durationMs": 895.6935830000002, 56 | "tokenCount": 414, 57 | "outputBytes": 1530 58 | } 59 | ], 60 | "error": null 61 | }, 62 | { 63 | "name": "axios-axios", 64 | "url": "https://github.com/axios/axios", 65 | "results": [ 66 | { 67 | "format": "json", 68 | "formatLabel": "JSON", 69 | "durationMs": 3263.6907499999998, 70 | "tokenCount": 261071, 71 | "outputBytes": 922326, 72 | "extra": "files: 30" 73 | }, 74 | { 75 | "format": "toon", 76 | "formatLabel": "TOON", 77 | 
"durationMs": 3151.3070829999997, 78 | "tokenCount": 255992, 79 | "outputBytes": 898912 80 | }, 81 | { 82 | "format": "text", 83 | "formatLabel": "Plain Text", 84 | "durationMs": 3269.705, 85 | "tokenCount": 222580, 86 | "outputBytes": 864449 87 | } 88 | ], 89 | "error": null 90 | }, 91 | { 92 | "name": "sindresorhus-slugify", 93 | "url": "https://github.com/sindresorhus/slugify", 94 | "results": [ 95 | { 96 | "format": "json", 97 | "formatLabel": "JSON", 98 | "durationMs": 941.9017920000006, 99 | "tokenCount": 10610, 100 | "outputBytes": 31520, 101 | "extra": "files: 12" 102 | }, 103 | { 104 | "format": "toon", 105 | "formatLabel": "TOON", 106 | "durationMs": 924.5914580000008, 107 | "tokenCount": 10462, 108 | "outputBytes": 31053 109 | }, 110 | { 111 | "format": "text", 112 | "formatLabel": "Plain Text", 113 | "durationMs": 936.3613750000004, 114 | "tokenCount": 8599, 115 | "outputBytes": 28697 116 | } 117 | ], 118 | "error": null 119 | }, 120 | { 121 | "name": "lodash-lodash", 122 | "url": "https://github.com/lodash/lodash", 123 | "results": [ 124 | { 125 | "format": "json", 126 | "formatLabel": "JSON", 127 | "durationMs": 7290.946833000002, 128 | "tokenCount": 1367886, 129 | "outputBytes": 4652891, 130 | "extra": "files: 25" 131 | }, 132 | { 133 | "format": "toon", 134 | "formatLabel": "TOON", 135 | "durationMs": 6984.508209, 136 | "tokenCount": 1365925, 137 | "outputBytes": 4643896 138 | }, 139 | { 140 | "format": "text", 141 | "formatLabel": "Plain Text", 142 | "durationMs": 7281.696166999998, 143 | "tokenCount": 1184440, 144 | "outputBytes": 4469746 145 | } 146 | ], 147 | "error": null 148 | }, 149 | { 150 | "name": "octokit-request", 151 | "url": "https://github.com/octokit/request.js", 152 | "results": [ 153 | { 154 | "format": "json", 155 | "formatLabel": "JSON", 156 | "durationMs": 1493.2384579999998, 157 | "tokenCount": 44982, 158 | "outputBytes": 154107, 159 | "extra": "files: 11" 160 | }, 161 | { 162 | "format": "toon", 163 | "formatLabel": "TOON", 
164 | "durationMs": 1471.8035000000018, 165 | "tokenCount": 44256, 166 | "outputBytes": 151406 167 | }, 168 | { 169 | "format": "text", 170 | "formatLabel": "Plain Text", 171 | "durationMs": 1470.116833, 172 | "tokenCount": 38016, 173 | "outputBytes": 143952 174 | } 175 | ], 176 | "error": null 177 | } 178 | ] 179 | } 180 | -------------------------------------------------------------------------------- /benchmark/benchmark.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs/promises'); 4 | const path = require('path'); 5 | const { performance } = require('perf_hooks'); 6 | const { 7 | scrapeRepositoryToJsonWithTokenCount, 8 | scrapeRepositoryToToonWithTokenCount, 9 | scrapeRepositoryToPlainTextWithTokenCount, 10 | } = require('../dist'); 11 | 12 | const REPOSITORIES = [ 13 | { name: 'octocat-hello-world', url: 'https://github.com/octocat/Hello-World' }, 14 | { name: 'octocat-spoon-knife', url: 'https://github.com/octocat/Spoon-Knife' }, 15 | { name: 'axios-axios', url: 'https://github.com/axios/axios' }, 16 | { name: 'sindresorhus-slugify', url: 'https://github.com/sindresorhus/slugify' }, 17 | { name: 'lodash-lodash', url: 'https://github.com/lodash/lodash' }, 18 | { name: 'octokit-request', url: 'https://github.com/octokit/request.js' }, 19 | ]; 20 | 21 | const FORMATS = [ 22 | { 23 | key: 'json', 24 | label: 'JSON', 25 | run: async (url) => { 26 | const { files, json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(url); 27 | return { 28 | tokenCount, 29 | output: json, 30 | meta: { 31 | fileCount: files.length, 32 | }, 33 | }; 34 | }, 35 | }, 36 | { 37 | key: 'toon', 38 | label: 'TOON', 39 | run: async (url) => { 40 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(url); 41 | return { 42 | tokenCount, 43 | output: toon, 44 | }; 45 | }, 46 | }, 47 | { 48 | key: 'text', 49 | label: 'Plain Text', 50 | run: async (url) => { 51 | const { text, 
// Render a millisecond duration with two decimal places, e.g. "12.34ms".
function formatDuration(ms) {
  return `${ms.toFixed(2)}ms`;
}

// Build the markdown benchmark report for a summary object of the shape
// { generatedAt, repositories: [{ name, url, results, error }] }.
// Output always ends with exactly one trailing newline.
function renderMarkdown(summary) {
  const out = ['# git-repo-parser Benchmark', '', `Generated: ${summary.generatedAt}`, ''];

  summary.repositories.forEach((repo) => {
    out.push(
      `## ${repo.name}`,
      '',
      `Repository: ${repo.url}`,
      '',
      '| Format | Duration | Token Count | Output Bytes | Extra |',
      '| --- | ---: | ---: | ---: | --- |',
    );

    repo.results.forEach((row) => {
      const cells = [
        row.formatLabel,
        formatDuration(row.durationMs),
        row.tokenCount.toLocaleString(),
        row.outputBytes.toLocaleString(),
        row.extra ?? '',
      ];
      out.push(`| ${cells.join(' | ')} |`);
    });

    if (repo.error) {
      out.push('', `⚠️ Error: ${repo.error}`);
    }

    out.push('');
  });

  return `${out.join('\n').trimEnd()}\n`;
}
126 | durationMs, 127 | tokenCount: output.tokenCount, 128 | outputBytes, 129 | extra: output.meta?.fileCount ? `files: ${output.meta.fileCount}` : undefined, 130 | }); 131 | 132 | const previewPath = path.join(outDir, `${repo.name}.${format.key}.preview.txt`); 133 | const previewLines = output.output.split('\n').slice(0, 100).join('\n'); 134 | await fs.writeFile( 135 | previewPath, 136 | `${previewLines}${previewLines.endsWith('\n') ? '' : '\n'}`, 137 | { encoding: 'utf-8' }, 138 | ); 139 | } catch (error) { 140 | const durationMs = performance.now() - start; 141 | repoSummary.results.push({ 142 | format: format.key, 143 | formatLabel: format.label, 144 | durationMs, 145 | tokenCount: 0, 146 | outputBytes: 0, 147 | extra: `error`, 148 | }); 149 | repoSummary.error = error.stack ?? String(error); 150 | console.error(` Failed on ${format.label}:`, error); 151 | break; 152 | } 153 | } 154 | 155 | summary.repositories.push(repoSummary); 156 | } 157 | 158 | const jsonPath = path.join(outDir, 'results.json'); 159 | const mdPath = path.join(outDir, 'results.md'); 160 | 161 | await fs.writeFile(jsonPath, `${JSON.stringify(summary, null, 2)}\n`, { encoding: 'utf-8' }); 162 | await fs.writeFile(mdPath, renderMarkdown(summary), { encoding: 'utf-8' }); 163 | 164 | console.log(`Benchmark complete. 
Results saved to:`); 165 | console.log(` - ${jsonPath}`); 166 | console.log(` - ${mdPath}`); 167 | console.log(` - ${outDir}/*.preview.txt`); 168 | } 169 | 170 | main().catch((error) => { 171 | console.error(error); 172 | process.exitCode = 1; 173 | }); 174 | 175 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # git-repo-parser 3 | 4 | A powerful tool to scrape all files from a GitHub repository and convert them into JSON, Token-Oriented Object Notation (TOON), or RepoScript (an LLM-first transcript format). 
5 | 6 | ## Installation 7 | 8 | Install the package globally using npm: 9 | 10 | ```bash 11 | npm install -g git-repo-parser 12 | ``` 13 | 14 | Or add it to your project as a dependency: 15 | 16 | ```bash 17 | npm install git-repo-parser 18 | ``` 19 | 20 | ## Usage 21 | 22 | ### Command Line Interface (CLI) 23 | 24 | This package provides three CLI commands: 25 | 26 | 1. `git-repo-to-json`: Scrapes a GitHub repository and saves the result as a JSON file. 27 | 2. `git-repo-to-toon`: Scrapes a GitHub repository and saves the result as a TOON file. 28 | 3. `git-repo-to-text`: Scrapes a GitHub repository and saves the result as a RepoScript transcript (formerly the “plain text” output). 29 | 30 | #### Example usage: 31 | 32 | ```bash 33 | # JSON and TOON exports (existing behaviour) 34 | git-repo-to-json https://github.com/username/repo-name.git 35 | git-repo-to-toon https://github.com/username/repo-name.git 36 | 37 | # RepoScript transcript without metadata (legacy plain-text behaviour) 38 | git-repo-to-text https://github.com/username/repo-name.git --format=transcript 39 | 40 | # RepoScript transcript with metadata lines and token count 41 | git-repo-to-text https://github.com/username/repo-name.git --format=transcript --meta --tokens 42 | 43 | # Alternate syntaxes 44 | git-repo-to-text https://github.com/username/repo-name.git --format=json 45 | git-repo-to-text https://github.com/username/repo-name.git --format=toon 46 | ``` 47 | 48 | The scraped data will be saved as `files.json`, `files.toon`, or `files.txt` in your current directory. When `--tokens` (or `--token`, `--token-count`, `-t`) is supplied, the CLI also prints the token count using the [CL100K vocabulary](https://github.com/openai/openai-openapi/blob/master/specification.md) for **any** export format. Use `--meta` / `--no-meta` to toggle RepoScript metadata lines (default is no metadata). 
49 | 50 | ### Benchmark Suite 51 | 52 | Run the bundled benchmark to evaluate scrape runtime and token usage across multiple public repositories: 53 | 54 | ```bash 55 | npm run build 56 | npm run benchmark 57 | ``` 58 | 59 | Results are saved under `benchmark/`: 60 | 61 | - `benchmark/results.json` – machine-readable summary (durations, token counts, output sizes) 62 | - `benchmark/results.md` – markdown report per repository/format 63 | - `benchmark/*.preview.txt` – first 100 lines of each export for spot-checking 64 | 65 | ### Programmatic Usage 66 | 67 | You can also use the package in your Node.js projects: 68 | 69 | ```javascript 70 | import { 71 | scrapeRepositoryToJson, 72 | scrapeRepositoryToToon, 73 | scrapeRepositoryToTranscript, 74 | scrapeRepositoryToJsonWithTokenCount, 75 | scrapeRepositoryToToonWithTokenCount, 76 | scrapeRepositoryToPlainTextWithTokenCount, 77 | type TranscriptFormatOptions, 78 | countTokens, 79 | } from 'git-repo-parser'; 80 | 81 | const repoUrl = 'https://github.com/username/repo-name.git'; 82 | 83 | // JSON output 84 | const jsonResult = await scrapeRepositoryToJson(repoUrl); 85 | 86 | // TOON output 87 | const toonResult = await scrapeRepositoryToToon(repoUrl); 88 | 89 | // RepoScript transcript (no metadata; equivalent to legacy plain text) 90 | const transcript = await scrapeRepositoryToTranscript(repoUrl); 91 | 92 | // RepoScript with metadata lines 93 | const transcriptOptions: TranscriptFormatOptions = { includeMeta: true }; 94 | const richTranscript = await scrapeRepositoryToTranscript(repoUrl, transcriptOptions); 95 | 96 | // Token-aware helpers 97 | const { json, tokenCount: jsonTokens } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 98 | const { toon, tokenCount: toonTokens } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 99 | const { text, tokenCount: transcriptTokens } = await scrapeRepositoryToPlainTextWithTokenCount( 100 | repoUrl, 101 | undefined, 102 | transcriptOptions 103 | ); 104 | 105 | // 
Standalone token counting helper (uses gpt-tokenizer + cl100k_base) 106 | const tokens = countTokens(toon); 107 | ``` 108 | 109 | ## API 110 | 111 | ### `scrapeRepositoryToJson(repoUrl: string): Promise<FileData[]>` 112 | 113 | Scrapes the given GitHub repository and returns a promise that resolves to an array of `FileData` objects. 114 | 115 | ### `scrapeRepositoryToJsonWithTokenCount(repoUrl: string, indent = 2, tokenOptions?: TokenCountOptions): Promise<{ files: FileData[]; json: string; tokenCount: number }>` 116 | 117 | Scrapes the repository, returns the raw `FileData[]`, a pretty-printed JSON string, and the corresponding CL100K token consumption. 118 | 119 | ### `scrapeRepositoryToToon(repoUrl: string, options?: EncodeOptions): Promise<string>` 120 | 121 | Scrapes the given GitHub repository and returns a promise that resolves to a TOON-formatted string. You can pass [`EncodeOptions`](https://github.com/toon-format/toon#encoding) directly to customise indentation, delimiter, or length markers. 122 | 123 | ### `scrapeRepositoryToToonWithTokenCount(repoUrl: string, encodeOptions?: EncodeOptions, tokenOptions?: TokenCountOptions): Promise<{ toon: string; tokenCount: number }>` 124 | 125 | Generates the TOON-formatted output and returns both the encoded string and its token count as measured by [gpt-tokenizer](https://www.npmjs.com/package/gpt-tokenizer) using the default CL100K vocabulary. 126 | 127 | ### `scrapeRepositoryToTranscript(repoUrl: string, options?: TranscriptFormatOptions): Promise<string>` 128 | 129 | Scrapes the given GitHub repository and returns a RepoScript v1 transcript string. `TranscriptFormatOptions` currently supports `{ includeMeta?: boolean }` (default: `false`). The legacy `scrapeRepositoryToPlainText` export delegates to this helper with metadata disabled. 
130 | 131 | ### `scrapeRepositoryToPlainTextWithTokenCount(repoUrl: string, tokenOptions?: TokenCountOptions, transcriptOptions?: TranscriptFormatOptions): Promise<{ text: string; tokenCount: number }>` 132 | 133 | Scrapes the repository to RepoScript while reporting the token footprint of the generated transcript. Supply `transcriptOptions` to mirror CLI behaviour (e.g. `{ includeMeta: true }`). 134 | 135 | ## RepoScript v1 Format 136 | 137 | RepoScript is a deterministic, LLM-friendly transcript of a repository (formerly the “plain text” output). 138 | 139 | - **Deterministic ordering** 140 | - Directories are emitted in lexical order of their full POSIX paths. 141 | - Within a directory, files are listed in lexical order by filename. 142 | - **Marker grammar** 143 | - Markers always begin at column 0 and follow `[TAG] ` (single space). 144 | - Tags in use: `[DIR_START]`, `[DIR_END]`, `[FILE_START]`, `[FILE_END]`. 145 | - Paths are POSIX (e.g. `src/index.ts`) and never contain newlines. 146 | - **Optional metadata** 147 | - When `includeMeta` is enabled, files receive lines like `meta: lang=ts size=1234`. 148 | - Metadata lines appear immediately after `[FILE_START] ` and before file contents. 149 | - **Reserved tags** 150 | - `[COMMENT]`, `[CHUNK]`, and `[META]` are reserved for future use and MUST NOT appear unless escaped or emitted intentionally once semantics are defined. 151 | 152 | ### Sample Transcript 153 | 154 | ```text 155 | REPOSCRIPT version=1 156 | repo: https://github.com/user/project 157 | commit: abc123 158 | 159 | [FILE_START] src/index.ts 160 | meta: lang=ts size=123 161 | import { foo } from './foo'; 162 | 163 | [FILE_END] src/index.ts 164 | ``` 165 | 166 | > Note: The current CLI/API emit the `[..._START]` / `[..._END]` markers (with optional metadata). The header lines shown above are illustrative and may be added via tooling or future options. 
167 | 168 | ## FileData Interface 169 | 170 | The `FileData` interface represents the structure of files and directories in the JSON output: 171 | 172 | ```typescript 173 | interface FileData { 174 | name: string; 175 | path: string; 176 | type: 'file' | 'directory'; 177 | children?: FileData[]; 178 | content?: string; 179 | } 180 | ``` 181 | 182 | ## Features 183 | 184 | - Clones the repository locally (temporary) 185 | - Ignores binary files and common non-source files 186 | - Supports nested directory structures 187 | - Provides JSON, TOON, and RepoScript (plain-text) output formats 188 | - Cleans up cloned repository after scraping 189 | 190 | ## Ignored Files 191 | 192 | The following file types and patterns are ignored during scraping: 193 | 194 | - package-lock.json 195 | - Binary files (pdf, png, jpg, jpeg, gif, ico, svg, woff, woff2, eot, ttf, otf) 196 | - Media files (mp4, avi, webm, mov, mp3, wav, flac, ogg, webp) 197 | - Debug and error logs (npm-debug, yarn-debug, yarn-error) 198 | - Configuration files (tsconfig, jest.config) 199 | - The `.git` directory 200 | 201 | ## License 202 | 203 | This project is licensed under the MIT License. 204 | 205 | ## Author 206 | 207 | arnab2001 208 | 209 | ## Contributing 210 | 211 | Contributions, issues, and feature requests are welcome. Feel free to check the [issues page](https://github.com/arnab2001/git-repo-parser/issues) if you want to contribute. 212 | Also check the [Contribution Guide](CONTRIBUTING.md). 213 | Open Source Community 214 | Conduct 215 | 216 | We are committed to fostering a welcoming and inclusive open-source community. We expect all contributors to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md) to create a respectful and collaborative environment. 217 | ## Show your support 218 | 219 | Give a ⭐️ if this project helped you! 
220 | ``` 221 | -------------------------------------------------------------------------------- /src/scraper.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import type { Dirent } from 'fs'; 3 | import * as path from 'path'; 4 | import * as os from 'os'; 5 | import simpleGit from 'simple-git'; 6 | import { encode, type EncodeOptions } from '@toon-format/toon'; 7 | import { countTokens, type TokenCountOptions } from './tokenCounter'; 8 | 9 | export interface FileData { 10 | name: string; 11 | path: string; 12 | type: 'file' | 'directory'; 13 | children?: FileData[]; 14 | content?: string; 15 | } 16 | 17 | async function cloneRepository(repoUrl: string, clonePath: string) { 18 | const git = simpleGit(); 19 | await git.clone(repoUrl, clonePath); 20 | console.log(`Repository cloned to ${clonePath}`); 21 | } 22 | 23 | const MAX_CONCURRENCY = 10; 24 | const IGNORED_SEGMENTS = new Set(['.git']); 25 | 26 | const EXTENSION_LANG_MAP: Record = { 27 | '.ts': 'ts', 28 | '.tsx': 'tsx', 29 | '.js': 'js', 30 | '.jsx': 'jsx', 31 | '.mjs': 'js', 32 | '.cjs': 'js', 33 | '.json': 'json', 34 | '.md': 'md', 35 | '.py': 'py', 36 | '.rb': 'rb', 37 | '.go': 'go', 38 | '.rs': 'rs', 39 | '.java': 'java', 40 | '.kt': 'kt', 41 | '.kts': 'kt', 42 | '.swift': 'swift', 43 | '.c': 'c', 44 | '.h': 'c', 45 | '.cpp': 'cpp', 46 | '.hpp': 'cpp', 47 | '.cc': 'cpp', 48 | '.hh': 'cpp', 49 | '.cs': 'cs', 50 | '.php': 'php', 51 | '.sh': 'sh', 52 | '.bash': 'sh', 53 | '.zsh': 'sh', 54 | '.yaml': 'yaml', 55 | '.yml': 'yaml', 56 | '.toml': 'toml', 57 | '.ini': 'ini', 58 | '.cfg': 'ini', 59 | '.txt': 'txt', 60 | '.css': 'css', 61 | '.scss': 'scss', 62 | '.sass': 'sass', 63 | '.less': 'less', 64 | '.vue': 'vue', 65 | '.svelte': 'svelte' 66 | }; 67 | 68 | export interface TranscriptFormatOptions { 69 | includeMeta?: boolean; 70 | } 71 | 72 | function sanitiseRepoLabel(repoUrl: string): string { 73 | let candidate = repoUrl.trim(); 
74 | 75 | try { 76 | const parsed = new URL(repoUrl); 77 | candidate = parsed.pathname.split('/').pop() ?? ''; 78 | } catch { 79 | const segments = candidate.split('/'); 80 | candidate = segments[segments.length - 1] ?? ''; 81 | } 82 | 83 | candidate = candidate.replace(/\.git$/i, ''); 84 | const sanitised = candidate.toLowerCase().replace(/[^a-z0-9._-]+/g, '-').replace(/^-+|-+$/g, ''); 85 | const truncated = sanitised.slice(0, 64); 86 | return truncated || 'repository'; 87 | } 88 | 89 | async function prepareCloneWorkspace(repoUrl: string) { 90 | const repoLabel = sanitiseRepoLabel(repoUrl); 91 | const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'git-repo-parser-')); 92 | const clonePath = path.join(tempRoot, repoLabel); 93 | 94 | async function cleanup() { 95 | try { 96 | await fs.rm(tempRoot, { recursive: true, force: true }); 97 | } catch (error) { 98 | console.warn(`Failed to clean temporary directory ${tempRoot}:`, error); 99 | } 100 | } 101 | 102 | return { clonePath, cleanup }; 103 | } 104 | 105 | function toPosixPath(filePath: string): string { 106 | return filePath.split(path.sep).join('/'); 107 | } 108 | 109 | function shouldIgnorePath(relativePath: string, ignoreSegments: Set): boolean { 110 | const segments = relativePath.split(path.sep).filter(Boolean); 111 | return segments.some(segment => ignoreSegments.has(segment)); 112 | } 113 | 114 | async function mapWithConcurrency( 115 | items: T[], 116 | limit: number, 117 | mapper: (item: T, index: number) => Promise 118 | ): Promise { 119 | const results: R[] = new Array(items.length); 120 | let nextIndex = 0; 121 | 122 | async function worker() { 123 | while (true) { 124 | const currentIndex = nextIndex++; 125 | if (currentIndex >= items.length) { 126 | break; 127 | } 128 | results[currentIndex] = await mapper(items[currentIndex], currentIndex); 129 | } 130 | } 131 | 132 | const workerCount = Math.min(limit, items.length); 133 | await Promise.all(Array.from({ length: workerCount }, () => 
worker())); 134 | return results; 135 | } 136 | 137 | function shouldIgnoreFile(fileName: string): boolean { 138 | const lowerCaseFileName = fileName.toLowerCase(); 139 | return ( 140 | lowerCaseFileName === 'package-lock.json' || 141 | lowerCaseFileName.endsWith('.pdf') || 142 | lowerCaseFileName.endsWith('.png') || 143 | lowerCaseFileName.endsWith('.jpg') || 144 | lowerCaseFileName.endsWith('.jpeg') || 145 | lowerCaseFileName.endsWith('.gif') || 146 | lowerCaseFileName.endsWith('.ico') || 147 | lowerCaseFileName.endsWith('.svg') || 148 | lowerCaseFileName.endsWith('.woff') || 149 | lowerCaseFileName.endsWith('.woff2') || 150 | lowerCaseFileName.endsWith('.eot') || 151 | lowerCaseFileName.endsWith('.ttf') || 152 | lowerCaseFileName.endsWith('.otf') || 153 | lowerCaseFileName.endsWith('.mp4') || 154 | lowerCaseFileName.endsWith('.avi') || 155 | lowerCaseFileName.endsWith('.webm') || 156 | lowerCaseFileName.endsWith('.mov') || 157 | lowerCaseFileName.endsWith('.mp3') || 158 | lowerCaseFileName.endsWith('.wav') || 159 | lowerCaseFileName.endsWith('.flac') || 160 | lowerCaseFileName.endsWith('.ogg') || 161 | lowerCaseFileName.endsWith('.webp') || 162 | lowerCaseFileName.startsWith('package-lock') || 163 | lowerCaseFileName.startsWith('yarn-lock') || 164 | lowerCaseFileName.startsWith('npm-debug') || 165 | lowerCaseFileName.startsWith('yarn-debug') || 166 | lowerCaseFileName.startsWith('yarn-error') || 167 | lowerCaseFileName.startsWith('tsconfig') || 168 | lowerCaseFileName.startsWith('jest.config') 169 | 170 | // Add more extensions as needed 171 | ); 172 | } 173 | 174 | async function scrapeDirectoryToJson( 175 | dir: string, 176 | baseDir: string, 177 | ignoreSegments: Set 178 | ): Promise { 179 | const entries = await fs.readdir(dir, { withFileTypes: true }); 180 | 181 | const processed = await mapWithConcurrency(entries, MAX_CONCURRENCY, async (entry) => { 182 | const entryPath = path.join(dir, entry.name); 183 | const relativePath = path.relative(baseDir, 
entryPath); 184 | 185 | if (!relativePath) { 186 | return null; 187 | } 188 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 189 | return null; 190 | } 191 | 192 | try { 193 | if (entry.isDirectory()) { 194 | const children = await scrapeDirectoryToJson(entryPath, baseDir, ignoreSegments); 195 | return { 196 | name: entry.name, 197 | path: toPosixPath(relativePath), 198 | type: 'directory' as const, 199 | children 200 | }; 201 | } 202 | 203 | if (entry.isFile()) { 204 | const content = await fs.readFile(entryPath, { encoding: 'utf-8' }); 205 | return { 206 | name: entry.name, 207 | path: toPosixPath(relativePath), 208 | type: 'file' as const, 209 | content 210 | }; 211 | } 212 | } catch (error) { 213 | console.warn(`Skipping ${entryPath} due to error:`, error); 214 | } 215 | 216 | return null; 217 | }); 218 | 219 | const filtered = processed.filter((item): item is NonNullable => item !== null); 220 | return filtered as FileData[]; 221 | } 222 | 223 | function detectLanguage(fileName: string): string | undefined { 224 | const ext = path.extname(fileName).toLowerCase(); 225 | return EXTENSION_LANG_MAP[ext]; 226 | } 227 | 228 | function sortDirEntries(entries: Dirent[]): { directories: Dirent[]; files: Dirent[] } { 229 | const directories = entries.filter(entry => entry.isDirectory()).sort((a, b) => a.name.localeCompare(b.name)); 230 | const files = entries.filter(entry => entry.isFile()).sort((a, b) => a.name.localeCompare(b.name)); 231 | return { directories, files }; 232 | } 233 | 234 | function createMetadataLine(fileName: string, content: string): string { 235 | const size = Buffer.byteLength(content, 'utf-8'); 236 | const metadata: string[] = [`size=${size}`]; 237 | const lang = detectLanguage(fileName); 238 | if (lang) { 239 | metadata.unshift(`lang=${lang}`); 240 | } 241 | return `meta: ${metadata.join(' ')}`; 242 | } 243 | 244 | async function generateTranscript( 245 | dir: string, 246 | baseDir: string, 247 | 
ignoreSegments: Set, 248 | options: TranscriptFormatOptions, 249 | prefix = '' 250 | ): Promise { 251 | let result = ''; 252 | 253 | const entries = await fs.readdir(dir, { withFileTypes: true }); 254 | 255 | const { directories, files } = sortDirEntries(entries); 256 | 257 | for (const entry of directories) { 258 | const filePath = path.join(dir, entry.name); 259 | const relativePath = path.relative(baseDir, filePath); 260 | 261 | if (!relativePath) { 262 | continue; 263 | } 264 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 265 | continue; 266 | } 267 | 268 | const displayPath = toPosixPath(path.join(prefix, entry.name)); 269 | 270 | try { 271 | result += `[DIR_START] ${displayPath}\n`; 272 | result += await generateTranscript( 273 | filePath, 274 | baseDir, 275 | ignoreSegments, 276 | options, 277 | path.join(prefix, entry.name) 278 | ); 279 | result += `[DIR_END] ${displayPath}\n\n`; 280 | } catch (error) { 281 | console.warn(`Skipping ${filePath} due to error:`, error); 282 | } 283 | } 284 | 285 | for (const entry of files) { 286 | const filePath = path.join(dir, entry.name); 287 | const relativePath = path.relative(baseDir, filePath); 288 | 289 | if (!relativePath) { 290 | continue; 291 | } 292 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 293 | continue; 294 | } 295 | 296 | const displayPath = toPosixPath(path.join(prefix, entry.name)); 297 | 298 | try { 299 | const content = await fs.readFile(filePath, { encoding: 'utf-8' }); 300 | result += `[FILE_START] ${displayPath}\n`; 301 | 302 | if (options.includeMeta) { 303 | result += `${createMetadataLine(entry.name, content)}\n`; 304 | } 305 | 306 | result += content; 307 | if (!content.endsWith('\n')) { 308 | result += '\n'; 309 | } 310 | result += `[FILE_END] ${displayPath}\n\n`; 311 | } catch (error) { 312 | console.warn(`Skipping ${filePath} due to error:`, error); 313 | } 314 | } 315 | 316 | return result; 317 | } 318 | 319 | 
export async function scrapeRepositoryToJson(repoUrl: string): Promise { 320 | const { clonePath, cleanup } = await prepareCloneWorkspace(repoUrl); 321 | 322 | try { 323 | await cloneRepository(repoUrl, clonePath); 324 | return await scrapeDirectoryToJson(clonePath, clonePath, IGNORED_SEGMENTS); 325 | } finally { 326 | await cleanup(); 327 | } 328 | } 329 | 330 | export async function scrapeRepositoryToPlainText(repoUrl: string): Promise { 331 | return scrapeRepositoryToTranscript(repoUrl); 332 | } 333 | 334 | export async function scrapeRepositoryToTranscript( 335 | repoUrl: string, 336 | options: TranscriptFormatOptions = {} 337 | ): Promise { 338 | const { clonePath, cleanup } = await prepareCloneWorkspace(repoUrl); 339 | 340 | try { 341 | await cloneRepository(repoUrl, clonePath); 342 | return await generateTranscript(clonePath, clonePath, IGNORED_SEGMENTS, { 343 | includeMeta: options.includeMeta ?? false 344 | }); 345 | } finally { 346 | await cleanup(); 347 | } 348 | } 349 | 350 | export async function scrapeRepositoryToToon( 351 | repoUrl: string, 352 | options?: EncodeOptions 353 | ): Promise { 354 | const { toon } = await scrapeRepositoryToToonWithTokenCount(repoUrl, options); 355 | return toon; 356 | } 357 | 358 | export interface ToonScrapeResult { 359 | toon: string; 360 | tokenCount: number; 361 | } 362 | 363 | export async function scrapeRepositoryToToonWithTokenCount( 364 | repoUrl: string, 365 | encodeOptions?: EncodeOptions, 366 | tokenOptions?: TokenCountOptions 367 | ): Promise { 368 | const files = await scrapeRepositoryToJson(repoUrl); 369 | const toon = encode({ files }, encodeOptions); 370 | return { 371 | toon, 372 | tokenCount: countTokens(toon, tokenOptions) 373 | }; 374 | } 375 | 376 | export interface JsonScrapeResult { 377 | files: FileData[]; 378 | json: string; 379 | tokenCount: number; 380 | } 381 | 382 | export async function scrapeRepositoryToJsonWithTokenCount( 383 | repoUrl: string, 384 | indent = 2, 385 | tokenOptions?: 
TokenCountOptions 386 | ): Promise { 387 | const files = await scrapeRepositoryToJson(repoUrl); 388 | const json = JSON.stringify(files, null, indent); 389 | return { 390 | files, 391 | json, 392 | tokenCount: countTokens(json, tokenOptions) 393 | }; 394 | } 395 | 396 | export interface TranscriptScrapeResult { 397 | text: string; 398 | tokenCount: number; 399 | } 400 | 401 | export type PlainTextScrapeResult = TranscriptScrapeResult; 402 | 403 | export async function scrapeRepositoryToPlainTextWithTokenCount( 404 | repoUrl: string, 405 | tokenOptions?: TokenCountOptions, 406 | transcriptOptions?: TranscriptFormatOptions 407 | ): Promise { 408 | const text = await scrapeRepositoryToTranscript(repoUrl, transcriptOptions); 409 | return { 410 | text, 411 | tokenCount: countTokens(text, tokenOptions) 412 | }; 413 | } 414 | --------------------------------------------------------------------------------