├── .DS_Store ├── tsconfig.json ├── src ├── tokenCounter.ts ├── index.ts ├── clijson.ts ├── clitoon.ts ├── clitext.ts └── scraper.ts ├── .github └── workflows │ ├── npm-publish.yml │ └── node.js.yaml ├── package.json ├── LICENSE ├── example.js ├── benchmark ├── results.md ├── results.json └── benchmark.js ├── .gitignore ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arnab2001/git-repo-parser/HEAD/.DS_Store -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "commonjs", 5 | "moduleResolution": "node", 6 | "declaration": true, 7 | "outDir": "./dist", 8 | "strict": true, 9 | "skipLibCheck": true 10 | }, 11 | "include": ["src/**/*.ts", "node_modules/git-repo-to-json/src/**/*.ts"], 12 | "exclude": ["node_modules"] 13 | } 14 | -------------------------------------------------------------------------------- /src/tokenCounter.ts: -------------------------------------------------------------------------------- 1 | import { encode } from 'gpt-tokenizer'; 2 | 3 | export interface TokenCountOptions { 4 | /** 5 | * Optional delimiter to join multiple segments prior to tokenising. 6 | * Defaults to a single newline. 7 | */ 8 | joinWith?: string; 9 | } 10 | 11 | export function countTokens( 12 | segments: string | string[], 13 | { joinWith = '\n' }: TokenCountOptions = {} 14 | ): number { 15 | const text = Array.isArray(segments) ? 
segments.join(joinWith) : segments; 16 | return encode(text).length; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /.github/workflows/npm-publish.yml: -------------------------------------------------------------------------------- 1 | name: Node.js Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - uses: actions/setup-node@v3 13 | with: 14 | node-version: 22 15 | 16 | 17 | publish-npm: 18 | needs: build 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - uses: actions/setup-node@v3 23 | with: 24 | node-version: 22 25 | registry-url: https://registry.npmjs.org/ 26 | - run: npm install 27 | - run: npm run build 28 | - run: npm publish 29 | env: 30 | NODE_AUTH_TOKEN: ${{secrets.npm_token}} -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "git-repo-parser", 3 | "version": "3.0.0", 4 | "description": "A tool to scrape all files from a GitHub repository and turn it into a JSON file", 5 | "bin": { 6 | "git-repo-to-json": "dist/clijson.js", 7 | "git-repo-to-text": "dist/clitext.js", 8 | "git-repo-to-toon": "dist/clitoon.js" 9 | }, 10 | "files": [ 11 | "dist" 12 | ], 13 | "main": "dist/index.js", 14 | "scripts": { 15 | "build": "tsc", 16 | "start": "node dist/cli.js", 17 | "benchmark": "node benchmark/benchmark.js" 18 | }, 19 | "keywords": [ 20 | "github", 21 | "scraper", 22 | "json" 23 | ], 24 | "author": "arnab2001", 25 | "license": "MIT", 26 | "dependencies": { 27 | "@toon-format/toon": "^0.8.0", 28 | "gpt-tokenizer": "^3.4.0", 29 | "simple-git": "^2.41.0" 30 | }, 31 | "devDependencies": { 32 | "@types/node": "^18.11.19", 33 | "typescript": "^5.4.5" 34 | } 35 | } 36 | 
-------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { 2 | scrapeRepositoryToJson, 3 | scrapeRepositoryToPlainText, 4 | scrapeRepositoryToTranscript, 5 | scrapeRepositoryToToon, 6 | scrapeRepositoryToToonWithTokenCount, 7 | scrapeRepositoryToJsonWithTokenCount, 8 | scrapeRepositoryToPlainTextWithTokenCount, 9 | type ToonScrapeResult, 10 | type JsonScrapeResult, 11 | type TranscriptScrapeResult, 12 | type PlainTextScrapeResult, 13 | type TranscriptFormatOptions, 14 | } from './scraper'; 15 | import { countTokens, type TokenCountOptions } from './tokenCounter'; 16 | 17 | export { 18 | scrapeRepositoryToJson, 19 | scrapeRepositoryToPlainText, 20 | scrapeRepositoryToTranscript, 21 | scrapeRepositoryToToon, 22 | scrapeRepositoryToToonWithTokenCount, 23 | scrapeRepositoryToJsonWithTokenCount, 24 | scrapeRepositoryToPlainTextWithTokenCount, 25 | type ToonScrapeResult, 26 | type JsonScrapeResult, 27 | type TranscriptScrapeResult, 28 | type PlainTextScrapeResult, 29 | type TranscriptFormatOptions, 30 | countTokens, 31 | type TokenCountOptions, 32 | }; 33 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will do a clean installation of node dependencies, cache/restore them, build the source code and run tests across different versions of node 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-nodejs 3 | 4 | name: Node.js CI 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [ 22.x] 20 | # See supported Node.js release schedule at 
https://nodejs.org/en/about/releases/ 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v3 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | cache: 'npm' 29 | - run: npm install 30 | - run: npm run build 31 | - run: npm link 32 | - run: git-repo-to-json https://github.com/arnab2001/css-modules-transformer 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Arnab Chatterjee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/clijson.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { scrapeRepositoryToJsonWithTokenCount } from './scraper'; 4 | import { promises as fs } from 'fs'; 5 | 6 | async function main() { 7 | const args = process.argv.slice(2); 8 | 9 | let repoUrl: string | undefined; 10 | const flags = new Set(); 11 | 12 | for (const arg of args) { 13 | if (arg.startsWith('-')) { 14 | flags.add(arg); 15 | } else if (!repoUrl) { 16 | repoUrl = arg; 17 | } else { 18 | console.warn(`Ignoring unexpected argument: ${arg}`); 19 | } 20 | } 21 | 22 | if (!repoUrl) { 23 | console.error('Please provide a GitHub repository URL.'); 24 | process.exit(1); 25 | } 26 | 27 | const showTokenCount = 28 | flags.has('--tokens') || 29 | flags.has('--token-count') || 30 | flags.has('--token') || 31 | flags.has('-t'); 32 | 33 | const { json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 34 | await fs.writeFile('files.json', `${json}\n`, { encoding: 'utf-8' }); 35 | console.log('File list has been saved to files.json'); 36 | 37 | if (showTokenCount) { 38 | console.log(`Token count (cl100k_base): ${tokenCount}`); 39 | } 40 | } 41 | 42 | main().catch(err => { 43 | console.error(err); 44 | process.exitCode = 1; 45 | }); -------------------------------------------------------------------------------- /src/clitoon.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { scrapeRepositoryToToonWithTokenCount } from './scraper'; 4 | import { promises as fs } from 'fs'; 5 | 6 | async function main() { 7 | const args = process.argv.slice(2); 8 | 9 | let repoUrl: string | undefined; 10 | const flags = new Set(); 11 | 12 | for (const arg of args) { 13 | if (arg.startsWith('-')) { 14 | flags.add(arg); 15 | } else if (!repoUrl) { 16 | repoUrl = arg; 17 | } else { 18 
| console.warn(`Ignoring unexpected argument: ${arg}`); 19 | } 20 | } 21 | 22 | if (!repoUrl) { 23 | console.error('Please provide a GitHub repository URL.'); 24 | process.exit(1); 25 | } 26 | 27 | const showTokenCount = 28 | flags.has('--tokens') || 29 | flags.has('--token-count') || 30 | flags.has('--token') || 31 | flags.has('-t'); 32 | 33 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 34 | await fs.writeFile('files.toon', `${toon}\n`, { encoding: 'utf-8' }); 35 | console.log('File list has been saved to files.toon'); 36 | 37 | if (showTokenCount) { 38 | console.log(`Token count (cl100k_base): ${tokenCount}`); 39 | } 40 | } 41 | 42 | main().catch(err => { 43 | console.error(err); 44 | process.exitCode = 1; 45 | }); 46 | 47 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs/promises'); 2 | const path = require('path'); 3 | const { 4 | scrapeRepositoryToJsonWithTokenCount, 5 | scrapeRepositoryToToonWithTokenCount, 6 | scrapeRepositoryToPlainTextWithTokenCount, 7 | } = require('./dist'); 8 | 9 | const OUTPUT_FILE = path.join(process.cwd(), 'output.txt'); 10 | const MAX_PREVIEW_LINES = 50; 11 | 12 | function previewLines(content, maxLines = MAX_PREVIEW_LINES) { 13 | return content.split('\n').slice(0, maxLines).join('\n'); 14 | } 15 | 16 | async function main() { 17 | const repoUrl = process.argv[2] ?? 
'https://github.com/kitops-ml/gh-kit-setup'; 18 | 19 | const sections = []; 20 | sections.push(`# git-repo-parser sample export 21 | Repository: ${repoUrl} 22 | Generated: ${new Date().toISOString()} 23 | `); 24 | 25 | const { json, tokenCount: jsonTokens } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 26 | sections.push(`## JSON Export 27 | Token usage (cl100k_base): ${jsonTokens} 28 | Preview (first ${MAX_PREVIEW_LINES} lines): 29 | 30 | ${previewLines(json)} 31 | `); 32 | 33 | const { toon, tokenCount: toonTokens } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 34 | sections.push(`## TOON Export 35 | Token usage (cl100k_base): ${toonTokens} 36 | Preview (first ${MAX_PREVIEW_LINES} lines): 37 | 38 | ${previewLines(toon)} 39 | `); 40 | 41 | const { text, tokenCount: textTokens } = await scrapeRepositoryToPlainTextWithTokenCount(repoUrl); 42 | sections.push(`## Plain Text Export 43 | Token usage (cl100k_base): ${textTokens} 44 | Preview (first ${MAX_PREVIEW_LINES} lines): 45 | 46 | ${previewLines(text)} 47 | `); 48 | 49 | const body = sections.join('\n'); 50 | await fs.writeFile(OUTPUT_FILE, `${body.trimEnd()}\n`, { encoding: 'utf-8' }); 51 | console.log(`Output written to ${OUTPUT_FILE}`); 52 | } 53 | 54 | main().catch((error) => { 55 | console.error(error); 56 | process.exitCode = 1; 57 | }); -------------------------------------------------------------------------------- /benchmark/results.md: -------------------------------------------------------------------------------- 1 | # git-repo-parser Benchmark 2 | 3 | Generated: 2025-11-09T21:18:49.257Z 4 | 5 | ## octocat-hello-world 6 | 7 | Repository: https://github.com/octocat/Hello-World 8 | 9 | | Format | Duration | Token Count | Output Bytes | Extra | 10 | | --- | ---: | ---: | ---: | --- | 11 | | JSON | 936.95ms | 37 | 107 | files: 1 | 12 | | TOON | 880.82ms | 22 | 71 | | 13 | | Plain Text | 919.81ms | 15 | 52 | | 14 | 15 | ## octocat-spoon-knife 16 | 17 | Repository: 
https://github.com/octocat/Spoon-Knife 18 | 19 | | Format | Duration | Token Count | Output Bytes | Extra | 20 | | --- | ---: | ---: | ---: | --- | 21 | | JSON | 904.20ms | 523 | 1,759 | files: 3 | 22 | | TOON | 906.01ms | 460 | 1,589 | | 23 | | Plain Text | 895.69ms | 414 | 1,530 | | 24 | 25 | ## axios-axios 26 | 27 | Repository: https://github.com/axios/axios 28 | 29 | | Format | Duration | Token Count | Output Bytes | Extra | 30 | | --- | ---: | ---: | ---: | --- | 31 | | JSON | 3263.69ms | 261,071 | 922,326 | files: 30 | 32 | | TOON | 3151.31ms | 255,992 | 898,912 | | 33 | | Plain Text | 3269.70ms | 222,580 | 864,449 | | 34 | 35 | ## sindresorhus-slugify 36 | 37 | Repository: https://github.com/sindresorhus/slugify 38 | 39 | | Format | Duration | Token Count | Output Bytes | Extra | 40 | | --- | ---: | ---: | ---: | --- | 41 | | JSON | 941.90ms | 10,610 | 31,520 | files: 12 | 42 | | TOON | 924.59ms | 10,462 | 31,053 | | 43 | | Plain Text | 936.36ms | 8,599 | 28,697 | | 44 | 45 | ## lodash-lodash 46 | 47 | Repository: https://github.com/lodash/lodash 48 | 49 | | Format | Duration | Token Count | Output Bytes | Extra | 50 | | --- | ---: | ---: | ---: | --- | 51 | | JSON | 7290.95ms | 1,367,886 | 4,652,891 | files: 25 | 52 | | TOON | 6984.51ms | 1,365,925 | 4,643,896 | | 53 | | Plain Text | 7281.70ms | 1,184,440 | 4,469,746 | | 54 | 55 | ## octokit-request 56 | 57 | Repository: https://github.com/octokit/request.js 58 | 59 | | Format | Duration | Token Count | Output Bytes | Extra | 60 | | --- | ---: | ---: | ---: | --- | 61 | | JSON | 1493.24ms | 44,982 | 154,107 | files: 11 | 62 | | TOON | 1471.80ms | 44,256 | 151,406 | | 63 | | Plain Text | 1470.12ms | 38,016 | 143,952 | | 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 
.pnpm-debug.log* 9 | 10 | # Diagnostic reports (https://nodejs.org/api/report.html) 11 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 12 | 13 | # Runtime data 14 | pids 15 | *.pid 16 | *.seed 17 | *.pid.lock 18 | 19 | # Directory for instrumented libs generated by jscoverage/JSCover 20 | lib-cov 21 | 22 | # Coverage directory used by tools like istanbul 23 | coverage 24 | *.lcov 25 | 26 | # nyc test coverage 27 | .nyc_output 28 | 29 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 30 | .grunt 31 | 32 | # Bower dependency directory (https://bower.io/) 33 | bower_components 34 | 35 | # node-waf configuration 36 | .lock-wscript 37 | 38 | # Compiled binary addons (https://nodejs.org/api/addons.html) 39 | build/Release 40 | 41 | # Dependency directories 42 | node_modules/ 43 | jspm_packages/ 44 | 45 | # Snowpack dependency directory (https://snowpack.dev/) 46 | web_modules/ 47 | 48 | # TypeScript cache 49 | *.tsbuildinfo 50 | 51 | # Optional npm cache directory 52 | .npm 53 | 54 | # Optional eslint cache 55 | .eslintcache 56 | 57 | # Optional stylelint cache 58 | .stylelintcache 59 | 60 | # Microbundle cache 61 | .rpt2_cache/ 62 | .rts2_cache_cjs/ 63 | .rts2_cache_es/ 64 | .rts2_cache_umd/ 65 | 66 | # Optional REPL history 67 | .node_repl_history 68 | 69 | # Output of 'npm pack' 70 | *.tgz 71 | 72 | # Yarn Integrity file 73 | .yarn-integrity 74 | 75 | # dotenv environment variable files 76 | .env 77 | .env.development.local 78 | .env.test.local 79 | .env.production.local 80 | .env.local 81 | 82 | # parcel-bundler cache (https://parceljs.org/) 83 | .cache 84 | .parcel-cache 85 | 86 | # Next.js build output 87 | .next 88 | out 89 | 90 | # Nuxt.js build / generate output 91 | .nuxt 92 | dist 93 | 94 | # Gatsby files 95 | .cache/ 96 | # Comment in the public line in if your project uses Gatsby and not Next.js 97 | # https://nextjs.org/blog/next-9-1#public-directory-support 98 | # public 99 | 100 | # vuepress build output 101 | 
.vuepress/dist 102 | 103 | # vuepress v2.x temp and cache directory 104 | .temp 105 | .cache 106 | 107 | # Docusaurus cache and generated files 108 | .docusaurus 109 | 110 | # Serverless directories 111 | .serverless/ 112 | 113 | # FuseBox cache 114 | .fusebox/ 115 | 116 | # DynamoDB Local files 117 | .dynamodb/ 118 | 119 | # TernJS port file 120 | .tern-port 121 | 122 | # Stores VSCode versions used for testing VSCode extensions 123 | .vscode-test 124 | 125 | # yarn v2 126 | .yarn/cache 127 | .yarn/unplugged 128 | .yarn/build-state.yml 129 | .yarn/install-state.gz 130 | .pnp.* -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribute to git-repo-parser 2 | 3 | ## Table of Contents 4 | 5 | * [Introduction](#introduction) 6 | * [Getting Started](#getting-started) 7 | * [Development Process](#development-process) 8 | * [Code Contributions](#code-contributions) 9 | * [Reporting Issues](#reporting-issues) 10 | * [Open Source Community](open-source-community) 11 | * [Conduct](conduct) 12 | * [Contributor License Agreement](contributor-license-agreement) 13 | * [Documentation](documentation) 14 | 15 | ## Introduction 16 | 17 | Thank you for your interest in contributing to git-repo-parser! This document outlines the guidelines for contributing to the project. By adhering to these guidelines, you can help us maintain a high-quality and sustainable codebase. 18 | 19 | ## Getting Started 20 | 21 | Before you start contributing, please take the following steps: 22 | 23 | 1. **Fork the Repository:** Fork the git-repo-parser repository on GitHub. This will create a copy of the repository in your own GitHub account. 24 | 2. **Clone the Forked Repository:** Clone your forked repository to your local machine using the following command: 25 | 26 | ``` 27 | git clone https://github.com//css-modules-transformer.git 28 | ``` 29 | 30 | 3. 
**Set Up a Development Environment:** Set up a development environment on your local machine. This may involve installing necessary dependencies, configuring a code editor, and setting up a local development server. 31 | 32 | ## Development Process 33 | 34 | 1. **Create a Feature Branch:** When working on a new feature or fixing a bug, create a new feature branch from the `main` branch: 35 | 36 | ``` 37 | git checkout -b 38 | ``` 39 | 40 | 2. **Make Changes:** Make your changes to the codebase in your feature branch. Follow the coding conventions and best practices to ensure code quality. 41 | 42 | 3. **Commit Changes:** Commit your changes regularly using meaningful commit messages: 43 | 44 | ``` 45 | git add 46 | git commit -m "" 47 | ``` 48 | 49 | 4. **Push Changes:** Push your changes to your forked repository on GitHub: 50 | 51 | ``` 52 | git push origin 53 | ``` 54 | 55 | 5. **Create a Pull Request:** Create a pull request to merge your feature branch into the `main` branch of the original repository. Provide a clear and concise description of your changes in the pull request. 56 | 57 | 6. **Review and Feedback:** The project maintainers will review your pull request and provide feedback. Be prepared to address any feedback or suggestions to improve your contribution. 58 | 59 | ## Code Contributions 60 | 61 | When contributing code to git-repo-parser, please adhere to the following guidelines: 62 | 63 | * Use concise and descriptive variable and function names. 64 | * Follow the coding conventions and best practices used in the existing codebase. 65 | * Write well-commented code to help other contributors understand your changes. 66 | * Ensure that your code is tested and passes all the existing tests. 67 | * Keep your changes focused and avoid introducing unrelated changes. 68 | 69 | ## Reporting Issues 70 | 71 | If you encounter any bugs or issues with git-repo-parser, please report them using the GitHub issue tracker. 
When reporting an issue, please provide the following information: 72 | 73 | * A concise and descriptive title that summarizes the issue. 74 | * A detailed description of the issue, including steps to reproduce the issue and any relevant error messages or logs. 75 | * The version of git-repo-parser you are using. 76 | * If possible, provide a minimal reproducible example that demonstrates the issue. 77 | 78 | ## Open Source Community 79 | 80 | ### Conduct 81 | We are committed to fostering a welcoming and inclusive open-source community. We expect all contributors to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md) to create a respectful and collaborative environment. 82 | 83 | ### Contributor License Agreement 84 | By contributing to git-repo-parser, you agree to the terms of our [Contributor License Agreement (CLA)](CLA.md). The CLA ensures that we can use your contributions in accordance with the project's license. 85 | 86 | ### Documentation 87 | If you have any questions or need help using or contributing to git-repo-parser, don't hesitate to ask for help on the project's GitHub page or through the project's communication channels (e.g., Discord, Slack, etc.). Additionally, our [documentation](DOCUMENTATION.md) provides comprehensive information about using and contributing to the project. 
88 | -------------------------------------------------------------------------------- /src/clitext.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { promises as fs } from 'fs'; 4 | import { 5 | scrapeRepositoryToJsonWithTokenCount, 6 | scrapeRepositoryToToonWithTokenCount, 7 | scrapeRepositoryToPlainTextWithTokenCount, 8 | type TranscriptFormatOptions 9 | } from './scraper'; 10 | 11 | type OutputFormat = 'json' | 'toon' | 'transcript'; 12 | 13 | interface ParsedOptions { 14 | repoUrl?: string; 15 | format: OutputFormat; 16 | includeMeta: boolean; 17 | showTokenCount: boolean; 18 | } 19 | 20 | function parseArgs(args: string[]): ParsedOptions { 21 | let repoUrl: string | undefined; 22 | let format: OutputFormat = 'transcript'; 23 | let includeMeta = false; 24 | let showTokenCount = false; 25 | 26 | for (let i = 0; i < args.length; i += 1) { 27 | const arg = args[i]; 28 | 29 | if (arg === '--format') { 30 | const value = args[i + 1]; 31 | if (!value) { 32 | throw new Error('Expected a value after --format'); 33 | } 34 | format = parseFormat(value); 35 | i += 1; 36 | continue; 37 | } 38 | 39 | if (arg.startsWith('--format=')) { 40 | const value = arg.split('=', 2)[1] ?? 
''; 41 | format = parseFormat(value); 42 | continue; 43 | } 44 | 45 | if (arg === '--meta') { 46 | includeMeta = true; 47 | continue; 48 | } 49 | 50 | if (arg === '--no-meta') { 51 | includeMeta = false; 52 | continue; 53 | } 54 | 55 | if (arg === '--tokens' || arg === '--token-count' || arg === '--token' || arg === '-t') { 56 | showTokenCount = true; 57 | continue; 58 | } 59 | 60 | if (arg.startsWith('-')) { 61 | console.warn(`Ignoring unrecognised flag: ${arg}`); 62 | continue; 63 | } 64 | 65 | if (!repoUrl) { 66 | repoUrl = arg; 67 | } else { 68 | console.warn(`Ignoring unexpected argument: ${arg}`); 69 | } 70 | } 71 | 72 | return { repoUrl, format, includeMeta, showTokenCount }; 73 | } 74 | 75 | function parseFormat(value: string): OutputFormat { 76 | const normalised = value.trim().toLowerCase(); 77 | if (normalised === 'json' || normalised === 'toon' || normalised === 'transcript') { 78 | return normalised; 79 | } 80 | throw new Error(`Unsupported format "${value}". Expected one of: json, toon, transcript.`); 81 | } 82 | 83 | async function main() { 84 | let options: ParsedOptions; 85 | try { 86 | options = parseArgs(process.argv.slice(2)); 87 | } catch (error) { 88 | console.error(error instanceof Error ? 
error.message : error); 89 | process.exit(1); 90 | return; 91 | } 92 | 93 | const { repoUrl, format, includeMeta, showTokenCount } = options; 94 | 95 | if (!repoUrl) { 96 | console.error('Please provide a GitHub repository URL.'); 97 | process.exit(1); 98 | return; 99 | } 100 | 101 | switch (format) { 102 | case 'json': 103 | await handleJson(repoUrl, showTokenCount); 104 | break; 105 | case 'toon': 106 | await handleToon(repoUrl, showTokenCount); 107 | break; 108 | case 'transcript': 109 | default: 110 | await handleTranscript(repoUrl, includeMeta, showTokenCount); 111 | break; 112 | } 113 | } 114 | 115 | async function handleJson(repoUrl: string, showTokenCount: boolean) { 116 | const { json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 117 | await fs.writeFile('files.json', `${json}\n`, { encoding: 'utf-8' }); 118 | console.log('File list has been saved to files.json'); 119 | if (showTokenCount) { 120 | console.log(`Token count (cl100k_base): ${tokenCount}`); 121 | } 122 | } 123 | 124 | async function handleToon(repoUrl: string, showTokenCount: boolean) { 125 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 126 | await fs.writeFile('files.toon', `${toon}\n`, { encoding: 'utf-8' }); 127 | console.log('File list has been saved to files.toon'); 128 | if (showTokenCount) { 129 | console.log(`Token count (cl100k_base): ${tokenCount}`); 130 | } 131 | } 132 | 133 | async function handleTranscript(repoUrl: string, includeMeta: boolean, showTokenCount: boolean) { 134 | const transcriptOptions: TranscriptFormatOptions = { includeMeta }; 135 | const { text, tokenCount } = await scrapeRepositoryToPlainTextWithTokenCount( 136 | repoUrl, 137 | undefined, 138 | transcriptOptions 139 | ); 140 | 141 | await fs.writeFile('files.txt', text, { encoding: 'utf-8' }); 142 | console.log('RepoScript transcript has been saved to files.txt'); 143 | 144 | if (showTokenCount) { 145 | console.log(`Token count (cl100k_base): 
${tokenCount}`); 146 | } 147 | } 148 | 149 | main().catch(err => { 150 | console.error(err); 151 | process.exitCode = 1; 152 | }); -------------------------------------------------------------------------------- /benchmark/results.json: -------------------------------------------------------------------------------- 1 | { 2 | "generatedAt": "2025-11-09T21:18:49.257Z", 3 | "repositories": [ 4 | { 5 | "name": "octocat-hello-world", 6 | "url": "https://github.com/octocat/Hello-World", 7 | "results": [ 8 | { 9 | "format": "json", 10 | "formatLabel": "JSON", 11 | "durationMs": 936.953458, 12 | "tokenCount": 37, 13 | "outputBytes": 107, 14 | "extra": "files: 1" 15 | }, 16 | { 17 | "format": "toon", 18 | "formatLabel": "TOON", 19 | "durationMs": 880.8243749999999, 20 | "tokenCount": 22, 21 | "outputBytes": 71 22 | }, 23 | { 24 | "format": "text", 25 | "formatLabel": "Plain Text", 26 | "durationMs": 919.8080830000001, 27 | "tokenCount": 15, 28 | "outputBytes": 52 29 | } 30 | ], 31 | "error": null 32 | }, 33 | { 34 | "name": "octocat-spoon-knife", 35 | "url": "https://github.com/octocat/Spoon-Knife", 36 | "results": [ 37 | { 38 | "format": "json", 39 | "formatLabel": "JSON", 40 | "durationMs": 904.1952919999999, 41 | "tokenCount": 523, 42 | "outputBytes": 1759, 43 | "extra": "files: 3" 44 | }, 45 | { 46 | "format": "toon", 47 | "formatLabel": "TOON", 48 | "durationMs": 906.0097909999995, 49 | "tokenCount": 460, 50 | "outputBytes": 1589 51 | }, 52 | { 53 | "format": "text", 54 | "formatLabel": "Plain Text", 55 | "durationMs": 895.6935830000002, 56 | "tokenCount": 414, 57 | "outputBytes": 1530 58 | } 59 | ], 60 | "error": null 61 | }, 62 | { 63 | "name": "axios-axios", 64 | "url": "https://github.com/axios/axios", 65 | "results": [ 66 | { 67 | "format": "json", 68 | "formatLabel": "JSON", 69 | "durationMs": 3263.6907499999998, 70 | "tokenCount": 261071, 71 | "outputBytes": 922326, 72 | "extra": "files: 30" 73 | }, 74 | { 75 | "format": "toon", 76 | "formatLabel": "TOON", 77 | 
"durationMs": 3151.3070829999997, 78 | "tokenCount": 255992, 79 | "outputBytes": 898912 80 | }, 81 | { 82 | "format": "text", 83 | "formatLabel": "Plain Text", 84 | "durationMs": 3269.705, 85 | "tokenCount": 222580, 86 | "outputBytes": 864449 87 | } 88 | ], 89 | "error": null 90 | }, 91 | { 92 | "name": "sindresorhus-slugify", 93 | "url": "https://github.com/sindresorhus/slugify", 94 | "results": [ 95 | { 96 | "format": "json", 97 | "formatLabel": "JSON", 98 | "durationMs": 941.9017920000006, 99 | "tokenCount": 10610, 100 | "outputBytes": 31520, 101 | "extra": "files: 12" 102 | }, 103 | { 104 | "format": "toon", 105 | "formatLabel": "TOON", 106 | "durationMs": 924.5914580000008, 107 | "tokenCount": 10462, 108 | "outputBytes": 31053 109 | }, 110 | { 111 | "format": "text", 112 | "formatLabel": "Plain Text", 113 | "durationMs": 936.3613750000004, 114 | "tokenCount": 8599, 115 | "outputBytes": 28697 116 | } 117 | ], 118 | "error": null 119 | }, 120 | { 121 | "name": "lodash-lodash", 122 | "url": "https://github.com/lodash/lodash", 123 | "results": [ 124 | { 125 | "format": "json", 126 | "formatLabel": "JSON", 127 | "durationMs": 7290.946833000002, 128 | "tokenCount": 1367886, 129 | "outputBytes": 4652891, 130 | "extra": "files: 25" 131 | }, 132 | { 133 | "format": "toon", 134 | "formatLabel": "TOON", 135 | "durationMs": 6984.508209, 136 | "tokenCount": 1365925, 137 | "outputBytes": 4643896 138 | }, 139 | { 140 | "format": "text", 141 | "formatLabel": "Plain Text", 142 | "durationMs": 7281.696166999998, 143 | "tokenCount": 1184440, 144 | "outputBytes": 4469746 145 | } 146 | ], 147 | "error": null 148 | }, 149 | { 150 | "name": "octokit-request", 151 | "url": "https://github.com/octokit/request.js", 152 | "results": [ 153 | { 154 | "format": "json", 155 | "formatLabel": "JSON", 156 | "durationMs": 1493.2384579999998, 157 | "tokenCount": 44982, 158 | "outputBytes": 154107, 159 | "extra": "files: 11" 160 | }, 161 | { 162 | "format": "toon", 163 | "formatLabel": "TOON", 
164 | "durationMs": 1471.8035000000018, 165 | "tokenCount": 44256, 166 | "outputBytes": 151406 167 | }, 168 | { 169 | "format": "text", 170 | "formatLabel": "Plain Text", 171 | "durationMs": 1470.116833, 172 | "tokenCount": 38016, 173 | "outputBytes": 143952 174 | } 175 | ], 176 | "error": null 177 | } 178 | ] 179 | } 180 | -------------------------------------------------------------------------------- /benchmark/benchmark.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs/promises'); 4 | const path = require('path'); 5 | const { performance } = require('perf_hooks'); 6 | const { 7 | scrapeRepositoryToJsonWithTokenCount, 8 | scrapeRepositoryToToonWithTokenCount, 9 | scrapeRepositoryToPlainTextWithTokenCount, 10 | } = require('../dist'); 11 | 12 | const REPOSITORIES = [ 13 | { name: 'octocat-hello-world', url: 'https://github.com/octocat/Hello-World' }, 14 | { name: 'octocat-spoon-knife', url: 'https://github.com/octocat/Spoon-Knife' }, 15 | { name: 'axios-axios', url: 'https://github.com/axios/axios' }, 16 | { name: 'sindresorhus-slugify', url: 'https://github.com/sindresorhus/slugify' }, 17 | { name: 'lodash-lodash', url: 'https://github.com/lodash/lodash' }, 18 | { name: 'octokit-request', url: 'https://github.com/octokit/request.js' }, 19 | ]; 20 | 21 | const FORMATS = [ 22 | { 23 | key: 'json', 24 | label: 'JSON', 25 | run: async (url) => { 26 | const { files, json, tokenCount } = await scrapeRepositoryToJsonWithTokenCount(url); 27 | return { 28 | tokenCount, 29 | output: json, 30 | meta: { 31 | fileCount: files.length, 32 | }, 33 | }; 34 | }, 35 | }, 36 | { 37 | key: 'toon', 38 | label: 'TOON', 39 | run: async (url) => { 40 | const { toon, tokenCount } = await scrapeRepositoryToToonWithTokenCount(url); 41 | return { 42 | tokenCount, 43 | output: toon, 44 | }; 45 | }, 46 | }, 47 | { 48 | key: 'text', 49 | label: 'Plain Text', 50 | run: async (url) => { 51 | const { text, 
// Render a millisecond duration with two decimal places, e.g. "12.34ms".
function formatDuration(ms) {
  return `${ms.toFixed(2)}ms`;
}

// Build the markdown benchmark report for a summary object of the shape
// { generatedAt, repositories: [{ name, url, results, error }] }.
// Output always ends with exactly one trailing newline.
function renderMarkdown(summary) {
  const out = ['# git-repo-parser Benchmark', '', `Generated: ${summary.generatedAt}`, ''];

  summary.repositories.forEach((repo) => {
    out.push(
      `## ${repo.name}`,
      '',
      `Repository: ${repo.url}`,
      '',
      '| Format | Duration | Token Count | Output Bytes | Extra |',
      '| --- | ---: | ---: | ---: | --- |',
    );

    repo.results.forEach((row) => {
      const cells = [
        row.formatLabel,
        formatDuration(row.durationMs),
        row.tokenCount.toLocaleString(),
        row.outputBytes.toLocaleString(),
        row.extra ?? '',
      ];
      out.push(`| ${cells.join(' | ')} |`);
    });

    if (repo.error) {
      out.push('', `⚠️ Error: ${repo.error}`);
    }

    out.push('');
  });

  return `${out.join('\n').trimEnd()}\n`;
}
126 | durationMs, 127 | tokenCount: output.tokenCount, 128 | outputBytes, 129 | extra: output.meta?.fileCount ? `files: ${output.meta.fileCount}` : undefined, 130 | }); 131 | 132 | const previewPath = path.join(outDir, `${repo.name}.${format.key}.preview.txt`); 133 | const previewLines = output.output.split('\n').slice(0, 100).join('\n'); 134 | await fs.writeFile( 135 | previewPath, 136 | `${previewLines}${previewLines.endsWith('\n') ? '' : '\n'}`, 137 | { encoding: 'utf-8' }, 138 | ); 139 | } catch (error) { 140 | const durationMs = performance.now() - start; 141 | repoSummary.results.push({ 142 | format: format.key, 143 | formatLabel: format.label, 144 | durationMs, 145 | tokenCount: 0, 146 | outputBytes: 0, 147 | extra: `error`, 148 | }); 149 | repoSummary.error = error.stack ?? String(error); 150 | console.error(` Failed on ${format.label}:`, error); 151 | break; 152 | } 153 | } 154 | 155 | summary.repositories.push(repoSummary); 156 | } 157 | 158 | const jsonPath = path.join(outDir, 'results.json'); 159 | const mdPath = path.join(outDir, 'results.md'); 160 | 161 | await fs.writeFile(jsonPath, `${JSON.stringify(summary, null, 2)}\n`, { encoding: 'utf-8' }); 162 | await fs.writeFile(mdPath, renderMarkdown(summary), { encoding: 'utf-8' }); 163 | 164 | console.log(`Benchmark complete. 
Results saved to:`); 165 | console.log(` - ${jsonPath}`); 166 | console.log(` - ${mdPath}`); 167 | console.log(` - ${outDir}/*.preview.txt`); 168 | } 169 | 170 | main().catch((error) => { 171 | console.error(error); 172 | process.exitCode = 1; 173 | }); 174 | 175 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | . 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # git-repo-parser 3 | 4 | A powerful tool to scrape all files from a GitHub repository and convert them into JSON, Token-Oriented Object Notation (TOON), or RepoScript (an LLM-first transcript format). 
5 | 6 | ## Installation 7 | 8 | Install the package globally using npm: 9 | 10 | ```bash 11 | npm install -g git-repo-parser 12 | ``` 13 | 14 | Or add it to your project as a dependency: 15 | 16 | ```bash 17 | npm install git-repo-parser 18 | ``` 19 | 20 | ## Usage 21 | 22 | ### Command Line Interface (CLI) 23 | 24 | This package provides three CLI commands: 25 | 26 | 1. `git-repo-to-json`: Scrapes a GitHub repository and saves the result as a JSON file. 27 | 2. `git-repo-to-toon`: Scrapes a GitHub repository and saves the result as a TOON file. 28 | 3. `git-repo-to-text`: Scrapes a GitHub repository and saves the result as a RepoScript transcript (formerly the “plain text” output). 29 | 30 | #### Example usage: 31 | 32 | ```bash 33 | # JSON and TOON exports (existing behaviour) 34 | git-repo-to-json https://github.com/username/repo-name.git 35 | git-repo-to-toon https://github.com/username/repo-name.git 36 | 37 | # RepoScript transcript without metadata (legacy plain-text behaviour) 38 | git-repo-to-text https://github.com/username/repo-name.git --format=transcript 39 | 40 | # RepoScript transcript with metadata lines and token count 41 | git-repo-to-text https://github.com/username/repo-name.git --format=transcript --meta --tokens 42 | 43 | # Alternate syntaxes 44 | git-repo-to-text https://github.com/username/repo-name.git --format=json 45 | git-repo-to-text https://github.com/username/repo-name.git --format=toon 46 | ``` 47 | 48 | The scraped data will be saved as `files.json`, `files.toon`, or `files.txt` in your current directory. When `--tokens` (or `--token`, `--token-count`, `-t`) is supplied, the CLI also prints the token count using the [CL100K vocabulary](https://github.com/openai/openai-openapi/blob/master/specification.md) for **any** export format. Use `--meta` / `--no-meta` to toggle RepoScript metadata lines (default is no metadata). 
49 | 50 | ### Benchmark Suite 51 | 52 | Run the bundled benchmark to evaluate scrape runtime and token usage across multiple public repositories: 53 | 54 | ```bash 55 | npm run build 56 | npm run benchmark 57 | ``` 58 | 59 | Results are saved under `benchmark/`: 60 | 61 | - `benchmark/results.json` – machine-readable summary (durations, token counts, output sizes) 62 | - `benchmark/results.md` – markdown report per repository/format 63 | - `benchmark/*.preview.txt` – first 100 lines of each export for spot-checking 64 | 65 | ### Programmatic Usage 66 | 67 | You can also use the package in your Node.js projects: 68 | 69 | ```javascript 70 | import { 71 | scrapeRepositoryToJson, 72 | scrapeRepositoryToToon, 73 | scrapeRepositoryToTranscript, 74 | scrapeRepositoryToJsonWithTokenCount, 75 | scrapeRepositoryToToonWithTokenCount, 76 | scrapeRepositoryToPlainTextWithTokenCount, 77 | type TranscriptFormatOptions, 78 | countTokens, 79 | } from 'git-repo-parser'; 80 | 81 | const repoUrl = 'https://github.com/username/repo-name.git'; 82 | 83 | // JSON output 84 | const jsonResult = await scrapeRepositoryToJson(repoUrl); 85 | 86 | // TOON output 87 | const toonResult = await scrapeRepositoryToToon(repoUrl); 88 | 89 | // RepoScript transcript (no metadata; equivalent to legacy plain text) 90 | const transcript = await scrapeRepositoryToTranscript(repoUrl); 91 | 92 | // RepoScript with metadata lines 93 | const transcriptOptions: TranscriptFormatOptions = { includeMeta: true }; 94 | const richTranscript = await scrapeRepositoryToTranscript(repoUrl, transcriptOptions); 95 | 96 | // Token-aware helpers 97 | const { json, tokenCount: jsonTokens } = await scrapeRepositoryToJsonWithTokenCount(repoUrl); 98 | const { toon, tokenCount: toonTokens } = await scrapeRepositoryToToonWithTokenCount(repoUrl); 99 | const { text, tokenCount: transcriptTokens } = await scrapeRepositoryToPlainTextWithTokenCount( 100 | repoUrl, 101 | undefined, 102 | transcriptOptions 103 | ); 104 | 105 | // 
Standalone token counting helper (uses gpt-tokenizer + cl100k_base) 106 | const tokens = countTokens(toon); 107 | ``` 108 | 109 | ## API 110 | 111 | ### `scrapeRepositoryToJson(repoUrl: string): Promise<FileData[]>` 112 | 113 | Scrapes the given GitHub repository and returns a promise that resolves to an array of `FileData` objects. 114 | 115 | ### `scrapeRepositoryToJsonWithTokenCount(repoUrl: string, indent = 2, tokenOptions?: TokenCountOptions): Promise<{ files: FileData[]; json: string; tokenCount: number }>` 116 | 117 | Scrapes the repository, returns the raw `FileData[]`, a pretty-printed JSON string, and the corresponding CL100K token consumption. 118 | 119 | ### `scrapeRepositoryToToon(repoUrl: string, options?: EncodeOptions): Promise<string>` 120 | 121 | Scrapes the given GitHub repository and returns a promise that resolves to a TOON-formatted string. You can pass [`EncodeOptions`](https://github.com/toon-format/toon#encoding) directly to customise indentation, delimiter, or length markers. 122 | 123 | ### `scrapeRepositoryToToonWithTokenCount(repoUrl: string, encodeOptions?: EncodeOptions, tokenOptions?: TokenCountOptions): Promise<{ toon: string; tokenCount: number }>` 124 | 125 | Generates the TOON-formatted output and returns both the encoded string and its token count as measured by [gpt-tokenizer](https://www.npmjs.com/package/gpt-tokenizer) using the default CL100K vocabulary. 126 | 127 | ### `scrapeRepositoryToTranscript(repoUrl: string, options?: TranscriptFormatOptions): Promise<string>` 128 | 129 | Scrapes the given GitHub repository and returns a RepoScript v1 transcript string. `TranscriptFormatOptions` currently supports `{ includeMeta?: boolean }` (default: `false`). The legacy `scrapeRepositoryToPlainText` export delegates to this helper with metadata disabled. 
130 | 131 | ### `scrapeRepositoryToPlainTextWithTokenCount(repoUrl: string, tokenOptions?: TokenCountOptions, transcriptOptions?: TranscriptFormatOptions): Promise<{ text: string; tokenCount: number }>` 132 | 133 | Scrapes the repository to RepoScript while reporting the token footprint of the generated transcript. Supply `transcriptOptions` to mirror CLI behaviour (e.g. `{ includeMeta: true }`). 134 | 135 | ## RepoScript v1 Format 136 | 137 | RepoScript is a deterministic, LLM-friendly transcript of a repository (formerly the “plain text” output). 138 | 139 | - **Deterministic ordering** 140 | - Directories are emitted in lexical order of their full POSIX paths. 141 | - Within a directory, files are listed in lexical order by filename. 142 | - **Marker grammar** 143 | - Markers always begin at column 0 and follow `[TAG] ` (single space). 144 | - Tags in use: `[DIR_START]`, `[DIR_END]`, `[FILE_START]`, `[FILE_END]`. 145 | - Paths are POSIX (e.g. `src/index.ts`) and never contain newlines. 146 | - **Optional metadata** 147 | - When `includeMeta` is enabled, files receive lines like `meta: lang=ts size=1234`. 148 | - Metadata lines appear immediately after `[FILE_START] ` and before file contents. 149 | - **Reserved tags** 150 | - `[COMMENT]`, `[CHUNK]`, and `[META]` are reserved for future use and MUST NOT appear unless escaped or emitted intentionally once semantics are defined. 151 | 152 | ### Sample Transcript 153 | 154 | ```text 155 | REPOSCRIPT version=1 156 | repo: https://github.com/user/project 157 | commit: abc123 158 | 159 | [FILE_START] src/index.ts 160 | meta: lang=ts size=123 161 | import { foo } from './foo'; 162 | 163 | [FILE_END] src/index.ts 164 | ``` 165 | 166 | > Note: The current CLI/API emit the `[..._START]` / `[..._END]` markers (with optional metadata). The header lines shown above are illustrative and may be added via tooling or future options. 
167 | 168 | ## FileData Interface 169 | 170 | The `FileData` interface represents the structure of files and directories in the JSON output: 171 | 172 | ```typescript 173 | interface FileData { 174 | name: string; 175 | path: string; 176 | type: 'file' | 'directory'; 177 | children?: FileData[]; 178 | content?: string; 179 | } 180 | ``` 181 | 182 | ## Features 183 | 184 | - Clones the repository locally (temporary) 185 | - Ignores binary files and common non-source files 186 | - Supports nested directory structures 187 | - Provides JSON, TOON, and RepoScript (plain-text) output formats 188 | - Cleans up cloned repository after scraping 189 | 190 | ## Ignored Files 191 | 192 | The following file types and patterns are ignored during scraping: 193 | 194 | - package-lock.json 195 | - Binary files (pdf, png, jpg, jpeg, gif, ico, svg, woff, woff2, eot, ttf, otf) 196 | - Media files (mp4, avi, webm, mov, mp3, wav, flac, ogg, webp) 197 | - Debug and error logs (npm-debug, yarn-debug, yarn-error) 198 | - Configuration files (tsconfig, jest.config) 199 | - The `.git` directory 200 | 201 | ## License 202 | 203 | This project is licensed under the MIT License. 204 | 205 | ## Author 206 | 207 | arnab2001 208 | 209 | ## Contributing 210 | 211 | Contributions, issues, and feature requests are welcome. Feel free to check the [issues page](https://github.com/arnab2001/git-repo-parser/issues) if you want to contribute. 212 | Also check the [Contribution Guide](CONTRIBUTING.md). 213 | Open Source Community 214 | Conduct 215 | 216 | We are committed to fostering a welcoming and inclusive open-source community. We expect all contributors to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md) to create a respectful and collaborative environment. 217 | ## Show your support 218 | 219 | Give a ⭐️ if this project helped you! 
220 | ``` 221 | -------------------------------------------------------------------------------- /src/scraper.ts: -------------------------------------------------------------------------------- 1 | import { promises as fs } from 'fs'; 2 | import type { Dirent } from 'fs'; 3 | import * as path from 'path'; 4 | import * as os from 'os'; 5 | import simpleGit from 'simple-git'; 6 | import { encode, type EncodeOptions } from '@toon-format/toon'; 7 | import { countTokens, type TokenCountOptions } from './tokenCounter'; 8 | 9 | export interface FileData { 10 | name: string; 11 | path: string; 12 | type: 'file' | 'directory'; 13 | children?: FileData[]; 14 | content?: string; 15 | } 16 | 17 | async function cloneRepository(repoUrl: string, clonePath: string) { 18 | const git = simpleGit(); 19 | await git.clone(repoUrl, clonePath); 20 | console.log(`Repository cloned to ${clonePath}`); 21 | } 22 | 23 | const MAX_CONCURRENCY = 10; 24 | const IGNORED_SEGMENTS = new Set(['.git']); 25 | 26 | const EXTENSION_LANG_MAP: Record = { 27 | '.ts': 'ts', 28 | '.tsx': 'tsx', 29 | '.js': 'js', 30 | '.jsx': 'jsx', 31 | '.mjs': 'js', 32 | '.cjs': 'js', 33 | '.json': 'json', 34 | '.md': 'md', 35 | '.py': 'py', 36 | '.rb': 'rb', 37 | '.go': 'go', 38 | '.rs': 'rs', 39 | '.java': 'java', 40 | '.kt': 'kt', 41 | '.kts': 'kt', 42 | '.swift': 'swift', 43 | '.c': 'c', 44 | '.h': 'c', 45 | '.cpp': 'cpp', 46 | '.hpp': 'cpp', 47 | '.cc': 'cpp', 48 | '.hh': 'cpp', 49 | '.cs': 'cs', 50 | '.php': 'php', 51 | '.sh': 'sh', 52 | '.bash': 'sh', 53 | '.zsh': 'sh', 54 | '.yaml': 'yaml', 55 | '.yml': 'yaml', 56 | '.toml': 'toml', 57 | '.ini': 'ini', 58 | '.cfg': 'ini', 59 | '.txt': 'txt', 60 | '.css': 'css', 61 | '.scss': 'scss', 62 | '.sass': 'sass', 63 | '.less': 'less', 64 | '.vue': 'vue', 65 | '.svelte': 'svelte' 66 | }; 67 | 68 | export interface TranscriptFormatOptions { 69 | includeMeta?: boolean; 70 | } 71 | 72 | function sanitiseRepoLabel(repoUrl: string): string { 73 | let candidate = repoUrl.trim(); 
74 | 75 | try { 76 | const parsed = new URL(repoUrl); 77 | candidate = parsed.pathname.split('/').pop() ?? ''; 78 | } catch { 79 | const segments = candidate.split('/'); 80 | candidate = segments[segments.length - 1] ?? ''; 81 | } 82 | 83 | candidate = candidate.replace(/\.git$/i, ''); 84 | const sanitised = candidate.toLowerCase().replace(/[^a-z0-9._-]+/g, '-').replace(/^-+|-+$/g, ''); 85 | const truncated = sanitised.slice(0, 64); 86 | return truncated || 'repository'; 87 | } 88 | 89 | async function prepareCloneWorkspace(repoUrl: string) { 90 | const repoLabel = sanitiseRepoLabel(repoUrl); 91 | const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'git-repo-parser-')); 92 | const clonePath = path.join(tempRoot, repoLabel); 93 | 94 | async function cleanup() { 95 | try { 96 | await fs.rm(tempRoot, { recursive: true, force: true }); 97 | } catch (error) { 98 | console.warn(`Failed to clean temporary directory ${tempRoot}:`, error); 99 | } 100 | } 101 | 102 | return { clonePath, cleanup }; 103 | } 104 | 105 | function toPosixPath(filePath: string): string { 106 | return filePath.split(path.sep).join('/'); 107 | } 108 | 109 | function shouldIgnorePath(relativePath: string, ignoreSegments: Set): boolean { 110 | const segments = relativePath.split(path.sep).filter(Boolean); 111 | return segments.some(segment => ignoreSegments.has(segment)); 112 | } 113 | 114 | async function mapWithConcurrency( 115 | items: T[], 116 | limit: number, 117 | mapper: (item: T, index: number) => Promise 118 | ): Promise { 119 | const results: R[] = new Array(items.length); 120 | let nextIndex = 0; 121 | 122 | async function worker() { 123 | while (true) { 124 | const currentIndex = nextIndex++; 125 | if (currentIndex >= items.length) { 126 | break; 127 | } 128 | results[currentIndex] = await mapper(items[currentIndex], currentIndex); 129 | } 130 | } 131 | 132 | const workerCount = Math.min(limit, items.length); 133 | await Promise.all(Array.from({ length: workerCount }, () => 
worker())); 134 | return results; 135 | } 136 | 137 | function shouldIgnoreFile(fileName: string): boolean { 138 | const lowerCaseFileName = fileName.toLowerCase(); 139 | return ( 140 | lowerCaseFileName === 'package-lock.json' || 141 | lowerCaseFileName.endsWith('.pdf') || 142 | lowerCaseFileName.endsWith('.png') || 143 | lowerCaseFileName.endsWith('.jpg') || 144 | lowerCaseFileName.endsWith('.jpeg') || 145 | lowerCaseFileName.endsWith('.gif') || 146 | lowerCaseFileName.endsWith('.ico') || 147 | lowerCaseFileName.endsWith('.svg') || 148 | lowerCaseFileName.endsWith('.woff') || 149 | lowerCaseFileName.endsWith('.woff2') || 150 | lowerCaseFileName.endsWith('.eot') || 151 | lowerCaseFileName.endsWith('.ttf') || 152 | lowerCaseFileName.endsWith('.otf') || 153 | lowerCaseFileName.endsWith('.mp4') || 154 | lowerCaseFileName.endsWith('.avi') || 155 | lowerCaseFileName.endsWith('.webm') || 156 | lowerCaseFileName.endsWith('.mov') || 157 | lowerCaseFileName.endsWith('.mp3') || 158 | lowerCaseFileName.endsWith('.wav') || 159 | lowerCaseFileName.endsWith('.flac') || 160 | lowerCaseFileName.endsWith('.ogg') || 161 | lowerCaseFileName.endsWith('.webp') || 162 | lowerCaseFileName.startsWith('package-lock') || 163 | lowerCaseFileName.startsWith('yarn-lock') || 164 | lowerCaseFileName.startsWith('npm-debug') || 165 | lowerCaseFileName.startsWith('yarn-debug') || 166 | lowerCaseFileName.startsWith('yarn-error') || 167 | lowerCaseFileName.startsWith('tsconfig') || 168 | lowerCaseFileName.startsWith('jest.config') 169 | 170 | // Add more extensions as needed 171 | ); 172 | } 173 | 174 | async function scrapeDirectoryToJson( 175 | dir: string, 176 | baseDir: string, 177 | ignoreSegments: Set 178 | ): Promise { 179 | const entries = await fs.readdir(dir, { withFileTypes: true }); 180 | 181 | const processed = await mapWithConcurrency(entries, MAX_CONCURRENCY, async (entry) => { 182 | const entryPath = path.join(dir, entry.name); 183 | const relativePath = path.relative(baseDir, 
entryPath); 184 | 185 | if (!relativePath) { 186 | return null; 187 | } 188 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 189 | return null; 190 | } 191 | 192 | try { 193 | if (entry.isDirectory()) { 194 | const children = await scrapeDirectoryToJson(entryPath, baseDir, ignoreSegments); 195 | return { 196 | name: entry.name, 197 | path: toPosixPath(relativePath), 198 | type: 'directory' as const, 199 | children 200 | }; 201 | } 202 | 203 | if (entry.isFile()) { 204 | const content = await fs.readFile(entryPath, { encoding: 'utf-8' }); 205 | return { 206 | name: entry.name, 207 | path: toPosixPath(relativePath), 208 | type: 'file' as const, 209 | content 210 | }; 211 | } 212 | } catch (error) { 213 | console.warn(`Skipping ${entryPath} due to error:`, error); 214 | } 215 | 216 | return null; 217 | }); 218 | 219 | const filtered = processed.filter((item): item is NonNullable => item !== null); 220 | return filtered as FileData[]; 221 | } 222 | 223 | function detectLanguage(fileName: string): string | undefined { 224 | const ext = path.extname(fileName).toLowerCase(); 225 | return EXTENSION_LANG_MAP[ext]; 226 | } 227 | 228 | function sortDirEntries(entries: Dirent[]): { directories: Dirent[]; files: Dirent[] } { 229 | const directories = entries.filter(entry => entry.isDirectory()).sort((a, b) => a.name.localeCompare(b.name)); 230 | const files = entries.filter(entry => entry.isFile()).sort((a, b) => a.name.localeCompare(b.name)); 231 | return { directories, files }; 232 | } 233 | 234 | function createMetadataLine(fileName: string, content: string): string { 235 | const size = Buffer.byteLength(content, 'utf-8'); 236 | const metadata: string[] = [`size=${size}`]; 237 | const lang = detectLanguage(fileName); 238 | if (lang) { 239 | metadata.unshift(`lang=${lang}`); 240 | } 241 | return `meta: ${metadata.join(' ')}`; 242 | } 243 | 244 | async function generateTranscript( 245 | dir: string, 246 | baseDir: string, 247 | 
ignoreSegments: Set, 248 | options: TranscriptFormatOptions, 249 | prefix = '' 250 | ): Promise { 251 | let result = ''; 252 | 253 | const entries = await fs.readdir(dir, { withFileTypes: true }); 254 | 255 | const { directories, files } = sortDirEntries(entries); 256 | 257 | for (const entry of directories) { 258 | const filePath = path.join(dir, entry.name); 259 | const relativePath = path.relative(baseDir, filePath); 260 | 261 | if (!relativePath) { 262 | continue; 263 | } 264 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 265 | continue; 266 | } 267 | 268 | const displayPath = toPosixPath(path.join(prefix, entry.name)); 269 | 270 | try { 271 | result += `[DIR_START] ${displayPath}\n`; 272 | result += await generateTranscript( 273 | filePath, 274 | baseDir, 275 | ignoreSegments, 276 | options, 277 | path.join(prefix, entry.name) 278 | ); 279 | result += `[DIR_END] ${displayPath}\n\n`; 280 | } catch (error) { 281 | console.warn(`Skipping ${filePath} due to error:`, error); 282 | } 283 | } 284 | 285 | for (const entry of files) { 286 | const filePath = path.join(dir, entry.name); 287 | const relativePath = path.relative(baseDir, filePath); 288 | 289 | if (!relativePath) { 290 | continue; 291 | } 292 | if (shouldIgnorePath(relativePath, ignoreSegments) || shouldIgnoreFile(entry.name)) { 293 | continue; 294 | } 295 | 296 | const displayPath = toPosixPath(path.join(prefix, entry.name)); 297 | 298 | try { 299 | const content = await fs.readFile(filePath, { encoding: 'utf-8' }); 300 | result += `[FILE_START] ${displayPath}\n`; 301 | 302 | if (options.includeMeta) { 303 | result += `${createMetadataLine(entry.name, content)}\n`; 304 | } 305 | 306 | result += content; 307 | if (!content.endsWith('\n')) { 308 | result += '\n'; 309 | } 310 | result += `[FILE_END] ${displayPath}\n\n`; 311 | } catch (error) { 312 | console.warn(`Skipping ${filePath} due to error:`, error); 313 | } 314 | } 315 | 316 | return result; 317 | } 318 | 319 | 
export async function scrapeRepositoryToJson(repoUrl: string): Promise { 320 | const { clonePath, cleanup } = await prepareCloneWorkspace(repoUrl); 321 | 322 | try { 323 | await cloneRepository(repoUrl, clonePath); 324 | return await scrapeDirectoryToJson(clonePath, clonePath, IGNORED_SEGMENTS); 325 | } finally { 326 | await cleanup(); 327 | } 328 | } 329 | 330 | export async function scrapeRepositoryToPlainText(repoUrl: string): Promise { 331 | return scrapeRepositoryToTranscript(repoUrl); 332 | } 333 | 334 | export async function scrapeRepositoryToTranscript( 335 | repoUrl: string, 336 | options: TranscriptFormatOptions = {} 337 | ): Promise { 338 | const { clonePath, cleanup } = await prepareCloneWorkspace(repoUrl); 339 | 340 | try { 341 | await cloneRepository(repoUrl, clonePath); 342 | return await generateTranscript(clonePath, clonePath, IGNORED_SEGMENTS, { 343 | includeMeta: options.includeMeta ?? false 344 | }); 345 | } finally { 346 | await cleanup(); 347 | } 348 | } 349 | 350 | export async function scrapeRepositoryToToon( 351 | repoUrl: string, 352 | options?: EncodeOptions 353 | ): Promise { 354 | const { toon } = await scrapeRepositoryToToonWithTokenCount(repoUrl, options); 355 | return toon; 356 | } 357 | 358 | export interface ToonScrapeResult { 359 | toon: string; 360 | tokenCount: number; 361 | } 362 | 363 | export async function scrapeRepositoryToToonWithTokenCount( 364 | repoUrl: string, 365 | encodeOptions?: EncodeOptions, 366 | tokenOptions?: TokenCountOptions 367 | ): Promise { 368 | const files = await scrapeRepositoryToJson(repoUrl); 369 | const toon = encode({ files }, encodeOptions); 370 | return { 371 | toon, 372 | tokenCount: countTokens(toon, tokenOptions) 373 | }; 374 | } 375 | 376 | export interface JsonScrapeResult { 377 | files: FileData[]; 378 | json: string; 379 | tokenCount: number; 380 | } 381 | 382 | export async function scrapeRepositoryToJsonWithTokenCount( 383 | repoUrl: string, 384 | indent = 2, 385 | tokenOptions?: 
TokenCountOptions 386 | ): Promise { 387 | const files = await scrapeRepositoryToJson(repoUrl); 388 | const json = JSON.stringify(files, null, indent); 389 | return { 390 | files, 391 | json, 392 | tokenCount: countTokens(json, tokenOptions) 393 | }; 394 | } 395 | 396 | export interface TranscriptScrapeResult { 397 | text: string; 398 | tokenCount: number; 399 | } 400 | 401 | export type PlainTextScrapeResult = TranscriptScrapeResult; 402 | 403 | export async function scrapeRepositoryToPlainTextWithTokenCount( 404 | repoUrl: string, 405 | tokenOptions?: TokenCountOptions, 406 | transcriptOptions?: TranscriptFormatOptions 407 | ): Promise { 408 | const text = await scrapeRepositoryToTranscript(repoUrl, transcriptOptions); 409 | return { 410 | text, 411 | tokenCount: countTokens(text, tokenOptions) 412 | }; 413 | } 414 | --------------------------------------------------------------------------------