├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── gitin ├── __init__.py ├── __version__.py └── gitin.py ├── repo_code.md ├── requirements.txt ├── setup.py └── test_output.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock 
in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | todo.md -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [0.1.0] - 2024-12-31 9 | 10 | ### Added 11 | - Initial release 12 | - Command-line interface for extracting GitHub repository content 13 | - Progress bars for directory scanning and file processing 14 | - Support for file pattern inclusion/exclusion 15 | - Content search functionality 16 | - Size limit filtering 17 | - Markdown output formatting 18 | - Version command 19 | - Summary statistics after processing 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 unclecode 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | include gitin/__version__.py 5 | recursive-include gitin *.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gitin 2 | 3 | [![PyPI version](https://badge.fury.io/py/gitin.svg)](https://badge.fury.io/py/gitin) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | [![Python Versions](https://img.shields.io/pypi/pyversions/gitin.svg)](https://pypi.org/project/gitin/) 6 | 7 | 👋 Hi there! I'm [unclecode](https://x.com/unclecode), the author of [Crawl4AI](https://github.com/unclecode/crawl4ai) - a No. 1 trending GitHub repository that crawls the web in an LLM-friendly way. While working with LLMs like Claude and GPT, I often need to provide codebase context efficiently. That's why I created `gitin` - a simple yet powerful tool that helps you extract and format GitHub repository content for LLM consumption. 8 | 9 | ## Why gitin? 10 | 11 | When chatting with AI models about code, providing the right context is crucial. `gitin` helps you: 12 | - Extract relevant code files from any GitHub repository 13 | - Format them into a clean, token-efficient markdown file 14 | - Filter files by type, size, and content 15 | - Get token estimates for LLM context windows 16 | 17 | ## Installation 18 | 19 | ```bash 20 | pip install gitin 21 | ``` 22 | 23 | ## Quick Start 24 | 25 | Basic usage - get all Python files from a repository: 26 | ```bash 27 | gitin https://github.com/unclecode/crawl4ai -o output.md --include="*.py" 28 | ``` 29 | 30 | ## Examples 31 | 32 | ### 1. 
Basic Repository Extraction 33 | Extract Python files from Crawl4AI, excluding tests: 34 | ```bash 35 | gitin https://github.com/unclecode/crawl4ai \ 36 | --include="*.py" \ 37 | --exclude="tests/*" \ 38 | -o basic_example.md 39 | ``` 40 | 41 | ### 2. Search for Specific Content 42 | Find files containing async functions: 43 | ```bash 44 | gitin https://github.com/unclecode/crawl4ai \ 45 | --include="*.py" \ 46 | --search="async def" \ 47 | -o async_functions.md 48 | ``` 49 | 50 | ### 3. Multiple File Types with Size Limit 51 | Get both Python and Markdown files under 5KB: 52 | ```bash 53 | gitin https://github.com/unclecode/crawl4ai \ 54 | --include="*.py,*.md" \ 55 | --exclude="tests/*,docs/*" \ 56 | --max-size=5000 \ 57 | -o small_files.md 58 | ``` 59 | 60 | ### 4. Documentation Files Only 61 | Extract markdown files for documentation: 62 | ```bash 63 | gitin https://github.com/unclecode/crawl4ai \ 64 | --include="docs/**/*.md" \ 65 | -o documentation.md 66 | ``` 67 | 68 | ## Output Format 69 | 70 | The tool generates a clean markdown file with: 71 | - Repository structure 72 | - File contents with syntax highlighting 73 | - Clear separators between files 74 | - Token count estimation for LLMs 75 | 76 | ## Command-Line Options 77 | 78 | ``` 79 | Options: 80 | --version Show the version and exit 81 | --exclude TEXT Comma-separated glob patterns to exclude 82 | Example: --exclude="test_*,*.tmp,docs/*" 83 | --include TEXT Comma-separated glob patterns to include 84 | Example: --include="*.py,src/*.js,lib/*.rb" 85 | --search TEXT Comma-separated strings to search in file contents 86 | Example: --search="TODO,FIXME,HACK" 87 | --max-size INTEGER Maximum file size in bytes (default: 1MB) 88 | -o, --output TEXT Output markdown file path [required] 89 | --help Show this message and exit 90 | ``` 91 | 92 | ## Use with LLMs 93 | 94 | When using the output with AI models: 95 | 96 | 1. 
Generate the markdown file: 97 | ```bash 98 | gitin https://github.com/your/repo -o context.md --include="*.py" 99 | ``` 100 | 101 | 2. Copy the content to your conversation with the AI model 102 | 103 | 3. The AI model will now have context about your codebase and can help with: 104 | - Code review 105 | - Bug fixing 106 | - Feature implementation 107 | - Documentation 108 | - Refactoring suggestions 109 | 110 | ## Pro Tips 111 | 112 | 1. **Token Efficiency**: Use `--max-size` to limit file sizes and stay within context windows 113 | 2. **Relevant Context**: Use `--search` to find specific code patterns or TODO comments 114 | 3. **Multiple Patterns**: Combine patterns with commas: `--include="*.py,*.js,*.md"` 115 | 4. **Exclude Tests**: Use `--exclude="tests/*,*_test.py"` to focus on main code 116 | 5. **Documentation**: Include only docs with `--include="docs/**/*.md"` 117 | 118 | ## About the Author 119 | 120 | I'm unclecode, and I love building tools that make AI development easier. Check out my other project [Crawl4AI](https://github.com/unclecode/crawl4ai) and follow me on X [@unclecode](https://x.com/unclecode). 121 | 122 | ## Contributing 123 | 124 | Contributions are welcome! Feel free to: 125 | - Report bugs 126 | - Suggest features 127 | - Submit pull requests 128 | 129 | I'm extremely busy with Crawl4ai, so I may not be able to check this repository frequently. However, feel free to send your pull request, and I will try to approve it. 130 | 131 | ## License 132 | 133 | MIT License - feel free to use in your projects! 134 | 135 | ## Changelog 136 | 137 | See [CHANGELOG.md](CHANGELOG.md) for release history. 
#!/usr/bin/env python3
"""Command-line tool that flattens a Git repository into one markdown file.

The repository is shallow-cloned into a temporary directory, its files are
filtered by glob pattern, size and (optionally) content, and every selected
file is appended to a single markdown document inside a fenced code block.
"""

import os
import re
import subprocess
from pathlib import Path
from typing import Iterator, List, Optional, Tuple
import click
from fnmatch import fnmatch
import tempfile
import shutil
from tqdm import tqdm
from .__version__ import __version__

# NOTE: the lone "\b" lines are click's "do not re-wrap this paragraph"
# marker. Without them click joins each multi-line example into a single
# wrapped paragraph and the shell line continuations become unreadable.
HELP_TEXT = """GitHub Repository Content Extractor

Extracts and formats repository content, optimized for use with Large Language Models.
Creates a single markdown file with clear separators between files.

Examples:

\b
    # Basic usage - get all Python files
    gitin https://github.com/user/repo -o output.md --include="*.py"

\b
    # Multiple file patterns with content search
    gitin https://github.com/user/repo \\
        --include="*.py,*.js" \\
        --search="TODO,FIXME" \\
        --exclude="test_*" \\
        -o code_review.md

\b
    # Extract only specific file types with size limit
    gitin https://github.com/user/repo \\
        --include="src/*.py" \\
        --max-size=100000 \\
        -o small_py_files.md
"""


def _split_csv(value: str) -> List[str]:
    """Split a comma-separated option value, dropping blanks and padding."""
    return [item.strip() for item in value.split(',') if item.strip()]


@click.command(help=HELP_TEXT)
@click.version_option(version=__version__)
@click.argument('github_url')
@click.option('--exclude', default='',
              help='Comma-separated glob patterns to exclude. '
                   'Example: --exclude="test_*,*.tmp,docs/*"')
@click.option('--include', default='',
              help='Comma-separated glob patterns to include. '
                   'Example: --include="*.py,src/*.js,lib/*.rb"')
@click.option('--search', default='',
              help='Comma-separated strings to search in file contents. '
                   'Only files containing at least one of these strings '
                   'will be included. Example: --search="TODO,FIXME,HACK"')
@click.option('--max-size', default=1000000,
              help='Maximum file size in bytes to process. Files larger '
                   'than this will be skipped. Default: 1MB')
@click.option('-o', '--output', required=True,
              help="Output markdown file path")
def main(github_url: str, exclude: str, include: str, search: str,
         max_size: int, output: str):
    """Extract and format repository content.

    Args:
        github_url: URL (or path) handed to ``git clone``.
        exclude: Comma-separated glob patterns; matching files are skipped.
        include: Comma-separated glob patterns; when non-empty, only
            matching files are kept.
        search: Comma-separated strings; when non-empty, only files that
            contain at least one of them (case-insensitively) are kept.
        max_size: Files larger than this many bytes are skipped.
        output: Path of the markdown file to write.

    Raises:
        SystemExit: With status 1 when the repository cannot be cloned, so
            that shell scripts can detect the failure.
    """
    exclude_patterns = _split_csv(exclude)
    include_patterns = _split_csv(include)
    search_terms = _split_csv(search)

    with tempfile.TemporaryDirectory() as temp_dir:
        repo_dir = clone_repository(github_url, temp_dir)
        if not repo_dir:
            # clone_repository has already reported the reason on stderr.
            raise SystemExit(1)

        process_repository(repo_dir, output, exclude_patterns,
                           include_patterns, search_terms, max_size)


def clone_repository(github_url: str, temp_dir: str) -> Optional[str]:
    """Shallow-clone ``github_url`` into ``temp_dir``.

    Args:
        github_url: Repository location passed to ``git clone``.
        temp_dir: Existing empty directory that receives the working tree.

    Returns:
        ``temp_dir`` on success, or ``None`` when the clone failed or the
        ``git`` executable could not be started (an error message is written
        to stderr in both cases).
    """
    try:
        # "--" ends option parsing so a URL beginning with "-" can never be
        # interpreted by git as a command-line option.
        subprocess.run(
            ['git', 'clone', '--depth=1', '--', github_url, temp_dir],
            check=True, capture_output=True)
    except FileNotFoundError:
        click.echo("Error cloning repository: 'git' executable not found",
                   err=True)
        return None
    except subprocess.CalledProcessError as e:
        # git's stderr is not guaranteed to be valid UTF-8.
        message = e.stderr.decode(errors='replace')
        click.echo(f"Error cloning repository: {message}", err=True)
        return None
    return temp_dir


def _walk_repo(repo_dir: str) -> Iterator[Tuple[str, List[str]]]:
    """Yield ``(directory, sorted file names)`` pairs, never entering ``.git``.

    Only directories named exactly ``.git`` are pruned, so ``.github`` and
    similar paths are still visited. Names are sorted to make the output
    independent of filesystem enumeration order.
    """
    for root, dirs, files in os.walk(repo_dir):
        # In-place assignment is what tells os.walk not to descend.
        dirs[:] = sorted(d for d in dirs if d != '.git')
        yield root, sorted(files)


def _is_selected(rel_path: str, exclude_patterns: List[str],
                 include_patterns: List[str]) -> bool:
    """Return True when ``rel_path`` passes the exclude/include globs."""
    if any(fnmatch(rel_path, pat) for pat in exclude_patterns):
        return False
    if include_patterns and not any(fnmatch(rel_path, pat)
                                    for pat in include_patterns):
        return False
    return True


def _is_inside(path: str, base_real: str) -> bool:
    """Return True when ``path`` resolves to a location under ``base_real``."""
    real = os.path.realpath(path)
    return real == base_real or real.startswith(base_real + os.sep)


def _read_text(file_path: str, max_size: int) -> Optional[str]:
    """Return the file's text, or ``None`` when it exceeds ``max_size`` bytes.

    Undecodable bytes are dropped rather than raising. ``OSError`` (for
    example a dangling symlink) propagates to the caller.
    """
    if os.path.getsize(file_path) > max_size:
        return None
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as cf:
        return cf.read()


def _code_fence(content: str) -> str:
    """Return a backtick fence longer than any backtick run in ``content``.

    A fixed three-backtick fence would be terminated early by files that
    themselves contain fenced code (nearly every README).
    """
    longest = max((len(run) for run in re.findall(r'`+', content)), default=0)
    return '`' * max(3, longest + 1)


def process_repository(repo_dir: str, output_file: str,
                       exclude_patterns: List[str], include_patterns: List[str],
                       search_terms: List[str], max_size: int):
    """Write every selected file of ``repo_dir`` into one markdown document.

    Args:
        repo_dir: Root of the checked-out repository.
        output_file: Markdown file to create or overwrite (written as UTF-8).
        exclude_patterns: Globs matched against the repo-relative path;
            matching files are skipped.
        include_patterns: When non-empty, only files matching one of these
            globs are kept.
        search_terms: When non-empty, only files containing at least one
            term (case-insensitive) are kept.
        max_size: Files larger than this many bytes are skipped.

    Files that cannot be read are reported on stderr and skipped. Symlinks
    resolving outside ``repo_dir`` are skipped so that an untrusted
    repository cannot pull arbitrary local files into the output. A summary
    is printed once all directories have been processed.
    """
    processed_files = 0
    total_chars = 0
    lowered_terms = [term.lower() for term in search_terms]
    repo_real = os.path.realpath(repo_dir)

    # Pre-count directories so the outer progress bar has a total.
    total_dirs = sum(1 for _ in _walk_repo(repo_dir))

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Repository Content\n\n")

        with tqdm(total=total_dirs, desc="Scanning directories",
                  unit="dir") as pbar:
            for root, files in _walk_repo(repo_dir):
                with tqdm(files,
                          desc=f"Processing {os.path.basename(root)}",
                          leave=False, unit="file") as files_pbar:
                    for file in files_pbar:
                        file_path = os.path.join(root, file)
                        rel_path = os.path.relpath(file_path, repo_dir)
                        files_pbar.set_description(f"Processing {rel_path}")

                        if not _is_selected(rel_path, exclude_patterns,
                                            include_patterns):
                            continue
                        if not _is_inside(file_path, repo_real):
                            continue

                        # Read once; the same text serves search and output.
                        try:
                            content = _read_text(file_path, max_size)
                        except OSError as e:
                            click.echo(
                                f"Error processing {rel_path}: {str(e)}",
                                err=True)
                            continue
                        if content is None:
                            continue

                        if lowered_terms:
                            haystack = content.lower()
                            if not any(term in haystack
                                       for term in lowered_terms):
                                continue

                        fence = _code_fence(content)
                        f.write(f"\n## {rel_path}\n")
                        f.write(f"{fence}\n")
                        f.write(content)
                        f.write(f"\n{fence}\n")
                        processed_files += 1
                        total_chars += len(content)

                pbar.update(1)

    # Print summary statistics
    click.echo("\nSUMMARY:")
    click.echo(f"Files processed: {processed_files}")
    click.echo(f"Total characters: {total_chars}")
    click.echo(f"Estimated tokens: {total_chars // 4}")  # Rough estimate of tokens
    click.echo(f"Output written to: {output_file}")


if __name__ == '__main__':
    main()
Independent", 30 | "Programming Language :: Python :: 3", 31 | "Programming Language :: Python :: 3.6", 32 | "Programming Language :: Python :: 3.7", 33 | "Programming Language :: Python :: 3.8", 34 | "Programming Language :: Python :: 3.9", 35 | "Programming Language :: Python :: 3.10", 36 | "Topic :: Software Development :: Libraries :: Python Modules", 37 | "Topic :: Software Development :: Version Control :: Git", 38 | "Topic :: Text Processing :: Markup :: Markdown", 39 | ], 40 | keywords="github, llm, content-extraction, markdown, repository-analysis", 41 | python_requires=">=3.6", 42 | install_requires=[ 43 | "click>=8.1.7", 44 | "requests>=2.31.0", 45 | "tqdm>=4.66.1", 46 | ], 47 | entry_points={ 48 | "console_scripts": [ 49 | "gitin=gitin.gitin:main", 50 | ], 51 | }, 52 | ) 53 | --------------------------------------------------------------------------------