├── .ccignore
├── .github
    └── workflows
    │   ├── publish.yml
    │   ├── run-static.yml
    │   └── run-tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .python-version
├── AGENTS.md
├── README.md
├── pyproject.toml
├── src
    └── copychat
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── core.py
    │   ├── format.py
    │   ├── patterns.py
    │   └── sources.py
├── tests
    ├── conftest.py
    ├── data
    │   ├── test1.txt
    │   └── test2.md
    ├── fixtures
    │   ├── .gitignore
    │   ├── __init__.py
    │   ├── config
    │   │   └── settings.yml
    │   ├── db
    │   │   └── schema.sql
    │   ├── docs
    │   │   └── README.md
    │   └── src
    │   │   ├── app.js
    │   │   ├── main.py
    │   │   ├── styles
    │   │       └── main.css
    │   │   ├── types.ts
    │   │   └── utils
    │   │       └── helpers.py
    ├── test_ccignore.py
    ├── test_cli.py
    ├── test_core.py
    ├── test_format.py
    ├── test_github_item.py
    ├── test_integration.py
    ├── test_patterns.py
    ├── test_sources.py
    └── tests
    │   └── data
    │       ├── test1.txt
    │       └── test2.md
└── uv.lock


/.ccignore:
--------------------------------------------------------------------------------
 1 | # CopyChat default ignore patterns
 2 | # This file uses the same syntax as .gitignore
 3 | # It applies to the current directory and all subdirectories
 4 | 
 5 | # Build artifacts
 6 | __pycache__/
 7 | *.py[cod]
 8 | *$py.class
 9 | *.so
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | *.egg-info/
16 | .mypy_cache/
17 | 
18 | # Editor and IDE files
19 | .idea/
20 | .vscode/
21 | *.swp
22 | *.swo
23 | *~
24 | 
25 | # Local development
26 | .env
27 | .venv
28 | env/
29 | venv/
30 | 
31 | # Dependencies
32 | node_modules/
33 | 
34 | # Project-specific
35 | # Add patterns specific to your project here 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Copychat to PyPI
 2 | on:
 3 |   release:
 4 |     types: [published]
 5 |   workflow_dispatch:
 6 | 
 7 | jobs:
 8 |   publish-pypi-release:
 9 |     runs-on: ubuntu-latest
10 |     environment: release
11 |     permissions:
12 |       contents: write
13 |       id-token: write
14 |     steps:
15 |       - name: Checkout
16 |         uses: actions/checkout@v4
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v5
19 |         with:
20 |           python-version: "3.11"
21 |           cache: pip
22 |           cache-dependency-path: "**/pyproject.toml"
23 |       - name: Install dependencies
24 |         run: |
25 |           pip install setuptools wheel build
26 |       - name: Build
27 |         run: |
28 |           python -m build
29 |       - name: Publish
30 |         uses: pypa/gh-action-pypi-publish@release/v1
31 |         with:
32 |           verbose: true
33 | 


--------------------------------------------------------------------------------
/.github/workflows/run-static.yml:
--------------------------------------------------------------------------------
 1 | name: Run static analysis
 2 | 
 3 | env:
 4 |   # enable colored output
 5 |   # https://github.com/pytest-dev/pytest/issues/7443
 6 |   PY_COLORS: 1
 7 | 
 8 | on:
 9 |   push:
10 |     branches: ["main"]
11 |     paths:
12 |       - "src/**"
13 |       - "tests/**"
14 |       - "uv.lock"
15 |       - "pyproject.toml"
16 |       - ".github/workflows/**"
17 | 
18 |   # run on all pull requests because these checks are required and will block merges otherwise
19 |   pull_request:
20 | 
21 |   workflow_dispatch:
22 | 
23 | permissions:
24 |   contents: read
25 | 
26 | jobs:
27 |   static_analysis:
28 |     timeout-minutes: 2
29 | 
30 |     runs-on: ubuntu-latest
31 | 
32 |     steps:
33 |       - uses: actions/checkout@v4
34 |       - name: Install uv
35 |         uses: astral-sh/setup-uv@v5
36 |         with:
37 |           enable-cache: true
38 |           cache-dependency-glob: "uv.lock"
39 |       - name: Set up Python
40 |         uses: actions/setup-python@v5
41 |         with:
42 |           python-version: "3.12"
43 |       - name: Install dependencies
44 |         run: uv sync --dev
45 |       - name: Run pre-commit
46 |         uses: pre-commit/action@v3.0.1
47 | 


--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
 1 | name: Run tests
 2 | 
 3 | env:
 4 |   # enable colored output
 5 |   PY_COLORS: 1
 6 | 
 7 | on:
 8 |   push:
 9 |     branches: ["main"]
10 |     paths:
11 |       - "src/**"
12 |       - "tests/**"
13 |       - "uv.lock"
14 |       - "pyproject.toml"
15 |       - ".github/workflows/**"
16 | 
17 |   # run on all pull requests because these checks are required and will block merges otherwise
18 |   pull_request:
19 | 
20 |   workflow_dispatch:
21 | 
22 | permissions:
23 |   contents: read
24 | 
25 | jobs:
26 |   run_tests:
27 |     name: "Run tests: Python ${{ matrix.python-version }} on ${{ matrix.os }}"
28 |     runs-on: ${{ matrix.os }}
29 |     strategy:
30 |       matrix:
31 |         os: [ubuntu-latest]
32 |         python-version: ["3.10", "3.12"]
33 |       fail-fast: false
34 |     timeout-minutes: 5
35 | 
36 |     steps:
37 |       - uses: actions/checkout@v4
38 | 
39 |       - name: Install uv
40 |         uses: astral-sh/setup-uv@v5
41 |         with:
42 |           enable-cache: true
43 |           cache-dependency-glob: "uv.lock"
44 |           python-version: ${{ matrix.python-version }}
45 | 
46 |       - name: Install CopyChat
47 |         run: uv sync --dev --locked
48 | 
49 |       - name: Run tests
50 |         run: uv run pytest
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual environments
24 | .env
25 | .venv
26 | env/
27 | venv/
28 | ENV/
29 | env.bak/
30 | venv.bak/
31 | 
32 | # IDE
33 | .idea/
34 | .vscode/
35 | *.swp
36 | *.swo
37 | *~
38 | .project
39 | .classpath
40 | .settings/
41 | *.sublime-workspace
42 | *.sublime-project
43 | 
44 | # OS
45 | .DS_Store
46 | .DS_Store?
47 | ._*
48 | .Spotlight-V100
49 | .Trashes
50 | ehthumbs.db
51 | Thumbs.db
52 | Desktop.ini
53 | 
54 | # Logs and databases
55 | *.log
56 | *.sqlite
57 | *.db
58 | 
59 | # Coverage and test reports
60 | htmlcov/
61 | .tox/
62 | .nox/
63 | .coverage
64 | .coverage.*
65 | .cache
66 | nosetests.xml
67 | coverage.xml
68 | *.cover
69 | *.py,cover
70 | .hypothesis/
71 | .pytest_cache/
72 | cover/
73 | 
74 | # Build and packaging
75 | *.manifest
76 | *.spec
77 | pip-log.txt
78 | pip-delete-this-directory.txt
79 | 
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 | 
83 | # Local development
84 | local_settings.py
85 | db.sqlite3
86 | db.sqlite3-journal
87 | 
88 | # Node (in case of docs or frontend components)
89 | node_modules/
90 | npm-debug.log*
91 | yarn-debug.log*
92 | yarn-error.log*
93 | .pnpm-debug.log*
94 | src/copychat/_version.py
95 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | fail_fast: true
 2 | 
 3 | repos:
 4 |   - repo: https://github.com/abravalheri/validate-pyproject
 5 |     rev: v0.23
 6 |     hooks:
 7 |       - id: validate-pyproject
 8 | 
 9 |   - repo: https://github.com/astral-sh/ruff-pre-commit
10 |     # Ruff version.
11 |     rev: v0.11.4
12 |     hooks:
13 |       # Run the linter.
14 |       - id: ruff
15 |         args: [--fix, --exit-non-zero-on-fix]
16 |       # Run the formatter.
17 |       - id: ruff-format
18 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 | 


--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
  1 | # AGENTS
  2 | 
  3 | Copychat converts project code into LLM-friendly context. This is a guide to help LLMs quickly understand and navigate the codebase. The repo is maintained by [@jlowin](https://github.com/jlowin) on GitHub[https://github.com/jlowin/copychat].
  4 | 
  5 | ## Project Overview
  6 | 
  7 | Copychat is a CLI tool that prepares source code for LLM context windows by:
  8 | 1. Scanning directories/files based on inclusion/exclusion patterns
  9 | 2. Respecting `.gitignore` and `.ccignore` patterns
 10 | 3. Formatting code with proper language tags
 11 | 4. Including git diff information when requested
 12 | 5. Estimating token counts for context planning
 13 | 
 14 | ## Repository Layout
 15 | 
 16 | * `README.md` – overview and documentation
 17 | * `src/copychat/` – CLI and library implementation
 18 | * `tests/` – pytest suite
 19 | * `pyproject.toml` – PEP-621 metadata; build is managed by **uv**
 20 | * `.github/workflows/` – CI that lints, runs tests, and publishes to PyPI
 21 | * `.ccignore` – custom ignore patterns for copychat itself
 22 | 
 23 | ## Core Components
 24 | 
 25 | * `core.py` - Main scanning functionality, git integration, and file handling
 26 |   - `scan_directory()` - Primary function for finding and processing files
 27 |   - `DiffMode` - Enum defining different git diff display modes
 28 |   - Handles `.gitignore` and `.ccignore` patterns
 29 | 
 30 | * `format.py` - Formats code for LLM consumption
 31 |   - `format_files()` - Formats file content with metadata
 32 |   - `estimate_tokens()` - Calculates approximate token usage
 33 | 
 34 | * `cli.py` - Command-line interface
 35 |   - Main entry point for user interaction
 36 |   - Parses arguments and handles output (clipboard/file)
 37 | 
 38 | * `sources.py` - Handles different source types (filesystem, GitHub)
 39 |   - `GitHubSource` - Fetches code from GitHub repositories
 40 | 
 41 | * `patterns.py` - Defines file patterns and exclusions
 42 | 
 43 | ## Key Workflows
 44 | 
 45 | 1. **Basic Usage**: `copychat` scans the current directory and copies formatted code to clipboard
 46 | 2. **Filtered Scanning**: `copychat --include py,js` only processes specified file types
 47 | 3. **Git Integration**: `copychat --diff-mode full-with-diff` shows changes with context
 48 | 4. **GitHub**: `copychat --source github:user/repo` fetches remote code
 49 | 
 50 | ## Common CLI Flags
 51 | 
 52 | * `--include py,js` - restrict scanned extensions
 53 | * `--exclude "**/*.test.js"` - exclude specific patterns
 54 | * `--diff-mode full-with-diff` - embed git diff chunks
 55 | * `--diff-branch main` - compare against specific branch
 56 | * `--source github:<org>/<repo>` - pull remote code via GitHub
 57 | * `--out file.md` - write to file instead of clipboard
 58 | * `--depth 2` - limit directory recursion depth
 59 | 
 60 | ## Data Flow
 61 | 
 62 | 1. CLI parses arguments → determines source type
 63 | 2. `scan_directory()` finds matching files → applies filters
 64 | 3. Git diff information is added if requested
 65 | 4. `format_files()` processes content → calculates tokens
 66 | 5. Formatted output is sent to clipboard/file/stdout
 67 | 
 68 | ## Common Patterns
 69 | 
 70 | - Path handling uses `pathlib.Path` throughout
 71 | - File content is processed as `(path, content)` tuples
 72 | - Git operations use subprocess to call git commands
 73 | - `.ccignore` supports hierarchical pattern inheritance
 74 | 
 75 | ## Testing
 76 | 
 77 | - `tests/test_integration.py` has examples of all major functionality
 78 | - `sample_project` fixture provides test files of various types
 79 | 
 80 | ## Development
 81 | 
 82 | ```bash
 83 | uv sync                       # install runtime + dev deps
 84 | uv run pre-commit run --all-files
 85 | uv run pytest
 86 | ```
 87 | 
 88 | ## Release Process
 89 | 
 90 | Pushing a semantic-version tag to `main` triggers the **Publish Copychat to PyPI** workflow, building and uploading the wheel.
 91 | 
 92 | ## Development Guidelines
 93 | 
 94 | 1. Fix all Ruff/Pyright warnings (`pre-commit` enforces)
 95 | 2. Respect `.ccignore` and `.gitignore` patterns
 96 | 3. Keep CLI flags backward-compatible
 97 | 4. Document changes in `CHANGELOG.md`
 98 | 
 99 | ## Extending the Tool
100 | 
101 | - Add new file types in `patterns.py`
102 | - Add new source types in `sources.py`
103 | - Add new formatting options in `format.py`
104 | - `GitHubItem` in `sources.py` fetches issues and PRs. Pass an issue/PR URL
105 |   (e.g. `owner/repo#123` or `https://github.com/owner/repo/issues/123`) directly
106 |   to the main `copychat` command.
107 | 
108 | **Note to LLMs**: When working on this repository, keep this AGENTS.md file up to date with new insights that would help future LLMs quickly understand the codebase structure and functionality. This file should serve as a quick reference that reduces the need for extensive code exploration.
109 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Copychat 📋🐈‍⬛
  2 | 
  3 | **Simple code-to-context.**
  4 | 
  5 | Copychat is a lightweight CLI tool that prepares your code for conversations with LLMs. It intelligently formats your source files into chat-ready context, handling everything from file selection to git diffs.
  6 | 
  7 | ## Features
  8 | 
  9 | - 🎯 **Smart file selection**: Automatically identifies relevant source files while respecting `.gitignore`
 10 | - 🔍 **Git-aware**: Can include diffs and focus on changed files
 11 | - 📦 **GitHub integration**: Pull directly from repositories
 12 | - 🎨 **Clean output**: Formats code with proper language tags and metadata
 13 | - 📋 **Clipboard ready**: Results go straight to your clipboard
 14 | - 🔢 **Token smart**: Estimates token count for context planning
 15 | 
 16 | ## Running Copychat
 17 | 
 18 | You can use [uv](https://docs.astral.sh/uv/) to run copychat directly from the command line, without needing to install it first:
 19 | ```bash
 20 | uvx copychat
 21 | ```
 22 | 
 23 | Frequent users may want to add the following alias to their `.zshrc` or `.bashrc`:
 24 | ```bash
 25 | alias cc="uvx copychat"
 26 | ```
 27 | 
 28 | This permits you to quickly copy context by running e.g. `cc docs/getting-started/ src/core/` from any directory, in any environment.
 29 | 
 30 | If you want to save a few milliseconds, you can install copychat globally with `uv tool install copychat` or add it to your environment with `uv add copychat`. And of course, `pip install copychat` works too.
 31 | 
 32 | ## Quick Start
 33 | Collect, format, and copy all source code in the current directory (and subdirectories) to the clipboard:
 34 | ```bash
 35 | copychat
 36 | ```
 37 | 
 38 | Copy only Python files to clipboard:
 39 | ```bash
 40 | copychat -i py
 41 | ```
 42 | 
 43 | Copy specific files, including any git diffs:
 44 | ```bash
 45 | copychat src/ tests/test_api.py --diff-mode full-with-diff
 46 | ```
 47 | 
 48 | Use GitHub as a source instead of the local filesystem:
 49 | ```bash
 50 | copychat src/ -s github:prefecthq/controlflow
 51 | ```
 52 | 
 53 | ## Usage Guide
 54 | 
 55 | Copychat is designed to be intuitive while offering powerful options for more complex needs. Let's walk through common use cases:
 56 | 
 57 | ### Basic Directory Scanning
 58 | 
 59 | At its simplest, run `copychat` in any directory to scan and format all recognized source files:
 60 | 
 61 | ```bash
 62 | copychat
 63 | ```
 64 | 
 65 | This will scan the current directory, format all supported files, and copy the result to your clipboard. The output includes metadata like character and token counts to help you stay within LLM context limits.
 66 | 
 67 | ### Targeting Specific Files
 68 | 
 69 | You can specify exactly what you want to include:
 70 | 
 71 | ```bash
 72 | # Single file
 73 | copychat src/main.py
 74 | 
 75 | # Multiple specific files and directories
 76 | copychat src/api.py tests/test_api.py docs/
 77 | 
 78 | # Glob patterns
 79 | copychat src/*.py tests/**/*.md
 80 | ```
 81 | 
 82 | ### Filtering by Language
 83 | 
 84 | When you only want specific file types, use the `--include` flag with comma-separated extensions:
 85 | 
 86 | ```bash
 87 | # Just Python files
 88 | copychat --include py
 89 | 
 90 | # Python and JavaScript
 91 | copychat --include py,js,jsx
 92 | ```
 93 | 
 94 | ### Working with Git
 95 | 
 96 | Copychat shines when working with git repositories. Use different diff modes to focus on what matters:
 97 | 
 98 | ```bash
 99 | # Show only files that have changed, with their diffs
100 | copychat --diff-mode changed-with-diff
101 | 
102 | # Show all files, but include diffs for changed ones
103 | copychat --diff-mode full-with-diff
104 | 
105 | # Show only the git diff chunks themselves
106 | copychat --diff-mode diff-only
107 | 
108 | # See what changed since branching from develop
109 | copychat --diff-mode diff-only --diff-branch develop
110 | ```
111 | 
112 | The `-diff-mode` and `--diff-branch` options are particularly useful when you want to:
113 | - Review any changes you've made, either in isolation or in context
114 | - Compare changes against a specific branch
115 | 
116 | ### Excluding Files
117 | 
118 | You can exclude files that match certain patterns:
119 | 
120 | ```bash
121 | # Skip test files
122 | copychat --exclude "**/*.test.js,**/*.spec.py"
123 | 
124 | # Skip specific directories
125 | copychat --exclude "build/*,dist/*"
126 | ```
127 | 
128 | Copychat automatically respects your `.gitignore` file and common ignore patterns (like `node_modules`).
129 | 
130 | ### GitHub Integration
131 | 
132 | #### Reading GitHub Repositories
133 | 
134 | Pull directly from GitHub repositories:
135 | 
136 | ```bash
137 | # Using the github: prefix
138 | copychat --source github:username/repo
139 | 
140 | # Or just paste a GitHub URL
141 | copychat --source https://github.com/username/repo
142 | 
143 | # Process specific paths within the repository
144 | copychat --source github:username/repo src/main.py tests/
145 | ```
146 | 
147 | The `--source` flag specifies where to look (GitHub, filesystem, etc.), and then any additional arguments specify which paths within that source to process. This means you can target specific files or directories within a GitHub repository just like you would with local files.
148 | 
149 | #### Reading GitHub Issues, PRs & Discussions
150 | 
151 | Copy the full text and comment history of a GitHub issue, pull request, or discussion by
152 | passing the identifier directly to the main command:
153 | 
154 | ```bash
155 | # Issues and PRs
156 | copychat owner/repo#123
157 | copychat https://github.com/owner/repo/issues/123
158 | copychat https://github.com/owner/repo/pull/456
159 | 
160 | # Discussions  
161 | copychat https://github.com/owner/repo/discussions/789
162 | ```
163 | 
164 | For pull requests, the diff is included by default, giving you complete context of the proposed changes.
165 | 
166 | Set `GITHUB_TOKEN` or use `--token` if you need to access private content or want higher rate limits.
167 | 
168 | #### Reading Individual GitHub Files
169 | 
170 | You can fetch individual files directly from GitHub without cloning the entire repository by using blob URLs:
171 | 
172 | ```bash
173 | # Fetch a specific file from a commit/branch/tag
174 | copychat https://github.com/owner/repo/blob/main/src/api.py
175 | copychat https://github.com/owner/repo/blob/v1.2.3/config/settings.yaml
176 | copychat https://github.com/owner/repo/blob/abc123def/docs/README.md
177 | ```
178 | 
179 | This is perfect for quickly grabbing specific files for context without the overhead of repository cloning.
180 | 
181 | The output is formatted like other files, with XML-style tags and proper language detection.
182 | 
183 | ### Output Options
184 | 
185 | By default, Copychat copies to your clipboard, but you have other options:
186 | 
187 | ```bash
188 | # Append to clipboard
189 | copychat --append
190 | 
191 | # Write to a file
192 | copychat --out context.md
193 | 
194 | # Append to existing file
195 | copychat --out context.md --append
196 | 
197 | # Print to screen
198 | copychat --print
199 | 
200 | # Both copy to clipboard and save to file
201 | copychat --out context.md
202 | ```
203 | 
204 | ### Verbose Output
205 | 
206 | Use the `--verbose` flag (or `-v`) to include detailed file information in the output, including token counts:
207 | 
208 | ```bash
209 | copychat -v
210 | ```
211 | 
212 | ### Limiting Directory Depth
213 | 
214 | Control how deep copychat scans subdirectories:
215 | 
216 | ```bash
217 | # Only files in current directory
218 | copychat --depth 0
219 | 
220 | # Current directory and immediate subdirectories only
221 | copychat --depth 1
222 | 
223 | # Scan up to 3 levels deep
224 | copychat --depth 3
225 | ```
226 | 
227 | ## Options
228 | 
229 | ```bash
230 | copychat [OPTIONS] [PATHS]...
231 | 
232 | Options:
233 |   -s, --source TEXT     Source to scan (filesystem path, github:owner/repo, or URL)
234 |   -o, --out PATH        Write output to file
235 |   -a, --append          Append output instead of overwriting
236 |   -p, --print          Print output to screen
237 |   -v, --verbose         Show detailed file information in output
238 |   -i, --include TEXT    Extensions to include (comma-separated, e.g. 'py,js,ts')
239 |   -x, --exclude TEXT    Glob patterns to exclude
240 |   -d, --depth INTEGER   Maximum directory depth to scan (0 = current dir only)
241 |   --diff-mode TEXT     How to handle git diffs
242 |   --diff-branch TEXT Compare changes against specified branch
243 |   --debug              Debug mode for development
244 |   --help               Show this message and exit
245 | ```
246 | 
247 | ## Supported File Types
248 | 
249 | Copychat automatically recognizes and properly formats many common file types, including:
250 | 
251 | - Python (`.py`, `.pyi`)
252 | - JavaScript/TypeScript (`.js`, `.ts`, `.jsx`, `.tsx`)
253 | - Web (`.html`, `.css`, `.scss`)
254 | - Systems (`.c`, `.cpp`, `.rs`, `.go`)
255 | - Config (`.yaml`, `.toml`, `.json`)
256 | - Documentation (`.md`, `.rst`, `.txt`)
257 | - And [many more](https://github.com/username/copychat/blob/main/copychat/patterns.py)
258 | 
259 | ## Output Format
260 | 
261 | Copychat generates clean, structured output with:
262 | - File paths and language tags
263 | - Token count estimates
264 | - Git diff information (when requested)
265 | - Proper syntax highlighting markers
266 | 
267 | ## Using `.ccignore` Files
268 | 
269 | CopyChat supports hierarchical ignore patterns through `.ccignore` files. These files work similarly to `.gitignore` files but with an important difference: they apply to all directories and subdirectories where they're located.
270 | 
271 | ### Key Features
272 | 
273 | - `.ccignore` files use the same syntax as `.gitignore` files
274 | - Each `.ccignore` file applies to its directory and all subdirectories
275 | - Patterns from multiple `.ccignore` files are inherited, with more specific directories taking precedence
276 | 
277 | ### Example
278 | 
279 | ```
280 | project/
281 | ├── .ccignore        # Contains "*.log" - excludes log files in all directories
282 | ├── src/
283 | │   ├── .ccignore    # Contains "*.tmp" - excludes tmp files in src/ and below
284 | │   └── ...
285 | └── tests/
286 |     ├── .ccignore    # Contains "*.fixture" - excludes fixture files in tests/ and below
287 |     └── ...
288 | ```
289 | 
290 | In this example:
291 | - `*.log` files are excluded everywhere
292 | - `*.tmp` files are only excluded in `src/` and its subdirectories
293 | - `*.fixture` files are only excluded in `tests/` and its subdirectories
294 | 
295 | ### Creating a `.ccignore` File
296 | 
297 | Create a `.ccignore` file in your project root or any subdirectory:
298 | 
299 | ```
300 | # Comment lines start with #
301 | # Blank lines are ignored
302 | 
303 | # Ignore all files with .log extension
304 | *.log
305 | 
306 | # Ignore specific files
307 | secrets.json
308 | credentials.yaml
309 | 
310 | # Ignore directories
311 | node_modules/
312 | __pycache__/
313 | ```
314 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "copychat"
 3 | dynamic = ["version"]
 4 | description = "Easily copy code for LLM context"
 5 | authors = []
 6 | dependencies = [
 7 |     "typer>=0.9.0",
 8 |     "rich>=13.7.0",
 9 |     "pyperclip>=1.8.2",
10 |     "pathspec>=0.12.1",
11 |     "tiktoken>=0.6.0",
12 |     "gitpython>=3.1.42",
13 | ]
14 | requires-python = ">=3.9"
15 | readme = "README.md"
16 | license = { text = "MIT" }
17 | 
18 | [project.scripts]
19 | copychat = "copychat.cli:app"
20 | 
21 | [build-system]
22 | requires = ["setuptools>=64", "setuptools_scm>=8"]
23 | build-backend = "setuptools.build_meta"
24 | 
25 | [tool.setuptools_scm]
26 | version_file = "src/copychat/_version.py"
27 | 
28 | [tool.ruff]
29 | line-length = 88
30 | target-version = "py39"
31 | 
32 | [tool.pytest.ini_options]
33 | testpaths = ["tests"]
34 | addopts = "-v --tb=short"
35 | 
36 | [dependency-groups]
37 | dev = [
38 |     "pre-commit>=4.2.0",
39 |     "pytest>=8.3.3",
40 |  "pytest-asyncio>=0.24.0",
41 |  "pytest-cov>=5.0.0",
42 | ]
43 | 


--------------------------------------------------------------------------------
/src/copychat/__init__.py:
--------------------------------------------------------------------------------
 1 | """Convert source code directories into markdown for LLM context."""
 2 | 
 3 | 
 4 | # --- Version ---
 5 | 
 6 | try:
 7 |     from ._version import version as __version__  # type: ignore
 8 | except ImportError:
 9 |     __version__ = "unknown"
10 | 


--------------------------------------------------------------------------------
/src/copychat/cli.py:
--------------------------------------------------------------------------------
  1 | import typer
  2 | from pathlib import Path
  3 | from typing import Optional, List
  4 | from rich.console import Console
  5 | import pyperclip
  6 | from enum import Enum
  7 | from importlib.metadata import version as get_version
  8 | import atexit
  9 | import shutil
 10 | 
 11 | from .core import (
 12 |     scan_directory,
 13 |     DiffMode,
 14 |     get_file_content,
 15 | )
 16 | from .format import (
 17 |     format_files as format_files_xml,
 18 |     create_display_header,
 19 | )
 20 | from .sources import GitHubSource, GitHubItem, GitHubFile
 21 | 
 22 | 
 23 | # Register cleanup of temporary GitHub directory
 24 | def _cleanup_github_temp():
 25 |     from .sources import _github_temp_dir
 26 | 
 27 |     if _github_temp_dir is not None and _github_temp_dir.exists():
 28 |         try:
 29 |             shutil.rmtree(_github_temp_dir)
 30 |         except Exception:
 31 |             pass  # Ignore cleanup errors
 32 | 
 33 | 
 34 | atexit.register(_cleanup_github_temp)
 35 | 
 36 | 
 37 | class SourceType(Enum):
 38 |     """Type of source to scan."""
 39 | 
 40 |     FILESYSTEM = "filesystem"  # Default
 41 |     GITHUB = "github"
 42 |     WEB = "web"  # For future use
 43 | 
 44 | 
 45 | def parse_source(source: str) -> tuple[SourceType, str]:
 46 |     """Parse source string into type and location."""
 47 |     import re
 48 | 
 49 |     if source.startswith(("github:", "gh:")):
 50 |         return SourceType.GITHUB, source.split(":", 1)[1]
 51 | 
 52 |     # Handle GitHub URLs with issues/pulls/discussions
 53 |     if source and source.startswith(("http://", "https://")) and "github.com" in source:
 54 |         pr_issue_match = re.search(
 55 |             r"github\.com/([^/]+/[^/]+)/(?:issues|pull|discussions)/([0-9]+)", source
 56 |         )
 57 |         if pr_issue_match:
 58 |             # This is a PR, issue, or discussion URL, keep it as FILESYSTEM type so it's processed directly
 59 |             return SourceType.FILESYSTEM, source
 60 | 
 61 |         # Handle GitHub blob URLs (individual files)
 62 |         blob_match = re.search(r"github\.com/([^/]+/[^/]+)/blob/([^/]+)/(.*)", source)
 63 |         if blob_match:
 64 |             # This is a GitHub blob URL, keep it as FILESYSTEM type so it's processed directly
 65 |             return SourceType.FILESYSTEM, source
 66 | 
 67 |     # Regular GitHub repo URL
 68 |     if source and "github.com" in source:
 69 |         parts = source.split("github.com/", 1)
 70 |         if len(parts) == 2:
 71 |             return SourceType.GITHUB, parts[1]
 72 | 
 73 |     if source and source.startswith(("http://", "https://")):
 74 |         return SourceType.WEB, source
 75 | 
 76 |     return SourceType.FILESYSTEM, source
 77 | 
 78 | 
 79 | def parse_github_item(item: str) -> tuple[str, int, str]:
 80 |     """Parse issue, PR, or discussion identifier into repo, number, and type."""
 81 |     import re
 82 | 
 83 |     if item.startswith("http://") or item.startswith("https://"):
 84 |         m = re.search(
 85 |             r"github\.com/([^/]+/[^/]+)/(issues|pull|discussions)/([0-9]+)", item
 86 |         )
 87 |         if not m:
 88 |             raise typer.BadParameter("Invalid GitHub URL")
 89 |         return (
 90 |             m.group(1),
 91 |             int(m.group(3)),
 92 |             m.group(2).rstrip("s"),
 93 |         )  # Remove 's' from 'issues' -> 'issue'
 94 | 
 95 |     if "#" in item:
 96 |         repo, num = item.split("#", 1)
 97 |         return (
 98 |             repo.strip(),
 99 |             int(num),
100 |             "issue",
101 |         )  # Default to issue for backward compatibility
102 | 
103 |     raise typer.BadParameter("Item must be in owner/repo#number format or URL")
104 | 
105 | 
106 | def parse_github_blob(item: str) -> tuple[str, str, str]:
107 |     """Parse GitHub blob URL into repo, ref, and file path."""
108 |     import re
109 | 
110 |     match = re.search(r"github\.com/([^/]+/[^/]+)/blob/([^/]+)/(.*)", item)
111 |     if not match:
112 |         raise typer.BadParameter("Invalid GitHub blob URL")
113 | 
114 |     return match.group(1), match.group(2), match.group(3)  # repo, ref, file_path
115 | 
116 | 
117 | def diff_mode_callback(value: str) -> DiffMode:
118 |     """Convert string value to DiffMode enum."""
119 |     try:
120 |         if isinstance(value, DiffMode):
121 |             return value
122 |         return DiffMode(value)
123 |     except ValueError:
124 |         valid_values = [mode.value for mode in DiffMode]
125 |         raise typer.BadParameter(f"Must be one of: {', '.join(valid_values)}")
126 | 
127 | 
128 | app = typer.Typer(
129 |     no_args_is_help=True,  # Show help when no args provided
130 |     add_completion=False,  # Disable shell completion for simplicity
131 | )
132 | console = Console()
133 | error_console = Console(stderr=True)
134 | 
135 | 
136 | @app.command()
137 | def main(
138 |     paths: list[str] = typer.Argument(
139 |         None,
140 |         help="Paths to process within the source (defaults to current directory)",
141 |     ),
142 |     version: bool = typer.Option(
143 |         None,
144 |         "--version",
145 |         help="Show version and exit.",
146 |         is_eager=True,
147 |     ),
148 |     source: Optional[str] = typer.Option(
149 |         None,
150 |         "--source",
151 |         "-s",
152 |         help="Source to scan (filesystem path, github:owner/repo, or URL)",
153 |     ),
154 |     outfile: Optional[Path] = typer.Option(
155 |         None,
156 |         "--out",
157 |         "-o",
158 |         help="Write output to file. If provided, output will not be copied to clipboard.",
159 |     ),
160 |     append: bool = typer.Option(
161 |         False,
162 |         "--append",
163 |         "-a",
164 |         help="Append output instead of overwriting",
165 |     ),
166 |     print_output: bool = typer.Option(
167 |         False,
168 |         "--print",
169 |         "-p",
170 |         help="Print output to screen",
171 |     ),
172 |     verbose: bool = typer.Option(
173 |         False,
174 |         "--verbose",
175 |         "-v",
176 |         help="Show detailed file information in output",
177 |     ),
178 |     include: Optional[str] = typer.Option(
179 |         None,
180 |         "--include",
181 |         "-i",
182 |         help="Extensions to include (comma-separated, e.g. 'py,js,ts')",
183 |     ),
184 |     exclude: Optional[List[str]] = typer.Option(
185 |         None,
186 |         "--exclude",
187 |         "-x",
188 |         help="Glob patterns to exclude",
189 |     ),
190 |     diff_mode: str = typer.Option(
191 |         "full",  # Pass the string value instead of enum
192 |         "--diff-mode",
193 |         help="How to handle git diffs",
194 |         callback=diff_mode_callback,
195 |     ),
196 |     depth: Optional[int] = typer.Option(
197 |         None,
198 |         "--depth",
199 |         "-d",
200 |         help="Maximum directory depth to scan (0 = current dir only)",
201 |     ),
202 |     debug: bool = typer.Option(
203 |         False,
204 |         "--debug",
205 |         help="Debug mode for development",
206 |     ),
207 |     compare_branch: Optional[str] = typer.Option(
208 |         None,
209 |         "--diff-branch",
210 |         help="Compare changes against specified branch instead of working directory",
211 |     ),
212 |     token: Optional[str] = typer.Option(
213 |         None,
214 |         "--token",
215 |         envvar="GITHUB_TOKEN",
216 |         help="GitHub token for issue and PR access",
217 |     ),
218 | ) -> None:
219 |     """Convert source code files to markdown format for LLM context."""
220 |     if version:
221 |         console.print(f"copychat version {get_version('copychat')}")
222 |         raise typer.Exit()
223 | 
224 |     try:
225 |         # Parse source type and location
226 |         source_type, source_loc = (
227 |             parse_source(source) if source else (SourceType.FILESYSTEM, ".")
228 |         )
229 | 
230 |         if debug:
231 |             error_console.print(
232 |                 f"[magenta]Source type:[/] {source_type}, location: {source_loc}"
233 |             )
234 |             error_console.print(f"[magenta]Paths to process:[/] {paths}")
235 | 
236 |         # Handle different source types
237 |         if source_type == SourceType.GITHUB:
238 |             try:
239 |                 github_source = GitHubSource(source_loc)
240 |                 source_dir = github_source.fetch()
241 |             except Exception as e:
242 |                 if debug:
243 |                     raise
244 |                 error_console.print(
245 |                     f"[red]Error fetching GitHub repository:[/] {str(e)}"
246 |                 )
247 |                 raise typer.Exit(1)
248 |         elif source_type == SourceType.WEB:
249 |             error_console.print("[red]Web sources not yet implemented[/]")
250 |             raise typer.Exit(1)
251 |         else:
252 |             source_dir = Path(source_loc)
253 | 
254 |         # Handle file vs directory source
255 |         if source_dir.is_file():
256 |             content = get_file_content(
257 |                 source_dir, diff_mode, compare_branch=compare_branch
258 |             )
259 |             all_files = {source_dir: content} if content is not None else {}
260 |         else:
261 |             # For directories, scan all paths
262 |             if not paths:
263 |                 paths = ["."]
264 | 
265 |             # Handle paths
266 |             all_files = {}
267 |             for path in paths:
268 |                 if debug:
269 |                     error_console.print(f"[cyan]Processing path:[/] {path}")
270 | 
271 |                 # Allow GitHub issues/PRs/discussions as direct arguments
272 |                 try:
273 |                     repo, num, item_type = parse_github_item(path)
274 |                     if debug:
275 |                         error_console.print(
276 |                             f"[blue]Processing GitHub {item_type}:[/] {repo}#{num}"
277 |                         )
278 |                     gh_item = GitHubItem(repo, num, token, item_type)
279 |                     p, content = gh_item.fetch()
280 |                     all_files[p] = content
281 |                     if debug:
282 |                         error_console.print(
283 |                             f"[green]Successfully fetched GitHub {item_type}[/]"
284 |                         )
285 |                     continue
286 |                 except Exception as e:
287 |                     if debug:
288 |                         error_console.print(
289 |                             f"[yellow]Failed to process as GitHub item:[/] {str(e)}"
290 |                         )
291 |                     pass
292 | 
293 |                 # Allow GitHub blob URLs (individual files)
294 |                 try:
295 |                     repo, ref, file_path = parse_github_blob(path)
296 |                     if debug:
297 |                         error_console.print(
298 |                             f"[blue]Processing GitHub file:[/] {repo}/{file_path}@{ref}"
299 |                         )
300 |                     gh_file = GitHubFile(path, token)
301 |                     p, content = gh_file.fetch()
302 |                     all_files[p] = content
303 |                     if debug:
304 |                         error_console.print(
305 |                             "[green]Successfully fetched GitHub file[/]"
306 |                         )
307 |                     continue
308 |                 except Exception as e:
309 |                     if debug:
310 |                         error_console.print(
311 |                             f"[yellow]Failed to process as GitHub blob:[/] {str(e)}"
312 |                         )
313 |                     pass
314 | 
315 |                 target = Path(path)
316 |                 if target.is_absolute():
317 |                     # Use absolute paths as-is
318 |                     if target.is_file():
319 |                         content = get_file_content(
320 |                             target, diff_mode, compare_branch=compare_branch
321 |                         )
322 |                         if content is not None:
323 |                             all_files[target] = content
324 |                     else:
325 |                         files = scan_directory(
326 |                             target,
327 |                             include=include.split(",") if include else None,
328 |                             exclude_patterns=exclude,
329 |                             diff_mode=diff_mode,
330 |                             max_depth=depth,
331 |                             compare_branch=compare_branch,
332 |                         )
333 |                         all_files.update(files)
334 |                 else:
335 |                     # For relative paths, try source dir first, then current dir
336 |                     targets = []
337 |                     if source_type == SourceType.GITHUB:
338 |                         # For GitHub sources, only look in the source directory
339 |                         targets = [source_dir / path]
340 |                     else:
341 |                         # For filesystem sources, try both but prefer source dir
342 |                         if source_dir != Path("."):
343 |                             targets.append(source_dir / path)
344 |                         targets.append(Path.cwd() / path)
345 | 
346 |                     for target in targets:
347 |                         if target.exists():
348 |                             if target.is_file():
349 |                                 content = get_file_content(
350 |                                     target, diff_mode, compare_branch=compare_branch
351 |                                 )
352 |                                 if content is not None:
353 |                                     all_files[target] = content
354 |                                 break
355 |                             else:
356 |                                 files = scan_directory(
357 |                                     target,
358 |                                     include=include.split(",") if include else None,
359 |                                     exclude_patterns=exclude,
360 |                                     diff_mode=diff_mode,
361 |                                     max_depth=depth,
362 |                                     compare_branch=compare_branch,
363 |                                 )
364 |                                 all_files.update(files)
365 |                                 break
366 |         if not all_files:
367 |             error_console.print("Found [red]0[/] matching files")
368 |             return
369 | 
370 |         # Separate GitHub issues/PRs from regular files for better reporting
371 |         github_items = []
372 |         filesystem_files = []
373 | 
374 |         for path, content in all_files.items():
375 |             if (
376 |                 str(path).endswith((".md", ".issue.md", ".pr.md", ".discussion.md"))
377 |                 and isinstance(path, Path)
378 |                 and not path.exists()
379 |             ) or (
380 |                 # Also detect GitHub files by checking if the filename contains repo info and doesn't exist locally
381 |                 isinstance(path, Path)
382 |                 and not path.exists()
383 |                 and "_"
384 |                 in str(
385 |                     path.name
386 |                 )  # GitHub files have underscores from repo/ref/path formatting
387 |                 and any(part in str(path.name) for part in ["github", "blob", "_"])
388 |             ):
389 |                 github_items.append((path, content))
390 |             else:
391 |                 filesystem_files.append((path, content))
392 | 
393 |         # Format files - pass both paths and content
394 |         format_result = format_files_xml(
395 |             [(path, content) for path, content in all_files.items()]
396 |         )
397 | 
398 |         # Get the formatted content, conditionally including header
399 |         if verbose:
400 |             result = str(format_result)
401 |             # Print the display header to stderr for visibility
402 |             error_console.print(
403 |                 "\nFile summary:",
404 |                 style="bold blue",
405 |             )
406 |             # Use the display-friendly header
407 |             error_console.print(create_display_header(format_result))
408 |             error_console.print()  # Add blank line after header
409 |         else:
410 |             # Skip the header by taking only the formatted files
411 |             result = "\n".join(f.formatted_content for f in format_result.files)
412 | 
413 |         # Custom message based on content types
414 |         if github_items and filesystem_files:
415 |             error_console.print(
416 |                 f"Downloaded [green]{len(github_items)}[/] GitHub items and found [green]{len(filesystem_files)}[/] matching files"
417 |             )
418 |         elif github_items:
419 |             error_console.print(
420 |                 f"Downloaded [green]{len(github_items)}[/] GitHub {'item' if len(github_items) == 1 else 'items'}"
421 |             )
422 |         else:
423 |             error_console.print(
424 |                 f"Found [green]{len(format_result.files)}[/] matching files"
425 |             )
426 | 
427 |         # Handle outputs
428 |         if outfile:
429 |             if append and outfile.exists():
430 |                 existing_content = outfile.read_text()
431 |                 result = existing_content + "\n\n" + result
432 |             outfile.write_text(result)
433 |             error_console.print(
434 |                 f"Output {'appended' if append else 'written'} to [green]{outfile}[/]"
435 |             )
436 |         # Only use clipboard if not writing to file AND not just printing to stdout
437 |         elif not print_output or append:
438 |             if append:
439 |                 try:
440 |                     existing_clipboard = pyperclip.paste()
441 |                     result = existing_clipboard + "\n\n" + result
442 |                 except Exception:
443 |                     error_console.print(
444 |                         "[yellow]Warning: Could not read clipboard for append[/]"
445 |                     )
446 | 
447 |             try:
448 |                 pyperclip.copy(result)
449 |                 # Calculate total lines outside the f-string
450 |                 total_lines = sum(
451 |                     f.content.count("\n") + 1 for f in format_result.files
452 |                 )
453 |                 error_console.print(
454 |                     f"{'Appended' if append else 'Copied'} to clipboard "
455 |                     f"(~{format_result.total_tokens:,} tokens, {total_lines:,} lines)"
456 |                 )
457 |             except Exception as e:
458 |                 error_console.print(
459 |                     f"[yellow]Warning: Could not copy to clipboard: {str(e)}[/]"
460 |                 )
461 |                 if not print_output:
462 |                     # If clipboard failed and we're not printing, show the content
463 |                     error_console.print("[cyan]Content would have been:[/]")
464 |                     print(result)
465 | 
466 |         # Print to stdout only if explicitly requested
467 |         if print_output:
468 |             print(result)
469 | 
470 |     except Exception as e:
471 |         if debug:
472 |             raise
473 |         error_console.print(f"[red]Error:[/] {str(e)}")
474 |         raise typer.Exit(1)
475 | 


--------------------------------------------------------------------------------
/src/copychat/core.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import Optional
  3 | import pathspec
  4 | import subprocess
  5 | from enum import Enum
  6 | import os
  7 | 
  8 | from .patterns import DEFAULT_EXTENSIONS, EXCLUDED_DIRS, EXCLUDED_PATTERNS
  9 | 
 10 | 
 11 | class DiffMode(Enum):
 12 |     FULL = "full"  # All files as-is
 13 |     FULL_WITH_DIFF = "full-with-diff"  # All files with diff markers
 14 |     CHANGED_WITH_DIFF = "changed-with-diff"  # Only changed files with diff markers
 15 |     DIFF_ONLY = "diff-only"  # Only the diff chunks
 16 | 
 17 | 
 18 | def is_glob_pattern(path: str) -> bool:
 19 |     """Check if a path contains glob patterns."""
 20 |     return "*" in path
 21 | 
 22 | 
 23 | def resolve_paths(paths: list[str], base_path: Path = Path(".")) -> list[Path]:
 24 |     """Resolve a mix of glob patterns and regular paths."""
 25 |     resolved = []
 26 |     base_path = base_path.resolve()
 27 | 
 28 |     # Get gitignore and ccignore specs once for all paths
 29 |     git_spec = get_gitignore_spec(base_path)
 30 |     cc_spec = get_ccignore_spec(base_path)
 31 | 
 32 |     for path in paths:
 33 |         if is_glob_pattern(path):
 34 |             matches = list(base_path.glob(path))
 35 |             # Filter matches through gitignore and ccignore
 36 |             for match in matches:
 37 |                 try:
 38 |                     # Check if under base path
 39 |                     rel_path = match.relative_to(base_path)
 40 |                     # Skip if matches gitignore or ccignore patterns
 41 |                     rel_path_str = str(rel_path)
 42 |                     if git_spec.match_file(rel_path_str) or cc_spec.match_file(
 43 |                         rel_path_str
 44 |                     ):
 45 |                         continue
 46 |                     resolved.append(match)
 47 |                 except ValueError:
 48 |                     # If path is not relative to base_path, just use it as-is
 49 |                     resolved.append(match)
 50 |         else:
 51 |             # For non-glob paths, use them as-is
 52 |             path_obj = Path(path)
 53 |             if path_obj.is_absolute():
 54 |                 resolved.append(path_obj)
 55 |             else:
 56 |                 resolved.append(base_path / path)
 57 |     return resolved
 58 | 
 59 | 
 60 | def find_gitignore(start_path: Path) -> Optional[Path]:
 61 |     """Search for .gitignore file in current and parent directories."""
 62 |     current = start_path.absolute()
 63 |     while current != current.parent:
 64 |         gitignore = current / ".gitignore"
 65 |         if gitignore.is_file():
 66 |             return gitignore
 67 |         current = current.parent
 68 |     return None
 69 | 
 70 | 
 71 | def find_ccignore_files(start_path: Path) -> list[tuple[Path, Path]]:
 72 |     """
 73 |     Find all .ccignore files that apply to the given path.
 74 | 
 75 |     Returns a list of tuples (ccignore_file, directory) where:
 76 |     - ccignore_file is the path to the .ccignore file
 77 |     - directory is the directory containing the .ccignore file
 78 | 
 79 |     The list is ordered from most specific (closest to start_path) to most general.
 80 |     """
 81 |     ccignore_files = []
 82 |     current = start_path.absolute()
 83 | 
 84 |     # Start from the given path and traverse up to the root
 85 |     while current != current.parent:
 86 |         ccignore = current / ".ccignore"
 87 |         if ccignore.is_file():
 88 |             ccignore_files.append((ccignore, current))
 89 |         current = current.parent
 90 | 
 91 |     return ccignore_files
 92 | 
 93 | 
 94 | def get_gitignore_spec(
 95 |     path: Path, extra_patterns: Optional[list[str]] = None
 96 | ) -> pathspec.PathSpec:
 97 |     """Load .gitignore patterns and combine with our default exclusions."""
 98 |     patterns = list(EXCLUDED_PATTERNS)
 99 | 
100 |     # Add directory exclusions
101 |     dir_patterns = [f"{d}/" for d in EXCLUDED_DIRS]
102 |     patterns.extend(dir_patterns)
103 | 
104 |     # Add any extra patterns provided
105 |     if extra_patterns:
106 |         patterns.extend(extra_patterns)
107 | 
108 |     # Add patterns from .gitignore if found
109 |     gitignore_path = find_gitignore(path)
110 |     if gitignore_path:
111 |         with open(gitignore_path) as f:
112 |             gitignore_patterns = [
113 |                 line.strip() for line in f if line.strip() and not line.startswith("#")
114 |             ]
115 |             patterns.extend(gitignore_patterns)
116 | 
117 |     return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
118 | 
119 | 
120 | def get_ccignore_spec(
121 |     path: Path, extra_patterns: Optional[list[str]] = None
122 | ) -> pathspec.PathSpec:
123 |     """
124 |     Load .ccignore patterns from all applicable directories.
125 | 
126 |     This function finds all .ccignore files that apply to the given path,
127 |     from the most specific (closest to the path) to the most general (root).
128 |     Patterns from more specific .ccignore files take precedence over more general ones.
129 |     """
130 |     patterns = []
131 | 
132 |     # Add any extra patterns provided
133 |     if extra_patterns:
134 |         patterns.extend(extra_patterns)
135 | 
136 |     # Get all applicable .ccignore files
137 |     ccignore_files = find_ccignore_files(path)
138 | 
139 |     # Process files from most general to most specific
140 |     # This way, more specific patterns override more general ones
141 |     for ccignore_path, dir_path in reversed(ccignore_files):
142 |         with open(ccignore_path) as f:
143 |             ccignore_patterns = [
144 |                 line.strip() for line in f if line.strip() and not line.startswith("#")
145 |             ]
146 |             patterns.extend(ccignore_patterns)
147 | 
148 |     return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
149 | 
150 | 
151 | def get_git_diff(path: Path, compare_branch: Optional[str] = None) -> str:
152 |     """Get git diff for the given path, optionally comparing against a specific branch."""
153 |     try:
154 |         # First check if file is tracked by git
155 |         result = subprocess.run(
156 |             ["git", "ls-files", "--error-unmatch", str(path)],
157 |             capture_output=True,
158 |             text=True,
159 |             check=False,  # Don't raise error for untracked files
160 |         )
161 |         if result.returncode != 0:
162 |             return ""  # File is not tracked by git
163 | 
164 |         # Get the diff, either against the index (default) or specified branch
165 |         if compare_branch:
166 |             # First get the merge base
167 |             merge_base = subprocess.run(
168 |                 ["git", "merge-base", "HEAD", compare_branch],
169 |                 capture_output=True,
170 |                 text=True,
171 |                 check=True,
172 |             ).stdout.strip()
173 | 
174 |             # Then do the diff against the merge base
175 |             result = subprocess.run(
176 |                 ["git", "diff", merge_base, "--", str(path)],
177 |                 capture_output=True,
178 |                 text=True,
179 |                 check=False,
180 |             )
181 |         else:
182 |             result = subprocess.run(
183 |                 ["git", "diff", "--", str(path)],  # Removed --exit-code
184 |                 capture_output=True,
185 |                 text=True,
186 |                 check=False,
187 |             )
188 |         return result.stdout  # Return output regardless of return code
189 | 
190 |     except subprocess.CalledProcessError:
191 |         return ""
192 | 
193 | 
194 | def get_changed_files(compare_branch: Optional[str] = None) -> set[Path]:
195 |     """Get set of files that have changes according to git."""
196 |     try:
197 |         # First get the git root directory
198 |         git_root = subprocess.run(
199 |             ["git", "rev-parse", "--show-toplevel"],
200 |             capture_output=True,
201 |             text=True,
202 |             check=True,
203 |         ).stdout.strip()
204 |         git_root_path = Path(git_root)
205 | 
206 |         if compare_branch:
207 |             # Get all changes between current branch and compare branch
208 |             result = subprocess.run(
209 |                 [
210 |                     "git",
211 |                     "diff",
212 |                     "--name-status",
213 |                     f"{compare_branch}...HEAD",  # Use triple dot to compare branches
214 |                 ],
215 |                 capture_output=True,
216 |                 text=True,
217 |                 check=True,
218 |             )
219 |             # Also get any unstaged/uncommitted changes
220 |             unstaged_result = subprocess.run(
221 |                 ["git", "status", "--porcelain"],
222 |                 capture_output=True,
223 |                 text=True,
224 |                 check=True,
225 |             )
226 |             # Combine both results
227 |             combined_output = result.stdout + unstaged_result.stdout
228 |         else:
229 |             # Get both staged and unstaged changes (current behavior)
230 |             combined_output = subprocess.run(
231 |                 ["git", "status", "--porcelain"],
232 |                 capture_output=True,
233 |                 text=True,
234 |                 check=True,
235 |             ).stdout
236 | 
237 |         changed = set()
238 |         for line in combined_output.splitlines():
239 |             if not line.strip():
240 |                 continue
241 | 
242 |             # Split on tab or space to handle both formats
243 |             parts = line.split(None, 1)  # Split on whitespace, max 1 split
244 |             if len(parts) < 2:
245 |                 continue
246 | 
247 |             status, filepath = parts
248 | 
249 |             # Handle renamed files (they have arrow notation)
250 |             if " -> " in filepath:
251 |                 filepath = filepath.split(" -> ")[-1]
252 | 
253 |             # Convert relative path to absolute using git root
254 |             abs_path = (git_root_path / filepath).resolve()
255 |             changed.add(abs_path)
256 | 
257 |         return changed
258 |     except subprocess.CalledProcessError:
259 |         return set()
260 | 
261 | 
262 | def get_file_content(
263 |     path: Path,
264 |     diff_mode: DiffMode,
265 |     changed_files: Optional[set[Path]] = None,
266 |     compare_branch: Optional[str] = None,
267 | ) -> Optional[str]:
268 |     """Get file content based on diff mode."""
269 |     if not path.is_file():
270 |         return None
271 | 
272 |     # Get content
273 |     content = path.read_text()
274 | 
275 |     # Return full content immediately if that's what we want
276 |     if diff_mode == DiffMode.FULL:
277 |         return content
278 | 
279 |     # Check if file has changes and get diff if needed
280 |     if changed_files is not None:
281 |         has_changes = path in changed_files
282 |         # Get diff here so we can use it for all diff modes
283 |         diff = get_git_diff(path, compare_branch) if has_changes else ""
284 |     else:
285 |         # Get diff first, then check if there are changes
286 |         diff = get_git_diff(path, compare_branch)
287 |         has_changes = bool(diff)
288 | 
289 |     # Handle different modes
290 |     if diff_mode == DiffMode.DIFF_ONLY:
291 |         return diff if has_changes else None
292 |     elif diff_mode == DiffMode.CHANGED_WITH_DIFF:
293 |         if not has_changes:
294 |             return None
295 |         return f"{content}\n\n# Git Diff:\n{diff}"
296 |     elif diff_mode == DiffMode.FULL_WITH_DIFF:
297 |         if not has_changes:
298 |             return content
299 |         return f"{content}\n\n# Git Diff:\n{diff}"
300 | 
301 |     return None
302 | 
303 | 
304 | def scan_directory(
305 |     path: Path,
306 |     include: Optional[list[str]] = None,
307 |     exclude_patterns: Optional[list[str]] = None,
308 |     diff_mode: DiffMode = DiffMode.FULL,
309 |     max_depth: Optional[int] = None,
310 |     compare_branch: Optional[str] = None,
311 | ) -> dict[Path, str]:
312 |     """Scan directory for files to process."""
313 |     # Get changed files upfront if we're using a diff mode
314 |     changed_files = (
315 |         get_changed_files(compare_branch) if diff_mode != DiffMode.FULL else None
316 |     )
317 | 
318 |     # Convert string paths to Path objects and handle globs
319 |     if isinstance(path, str):
320 |         if is_glob_pattern(path):
321 |             paths = resolve_paths([path])
322 |         else:
323 |             paths = [Path(path)]
324 |     else:
325 |         paths = [path]
326 | 
327 |     result = {}
328 | 
329 |     # Pre-compute extension set
330 |     include_set = {f".{ext.lstrip('.')}" for ext in (include or DEFAULT_EXTENSIONS)}
331 | 
332 |     for current_path in paths:
333 |         if current_path.is_file():
334 |             # For single files, just check if it matches filters
335 |             if include and current_path.suffix.lstrip(".") not in include:
336 |                 continue
337 |             content = get_file_content(
338 |                 current_path, diff_mode, changed_files, compare_branch
339 |             )
340 |             if content is not None:
341 |                 result[current_path] = content
342 |             continue
343 | 
344 |         # Convert to absolute path once
345 |         abs_path = current_path.resolve()
346 |         if not abs_path.exists():
347 |             continue
348 | 
349 |         # Get gitignore spec once for the starting directory
350 |         git_spec = get_gitignore_spec(abs_path, exclude_patterns)
351 | 
352 |         # Use os.walk for better performance than rglob
353 |         for root, _, files in os.walk(abs_path):
354 |             root_path = Path(root)
355 | 
356 |             # Check depth if max_depth is specified
357 |             if max_depth is not None:
358 |                 try:
359 |                     # Calculate current depth relative to the starting path
360 |                     rel_path = root_path.relative_to(abs_path)
361 |                     current_depth = len(rel_path.parts)
362 |                     if current_depth > max_depth:
363 |                         continue
364 |                 except ValueError:
365 |                     continue
366 | 
367 |             # Get relative path once per directory
368 |             try:
369 |                 rel_root = str(root_path.relative_to(abs_path))
370 |                 if rel_root == ".":
371 |                     rel_root = ""
372 |             except ValueError:
373 |                 continue
374 | 
375 |             # Get ccignore spec for the current directory (to handle hierarchical patterns)
376 |             cc_spec = get_ccignore_spec(root_path, exclude_patterns)
377 | 
378 |             # Check if directory should be skipped (via gitignore or ccignore)
379 |             if rel_root:
380 |                 dir_path = rel_root + "/"
381 |                 if git_spec.match_file(dir_path) or cc_spec.match_file(dir_path):
382 |                     continue
383 | 
384 |             for filename in files:
385 |                 # Quick extension check before more expensive operations
386 |                 ext = Path(filename).suffix.lower()
387 |                 if ext not in include_set:
388 |                     continue
389 | 
390 |                 # Build relative path string directly
391 |                 rel_path_str = (
392 |                     os.path.join(rel_root, filename) if rel_root else filename
393 |                 )
394 | 
395 |                 # Check both gitignore and ccignore patterns
396 |                 if git_spec.match_file(rel_path_str) or cc_spec.match_file(
397 |                     rel_path_str
398 |                 ):
399 |                     continue
400 | 
401 |                 # Only create Path object if file passes all filters
402 |                 file_path = root_path / filename
403 | 
404 |                 # Get content based on diff mode
405 |                 content = get_file_content(
406 |                     file_path, diff_mode, changed_files, compare_branch
407 |                 )
408 |                 if content is not None:
409 |                     result[file_path] = content
410 | 
411 |     return result
412 | 
413 | 
414 | def scan_files(patterns: list[str], root: Path) -> set[Path]:
415 |     """Scan directory for files matching glob patterns."""
416 |     files = set()
417 |     for pattern in patterns:
418 |         files.update(root.glob(pattern))
419 |     return files
420 | 


--------------------------------------------------------------------------------
/src/copychat/format.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from typing import Optional
  3 | from os.path import commonpath
  4 | from datetime import datetime, timezone
  5 | import tiktoken
  6 | from dataclasses import dataclass
  7 | 
  8 | 
  9 | @dataclass
 10 | class FileStats:
 11 |     """Statistics for a single file."""
 12 | 
 13 |     chars: int
 14 |     tokens: int
 15 | 
 16 | 
 17 | @dataclass
 18 | class FormattedFile:
 19 |     """A formatted file with its stats."""
 20 | 
 21 |     path: Path
 22 |     content: str
 23 |     stats: FileStats
 24 |     formatted_content: str
 25 | 
 26 | 
 27 | @dataclass
 28 | class FormatResult:
 29 |     """Result of formatting one or more files."""
 30 | 
 31 |     files: list[FormattedFile]
 32 |     root_path: Path
 33 |     timestamp: datetime
 34 |     formatted_content: str
 35 |     total_chars: int = 0
 36 |     total_tokens: int = 0
 37 |     has_header: bool = True
 38 | 
 39 |     def __str__(self) -> str:
 40 |         """Return the formatted content."""
 41 |         return self.formatted_content
 42 | 
 43 | 
 44 | def format_file(
 45 |     file_path: Path, root_path: Path, content: Optional[str] = None
 46 | ) -> FormattedFile:
 47 |     """Format a single file as XML-style markdown and return structured result."""
 48 |     try:
 49 |         # Use provided content or read from file
 50 |         if content is None:
 51 |             content = file_path.read_text()
 52 | 
 53 |         # Calculate stats
 54 |         stats = FileStats(chars=len(content), tokens=estimate_tokens(content))
 55 | 
 56 |         # Use string paths for comparison to handle symlinks and different path formats
 57 |         file_str = str(file_path.resolve())
 58 |         root_str = str(root_path.resolve())
 59 | 
 60 |         # Remove the root path and any leading slashes
 61 |         if file_str.startswith(root_str):
 62 |             rel_path = file_str[len(root_str) :].lstrip("/\\")
 63 |         else:
 64 |             rel_path = file_str  # Fallback to full path if not a subpath
 65 | 
 66 |         language = guess_language(file_path)
 67 | 
 68 |         # Build the XML tag with attributes
 69 |         tag_attrs = [f'path="{rel_path}"']
 70 |         if language:
 71 |             tag_attrs.append(f'language="{language}"')
 72 | 
 73 |         attrs_str = " ".join(tag_attrs)
 74 | 
 75 |         formatted_content = f"""<file {attrs_str}>
 76 | {content}
 77 | </file>"""
 78 | 
 79 |         return FormattedFile(
 80 |             path=file_path,
 81 |             content=content,
 82 |             stats=stats,
 83 |             formatted_content=formatted_content,
 84 |         )
 85 | 
 86 |     except Exception as e:
 87 |         # Return empty stats for failed files
 88 |         return FormattedFile(
 89 |             path=file_path,
 90 |             content=f"<!-- Error processing {file_path}: {str(e)} -->",
 91 |             stats=FileStats(chars=0, tokens=0),
 92 |             formatted_content=f"<!-- Error processing {file_path}: {str(e)} -->",
 93 |         )
 94 | 
 95 | 
 96 | def create_header(result: FormatResult) -> str:
 97 |     """Create a header with metadata about the export."""
 98 |     timestamp = result.timestamp.strftime("%Y-%m-%d %H:%M:%S UTC")
 99 | 
100 |     # Create a table-like format for files
101 |     rel_paths = []
102 |     for f in result.files:
103 |         try:
104 |             rel_path = str(f.path.relative_to(result.root_path))
105 |             # Make sure path is not empty or just "."
106 |             if not rel_path or rel_path == ".":
107 |                 # For GitHub items, use a more descriptive name
108 |                 if (
109 |                     isinstance(f.path, Path)
110 |                     and f.path.name
111 |                     and (
112 |                         "_pr_" in f.path.name
113 |                         or "_issue_" in f.path.name
114 |                         or "_discussion_" in f.path.name
115 |                     )
116 |                 ) or (
117 |                     # Also check for GitHub blob files (they have repo_ref_filepath pattern)
118 |                     isinstance(f.path, Path)
119 |                     and f.path.name
120 |                     and "_" in f.path.name
121 |                     and len(f.path.name.split("_"))
122 |                     >= 3  # repo_ref_filepath has at least 3 parts
123 |                 ):
124 |                     # This appears to be a GitHub item, use a more descriptive name
125 |                     rel_path = f.path.name
126 |                 else:
127 |                     rel_path = f.path.name or str(f.path)
128 |         except ValueError:
129 |             rel_path = str(f.path)  # Fallback to full path if not a subpath
130 |         rel_paths.append(rel_path)
131 | 
132 |     # Use the minimum of the longest path or 50 chars
133 |     max_path_len = (
134 |         max(len(path) for path in rel_paths) if rel_paths else 4
135 |     )  # Min "Path" header width
136 |     max_path_len = max(max_path_len, 4)  # Ensure min width for "Path" header
137 |     max_path_len = min(max_path_len, 50)  # Cap path length for readability
138 | 
139 |     # Calculate line counts
140 |     file_lines = {f.path: f.content.count("\n") + 1 for f in result.files}
141 |     total_lines = sum(file_lines.values())
142 | 
143 |     header = [
144 |         "<!--",
145 |         f"Generated by copychat on {timestamp}",
146 |         f"Root path: {result.root_path}",
147 |         f"Summary: {len(result.files)} files, ~{result.total_tokens:,} tokens, {total_lines:,} lines",
148 |         "",
149 |         "Files:",
150 |         "┌" + "─" * (max_path_len + 2) + "┬" + "─" * 12 + "┬" + "─" * 10 + "┐",
151 |         f"│ {'Path':<{max_path_len}} │ {'Tokens':>10} │ {'Lines':>8} │",
152 |         "├" + "─" * (max_path_len + 2) + "┼" + "─" * 12 + "┼" + "─" * 10 + "┤",
153 |     ]
154 | 
155 |     # Format each file as a table row
156 |     for i, f in enumerate(sorted(result.files, key=lambda x: str(x.path))):
157 |         rel_path = rel_paths[i]
158 |         if len(rel_path) > max_path_len:
159 |             trunc_len = max_path_len - 3
160 |             rel_path = "..." + rel_path[-trunc_len:]
161 | 
162 |         lines = file_lines[f.path]
163 |         header.append(
164 |             f"│ {rel_path:<{max_path_len}} │ {f.stats.tokens:>10,} │ {lines:>8,} │"
165 |         )
166 | 
167 |     header.extend(
168 |         [
169 |             "└" + "─" * (max_path_len + 2) + "┴" + "─" * 12 + "┴" + "─" * 10 + "┘",
170 |             "-->",
171 |             "",
172 |         ]
173 |     )
174 | 
175 |     return "\n".join(header)
176 | 
177 | 
178 | def create_display_header(result: FormatResult) -> str:
179 |     """Create a display-friendly header without XML comments."""
180 |     timestamp = result.timestamp.strftime("%Y-%m-%d %H:%M:%S UTC")
181 | 
182 |     # Create a table-like format for files
183 |     rel_paths = []
184 |     for f in result.files:
185 |         try:
186 |             rel_path = str(f.path.relative_to(result.root_path))
187 |             # Make sure path is not empty or just "."
188 |             if not rel_path or rel_path == ".":
189 |                 # For GitHub items, use a more descriptive name
190 |                 if (
191 |                     isinstance(f.path, Path)
192 |                     and f.path.name
193 |                     and (
194 |                         "_pr_" in f.path.name
195 |                         or "_issue_" in f.path.name
196 |                         or "_discussion_" in f.path.name
197 |                     )
198 |                 ) or (
199 |                     # Also check for GitHub blob files (they have repo_ref_filepath pattern)
200 |                     isinstance(f.path, Path)
201 |                     and f.path.name
202 |                     and "_" in f.path.name
203 |                     and len(f.path.name.split("_"))
204 |                     >= 3  # repo_ref_filepath has at least 3 parts
205 |                 ):
206 |                     # This appears to be a GitHub item, use a more descriptive name
207 |                     rel_path = f.path.name
208 |                 else:
209 |                     rel_path = f.path.name or str(f.path)
210 |         except ValueError:
211 |             rel_path = str(f.path)  # Fallback to full path if not a subpath
212 |         rel_paths.append(rel_path)
213 | 
214 |     # Use the minimum of the longest path or 50 chars
215 |     max_path_len = (
216 |         max(len(path) for path in rel_paths) if rel_paths else 4
217 |     )  # Min "Path" header width
218 |     max_path_len = max(max_path_len, 4)  # Ensure min width for "Path" header
219 |     max_path_len = min(max_path_len, 50)  # Cap path length for readability
220 | 
221 |     # Calculate line counts
222 |     file_lines = {f.path: f.content.count("\n") + 1 for f in result.files}
223 |     total_lines = sum(file_lines.values())
224 | 
225 |     header = [
226 |         f"Generated by copychat on {timestamp}",
227 |         f"Root path: {result.root_path}",
228 |         f"Summary: {len(result.files)} files, ~{result.total_tokens:,} tokens, {total_lines:,} lines",
229 |         "",
230 |         "┌" + "─" * (max_path_len + 2) + "┬" + "─" * 12 + "┬" + "─" * 10 + "┐",
231 |         f"│ {'Path':<{max_path_len}} │ {'Tokens':>10} │ {'Lines':>8} │",
232 |         "├" + "─" * (max_path_len + 2) + "┼" + "─" * 12 + "┼" + "─" * 10 + "┤",
233 |     ]
234 | 
235 |     # Format each file as a table row
236 |     for i, f in enumerate(sorted(result.files, key=lambda x: str(x.path))):
237 |         rel_path = rel_paths[i]
238 |         if len(rel_path) > max_path_len:
239 |             trunc_len = max_path_len - 3
240 |             rel_path = "..." + rel_path[-trunc_len:]
241 | 
242 |         lines = file_lines[f.path]
243 |         header.append(
244 |             f"│ {rel_path:<{max_path_len}} │ {f.stats.tokens:>10,} │ {lines:>8,} │"
245 |         )
246 | 
247 |     header.append(
248 |         "└" + "─" * (max_path_len + 2) + "┴" + "─" * 12 + "┴" + "─" * 10 + "┘"
249 |     )
250 | 
251 |     return "\n".join(header)
252 | 
253 | 
254 | def format_files(files: list[tuple[Path, str]]) -> FormatResult:
255 |     """Format files into markdown with XML-style tags.
256 | 
257 |     Args:
258 |         files: List of (path, content) tuples to format
259 | 
260 |     Returns:
261 |         FormatResult containing all formatting information
262 |     """
263 |     if not files:
264 |         return FormatResult(
265 |             files=[],
266 |             root_path=Path("."),
267 |             timestamp=datetime.now(timezone.utc),
268 |             formatted_content="<!-- No files found matching criteria -->\n",
269 |             has_header=False,
270 |         )
271 | 
272 |     # Find common root path using os.path.commonpath
273 |     paths = [f[0] for f in files]
274 |     str_paths = [str(f.absolute()) for f in paths]
275 |     root_path = Path(commonpath(str_paths))
276 | 
277 |     # Format each file
278 |     formatted_files = []
279 |     total_chars = 0
280 |     total_tokens = 0
281 | 
282 |     for file_path, content in files:
283 |         formatted = format_file(file_path, root_path, content)
284 |         formatted_files.append(formatted)
285 |         total_chars += formatted.stats.chars
286 |         total_tokens += formatted.stats.tokens
287 | 
288 |     result = FormatResult(
289 |         files=formatted_files,
290 |         root_path=root_path,
291 |         timestamp=datetime.now(timezone.utc),
292 |         total_chars=total_chars,
293 |         total_tokens=total_tokens,
294 |         formatted_content="",  # Will be set after header
295 |     )
296 | 
297 |     # Create header and combine all parts
298 |     header = create_header(result)
299 |     formatted_content = "\n".join(
300 |         [header] + [f.formatted_content for f in formatted_files]
301 |     )
302 | 
303 |     # Update the formatted content
304 |     result.formatted_content = formatted_content
305 | 
306 |     return result
307 | 
308 | 
309 | # Keep existing helper functions unchanged
310 | def estimate_tokens(text: str) -> int:
311 |     """Estimate the number of tokens in the text using GPT tokenizer."""
312 |     try:
313 |         # Using cl100k_base (used by GPT-4, Claude)
314 |         encoding = tiktoken.get_encoding("cl100k_base")
315 |         return len(encoding.encode(text))
316 |     except Exception:
317 |         # Fallback to rough estimate if tiktoken fails
318 |         return len(text) // 4  # Rough estimate: ~4 chars per token
319 | 
320 | 
321 | def guess_language(file_path: Path) -> Optional[str]:
322 |     """Guess the programming language based on file extension."""
323 |     ext = file_path.suffix.lower()
324 | 
325 |     # Common language mappings
326 |     language_map = {
327 |         ".py": "python",
328 |         ".js": "javascript",
329 |         ".ts": "typescript",
330 |         ".jsx": "jsx",
331 |         ".tsx": "tsx",
332 |         ".html": "html",
333 |         ".css": "css",
334 |         ".scss": "scss",
335 |         ".rs": "rust",
336 |         ".go": "go",
337 |         ".java": "java",
338 |         ".cpp": "cpp",
339 |         ".c": "c",
340 |         ".h": "c",
341 |         ".hpp": "cpp",
342 |         ".rb": "ruby",
343 |         ".php": "php",
344 |         ".sh": "bash",
345 |         ".yaml": "yaml",
346 |         ".yml": "yaml",
347 |         ".json": "json",
348 |         ".md": "markdown",
349 |         ".sql": "sql",
350 |         ".r": "r",
351 |         ".swift": "swift",
352 |         ".kt": "kotlin",
353 |         ".kts": "kotlin",
354 |         ".scala": "scala",
355 |         ".pl": "perl",
356 |         ".pm": "perl",
357 |     }
358 | 
359 |     return language_map.get(ext)
360 | 


--------------------------------------------------------------------------------
/src/copychat/patterns.py:
--------------------------------------------------------------------------------
  1 | """Default patterns and extensions for file filtering."""
  2 | 
  3 | # Default extensions we care about (without dots)
  4 | DEFAULT_EXTENSIONS = {
  5 |     # Web
  6 |     "html",
  7 |     "css",
  8 |     "scss",
  9 |     "js",
 10 |     "jsx",
 11 |     "ts",
 12 |     "tsx",
 13 |     "json",
 14 |     # Python
 15 |     "py",
 16 |     "pyi",
 17 |     "pyw",
 18 |     # Ruby
 19 |     "rb",
 20 |     "erb",
 21 |     # JVM
 22 |     "java",
 23 |     "kt",
 24 |     "scala",
 25 |     "gradle",
 26 |     # Systems
 27 |     "c",
 28 |     "h",
 29 |     "cpp",
 30 |     "hpp",
 31 |     "rs",
 32 |     "go",
 33 |     # Shell
 34 |     "sh",
 35 |     "bash",
 36 |     "zsh",
 37 |     "fish",
 38 |     # Config
 39 |     "yaml",
 40 |     "yml",
 41 |     "toml",
 42 |     "ini",
 43 |     "conf",
 44 |     # Docs
 45 |     "md",
 46 |     "mdx",
 47 |     "rst",
 48 |     "txt",
 49 |     # Other
 50 |     "sql",
 51 |     "graphql",
 52 |     "xml",
 53 |     "dockerfile",
 54 |     "gitignore",
 55 | }
 56 | 
 57 | # Directories that should always be excluded
 58 | EXCLUDED_DIRS = {
 59 |     # Version Control
 60 |     ".git",
 61 |     ".svn",
 62 |     ".hg",
 63 |     # Dependencies
 64 |     "node_modules",
 65 |     "venv",
 66 |     ".venv",
 67 |     "env",
 68 |     "__pycache__",
 69 |     ".pytest_cache",
 70 |     ".ruff_cache",
 71 |     "target",
 72 |     "build",
 73 |     "dist",
 74 |     # IDE
 75 |     ".idea",
 76 |     ".vscode",
 77 |     # Other
 78 |     ".next",
 79 |     ".nuxt",
 80 |     ".output",
 81 |     "coverage",
 82 | }
 83 | 
 84 | # Files or patterns that should always be excluded
 85 | EXCLUDED_PATTERNS = {
 86 |     # Build artifacts
 87 |     "*.pyc",
 88 |     "*.pyo",
 89 |     "*.pyd",
 90 |     "*.so",
 91 |     "*.dll",
 92 |     "*.dylib",
 93 |     "*.class",
 94 |     "*.jar",
 95 |     "*.war",
 96 |     "*.min.js",
 97 |     "*.min.css",
 98 |     # Logs and databases
 99 |     "*.log",
100 |     "*.sqlite",
101 |     "*.db",
102 |     # OS files
103 |     ".DS_Store",
104 |     "Thumbs.db",
105 |     "desktop.ini",
106 |     # Package files
107 |     "package-lock.json",
108 |     "yarn.lock",
109 |     "poetry.lock",
110 |     # Environment and secrets
111 |     ".env",
112 |     ".env.*",
113 |     "*.env",
114 |     # Other
115 |     "*.bak",
116 |     "*.swp",
117 |     "*.swo",
118 |     "*~",
119 | }
120 | 


--------------------------------------------------------------------------------
/src/copychat/sources.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | import shutil
  3 | from typing import Optional
  4 | import git
  5 | from rich.console import Console
  6 | import tempfile
  7 | 
  8 | error_console = Console(stderr=True)
  9 | 
 10 | # Shared temporary directory for GitHub items
 11 | _github_temp_dir = None
 12 | 
 13 | 
 14 | def get_github_temp_dir() -> Path:
 15 |     """Get a temporary directory for GitHub items that persists for the process."""
 16 |     global _github_temp_dir
 17 |     if _github_temp_dir is None:
 18 |         _github_temp_dir = Path(tempfile.mkdtemp(prefix="copychat_github_"))
 19 |     return _github_temp_dir
 20 | 
 21 | 
 22 | class GitHubSource:
 23 |     """Handle GitHub repositories as sources."""
 24 | 
 25 |     def __init__(self, repo_path: str, cache_dir: Optional[Path] = None):
 26 |         """Initialize GitHub source."""
 27 |         self.repo_path = repo_path.strip("/")
 28 |         self.cache_dir = cache_dir or Path.home() / ".cache" / "copychat" / "github"
 29 |         self.cache_dir.mkdir(parents=True, exist_ok=True)
 30 | 
 31 |     @property
 32 |     def clone_url(self) -> str:
 33 |         """Get HTTPS clone URL for repository."""
 34 |         return f"https://github.com/{self.repo_path}.git"
 35 | 
 36 |     @property
 37 |     def repo_dir(self) -> Path:
 38 |         """Get path to cached repository."""
 39 |         return self.cache_dir / self.repo_path.replace("/", "_")
 40 | 
 41 |     def fetch(self) -> Path:
 42 |         """Fetch repository and return path to files."""
 43 |         try:
 44 |             if self.repo_dir.exists():
 45 |                 # Update existing repo
 46 |                 repo = git.Repo(self.repo_dir)
 47 |                 repo.remotes.origin.fetch()
 48 |                 repo.remotes.origin.pull()
 49 |             else:
 50 |                 # Clone new repo
 51 |                 git.Repo.clone_from(self.clone_url, self.repo_dir, depth=1)
 52 | 
 53 |             return self.repo_dir
 54 | 
 55 |         except git.GitCommandError as e:
 56 |             error_console.print(f"[red]Error accessing repository:[/] {str(e)}")
 57 |             raise
 58 | 
 59 |     def cleanup(self) -> None:
 60 |         """Remove cached repository."""
 61 |         if self.repo_dir.exists():
 62 |             shutil.rmtree(self.repo_dir)
 63 | 
 64 | 
 65 | class GitHubItem:
 66 |     """Fetch a GitHub issue, pull request, or discussion with comments."""
 67 | 
 68 |     def __init__(
 69 |         self,
 70 |         repo_path: str,
 71 |         number: int,
 72 |         token: Optional[str] = None,
 73 |         item_type: str = "issue",
 74 |     ):
 75 |         self.repo_path = repo_path.strip("/")
 76 |         self.number = number
 77 |         self.token = token
 78 |         self.item_type = item_type  # 'issue', 'pull', or 'discussion'
 79 |         self.api_base = "https://api.github.com"
 80 | 
 81 |     def _headers(self) -> dict[str, str]:
 82 |         headers = {"Accept": "application/vnd.github+json"}
 83 |         if self.token:
 84 |             headers["Authorization"] = f"Bearer {self.token}"
 85 |         return headers
 86 | 
 87 |     def _graphql_headers(self) -> dict[str, str]:
 88 |         headers = {"Content-Type": "application/json"}
 89 |         if self.token:
 90 |             headers["Authorization"] = f"Bearer {self.token}"
 91 |         return headers
 92 | 
 93 |     def _fetch_discussion(self) -> tuple[dict, list]:
 94 |         """Fetch discussion data using GraphQL API."""
 95 |         import requests
 96 | 
 97 |         if not self.token:
 98 |             error_console.print(
 99 |                 "[yellow]Warning: GitHub token recommended for discussions. Some rate limits may apply.[/]"
100 |             )
101 | 
102 |         # GraphQL query to fetch discussion
103 |         query = """
104 |         query($owner: String!, $name: String!, $number: Int!) {
105 |           repository(owner: $owner, name: $name) {
106 |             discussion(number: $number) {
107 |               title
108 |               body
109 |               url
110 |               createdAt
111 |               updatedAt
112 |               author {
113 |                 login
114 |               }
115 |               category {
116 |                 name
117 |               }
118 |               comments(first: 100) {
119 |                 nodes {
120 |                   body
121 |                   createdAt
122 |                   author {
123 |                     login
124 |                   }
125 |                   replies(first: 50) {
126 |                     nodes {
127 |                       body
128 |                       createdAt
129 |                       author {
130 |                         login
131 |                       }
132 |                     }
133 |                   }
134 |                 }
135 |               }
136 |             }
137 |           }
138 |         }
139 |         """
140 | 
141 |         owner, repo = self.repo_path.split("/")
142 |         variables = {"owner": owner, "name": repo, "number": self.number}
143 | 
144 |         try:
145 |             resp = requests.post(
146 |                 "https://api.github.com/graphql",
147 |                 headers=self._graphql_headers(),
148 |                 json={"query": query, "variables": variables},
149 |                 timeout=30,
150 |             )
151 |             resp.raise_for_status()
152 |             data = resp.json()
153 | 
154 |             if "errors" in data:
155 |                 error_console.print(f"[red]GraphQL errors:[/] {data['errors']}")
156 |                 raise Exception(f"GraphQL errors: {data['errors']}")
157 | 
158 |             discussion = data["data"]["repository"]["discussion"]
159 |             if not discussion:
160 |                 raise Exception(f"Discussion #{self.number} not found")
161 | 
162 |             # Flatten comments and replies
163 |             comments = []
164 |             for comment in discussion["comments"]["nodes"]:
165 |                 comments.append(comment)
166 |                 # Add replies as nested comments
167 |                 for reply in comment["replies"]["nodes"]:
168 |                     comments.append(reply)
169 | 
170 |             return discussion, comments
171 | 
172 |         except Exception as e:
173 |             error_console.print(
174 |                 f"[yellow]Warning: Failed to fetch discussion: {str(e)}[/]"
175 |             )
176 |             raise
177 | 
178 |     def _fetch_pr_diff(self) -> Optional[str]:
179 |         """Fetch the PR diff from GitHub."""
180 |         import requests
181 | 
182 |         if not self.token:
183 |             error_console.print(
184 |                 "[yellow]Warning: GitHub token not provided. Some rate limits may apply.[/]"
185 |             )
186 | 
187 |         # Get the diff using the GitHub API
188 |         diff_url = f"{self.api_base}/repos/{self.repo_path}/pulls/{self.number}"
189 |         headers = self._headers()
190 |         headers["Accept"] = "application/vnd.github.diff"
191 |         try:
192 |             diff_resp = requests.get(diff_url, headers=headers, timeout=30)
193 |             diff_resp.raise_for_status()
194 |             return diff_resp.text
195 |         except Exception as e:
196 |             error_console.print(
197 |                 f"[yellow]Warning: Failed to fetch PR diff: {str(e)}[/]"
198 |             )
199 |             return None
200 | 
201 |     def fetch(self) -> tuple[Path, str]:
202 |         """Return (path, content) for the issue, PR, or discussion."""
203 |         if self.item_type == "discussion":
204 |             return self._fetch_discussion_content()
205 |         else:
206 |             return self._fetch_issue_or_pr_content()
207 | 
208 |     def _fetch_discussion_content(self) -> tuple[Path, str]:
209 |         """Fetch and format discussion content."""
210 |         discussion, comments = self._fetch_discussion()
211 | 
212 |         lines = [f"# {discussion.get('title', '')} (#{self.number})", ""]
213 | 
214 |         # Add metadata section
215 |         html_url = discussion.get(
216 |             "url", f"https://github.com/{self.repo_path}/discussions/{self.number}"
217 |         )
218 |         user = discussion.get("author", {}).get("login", "unknown")
219 |         created_at = discussion.get("createdAt", "")
220 |         updated_at = discussion.get("updatedAt", "")
221 |         category = discussion.get("category", {}).get("name", "")
222 | 
223 |         lines.extend(
224 |             [
225 |                 f"> **Discussion**: [{self.repo_path}#{self.number}]({html_url})",
226 |                 f"> **Category**: {category}",
227 |                 f"> **Author**: {user}",
228 |                 f"> **Created**: {created_at}",
229 |                 f"> **Updated**: {updated_at}",
230 |                 "",
231 |             ]
232 |         )
233 | 
234 |         body = discussion.get("body") or ""
235 |         if body:
236 |             lines.append(body)
237 |             lines.append("")
238 | 
239 |         # Add comments
240 |         for comment in comments:
241 |             user = comment.get("author", {}).get("login", "unknown")
242 |             created = comment.get("createdAt", "")
243 |             lines.append(f"## {user} - {created}")
244 |             if comment.get("body"):
245 |                 lines.append(comment["body"])
246 |             lines.append("")
247 | 
248 |         content = "\n".join(lines).strip() + "\n"
249 | 
250 |         # Use temporary directory
251 |         filename = f"{self.repo_path.replace('/', '_')}_discussion_{self.number}.md"
252 |         temp_dir = get_github_temp_dir()
253 |         path = temp_dir / filename
254 | 
255 |         return path, content
256 | 
257 |     def _fetch_issue_or_pr_content(self) -> tuple[Path, str]:
258 |         """Fetch and format issue or PR content."""
259 |         import requests
260 | 
261 |         issue_url = f"{self.api_base}/repos/{self.repo_path}/issues/{self.number}"
262 |         resp = requests.get(issue_url, headers=self._headers(), timeout=30)
263 |         resp.raise_for_status()
264 |         data = resp.json()
265 | 
266 |         comments_resp = requests.get(
267 |             data.get("comments_url"), headers=self._headers(), timeout=30
268 |         )
269 |         comments_resp.raise_for_status()
270 |         comments = comments_resp.json()
271 | 
272 |         review_comments = []
273 |         is_pr = "pull_request" in data
274 |         diff_content = None
275 | 
276 |         if is_pr:
277 |             # Fetch review comments
278 |             review_url = (
279 |                 f"{self.api_base}/repos/{self.repo_path}/pulls/{self.number}/comments"
280 |             )
281 |             rc = requests.get(review_url, headers=self._headers(), timeout=30)
282 |             if rc.ok:
283 |                 review_comments = rc.json()
284 | 
285 |             # Get the PR diff
286 |             diff_content = self._fetch_pr_diff()
287 | 
288 |         lines = [f"# {data.get('title', '')} (#{self.number})", ""]
289 |         body = data.get("body") or ""
290 | 
291 |         # Add metadata section
292 |         item_type = "Pull Request" if is_pr else "Issue"
293 |         html_url = data.get(
294 |             "html_url", f"https://github.com/{self.repo_path}/issues/{self.number}"
295 |         )
296 |         user = data.get("user", {}).get("login", "unknown")
297 |         created_at = data.get("created_at", "")
298 |         updated_at = data.get("updated_at", "")
299 |         state = data.get("state", "").upper()
300 | 
301 |         # Create a metadata header
302 |         lines.extend(
303 |             [
304 |                 f"> **{item_type}**: [{self.repo_path}#{self.number}]({html_url})",
305 |                 f"> **Status**: {state}",
306 |                 f"> **Author**: {user}",
307 |                 f"> **Created**: {created_at}",
308 |                 f"> **Updated**: {updated_at}",
309 |                 "",
310 |             ]
311 |         )
312 | 
313 |         if body:
314 |             lines.append(body)
315 |             lines.append("")
316 | 
317 |         # Add PR diff if available
318 |         if is_pr and diff_content:
319 |             lines.extend(
320 |                 [
321 |                     "## PR Diff",
322 |                     "",
323 |                     "```diff",
324 |                     diff_content,
325 |                     "```",
326 |                     "",
327 |                 ]
328 |             )
329 | 
330 |         for c in comments:
331 |             user = c.get("user", {}).get("login", "unknown")
332 |             created = c.get("created_at", "")
333 |             lines.append(f"## {user} - {created}")
334 |             if c.get("body"):
335 |                 lines.append(c["body"])
336 |             lines.append("")
337 | 
338 |         for c in review_comments:
339 |             user = c.get("user", {}).get("login", "unknown")
340 |             created = c.get("created_at", "")
341 |             path = c.get("path", "")
342 |             lines.append(f"## Review by {user} on {path} - {created}")
343 |             if c.get("body"):
344 |                 lines.append(c["body"])
345 |             lines.append("")
346 | 
347 |         content = "\n".join(lines).strip() + "\n"
348 |         item_type_filename = "pr" if is_pr else "issue"
349 | 
350 |         # Use temporary directory
351 |         filename = (
352 |             f"{self.repo_path.replace('/', '_')}_{item_type_filename}_{self.number}.md"
353 |         )
354 |         temp_dir = get_github_temp_dir()
355 |         path = temp_dir / filename
356 | 
357 |         return path, content
358 | 
359 | 
360 | class GitHubFile:
361 |     """Fetch a single file from GitHub via blob URL."""
362 | 
363 |     def __init__(self, blob_url: str, token: Optional[str] = None):
364 |         self.blob_url = blob_url
365 |         self.token = token
366 | 
367 |         # Parse the blob URL to extract repo, ref, and file path
368 |         import re
369 | 
370 |         match = re.search(r"github\.com/([^/]+/[^/]+)/blob/([^/]+)/(.*)", blob_url)
371 |         if not match:
372 |             raise ValueError(f"Invalid GitHub blob URL: {blob_url}")
373 | 
374 |         self.repo_path = match.group(1)
375 |         self.ref = match.group(2)
376 |         self.file_path = match.group(3)
377 | 
378 |     def _headers(self) -> dict[str, str]:
379 |         headers = {"Accept": "application/vnd.github+json"}
380 |         if self.token:
381 |             headers["Authorization"] = f"Bearer {self.token}"
382 |         return headers
383 | 
384 |     def fetch(self) -> tuple[Path, str]:
385 |         """Fetch the file content and return (path, content)."""
386 |         import requests
387 | 
388 |         # Use the raw.githubusercontent.com URL for direct file access
389 |         raw_url = f"https://raw.githubusercontent.com/{self.repo_path}/{self.ref}/{self.file_path}"
390 | 
391 |         try:
392 |             resp = requests.get(raw_url, timeout=30)
393 |             resp.raise_for_status()
394 |             content = resp.text
395 |         except Exception as e:
396 |             error_console.print(
397 |                 f"[yellow]Warning: Failed to fetch from raw URL, trying API:[/] {str(e)}"
398 |             )
399 | 
400 |             # Fallback to GitHub API
401 |             api_url = f"https://api.github.com/repos/{self.repo_path}/contents/{self.file_path}"
402 |             params = {"ref": self.ref}
403 | 
404 |             try:
405 |                 resp = requests.get(
406 |                     api_url, headers=self._headers(), params=params, timeout=30
407 |                 )
408 |                 resp.raise_for_status()
409 |                 data = resp.json()
410 | 
411 |                 if data.get("type") != "file":
412 |                     raise Exception(
413 |                         f"URL points to a {data.get('type', 'unknown')}, not a file"
414 |                     )
415 | 
416 |                 # Decode base64 content
417 |                 import base64
418 | 
419 |                 content = base64.b64decode(data["content"]).decode("utf-8")
420 |             except Exception as api_error:
421 |                 error_console.print(f"[red]Failed to fetch file:[/] {str(api_error)}")
422 |                 raise
423 | 
424 |         # Create a meaningful filename in temp directory
425 |         filename = f"{self.repo_path.replace('/', '_')}_{self.ref}_{self.file_path.replace('/', '_')}"
426 |         temp_dir = get_github_temp_dir()
427 |         path = temp_dir / filename
428 | 
429 |         return path, content
430 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from pathlib import Path
 2 | import pytest
 3 | import shutil
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def sample_project(tmp_path) -> Path:
 8 |     """Create a copy of the fixture project in a temporary directory."""
 9 |     fixtures_dir = Path(__file__).parent / "fixtures"
10 |     project_dir = tmp_path / "test_project"
11 | 
12 |     # Copy all fixtures to temporary directory
13 |     shutil.copytree(fixtures_dir, project_dir, dirs_exist_ok=True)
14 | 
15 |     return project_dir
16 | 
17 | 
18 | @pytest.fixture
19 | def sample_project_files(sample_project) -> list[Path]:
20 |     """Get a list of all files in the sample project."""
21 |     return list(sample_project.rglob("*"))
22 | 
23 | 
24 | def test_fixture_structure(sample_project):
25 |     """Verify the fixture structure is correct."""
26 |     assert (sample_project / "src" / "main.py").exists()
27 |     assert (sample_project / "src" / "app.js").exists()
28 |     assert (sample_project / "src" / "styles" / "main.css").exists()
29 |     assert (sample_project / "docs" / "README.md").exists()
30 |     assert (sample_project / "config" / "settings.yml").exists()
31 |     assert (sample_project / "db" / "schema.sql").exists()
32 |     assert (sample_project / ".gitignore").exists()
33 |     assert (sample_project / ".env").exists()
34 | 


--------------------------------------------------------------------------------
/tests/data/test1.txt:
--------------------------------------------------------------------------------
1 | This is a test file


--------------------------------------------------------------------------------
/tests/data/test2.md:
--------------------------------------------------------------------------------
1 | This is another test file


--------------------------------------------------------------------------------
/tests/fixtures/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .env
4 | node_modules/
5 | .DS_Store
6 | 


--------------------------------------------------------------------------------
/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
1 | """Test fixtures package."""
2 | 


--------------------------------------------------------------------------------
/tests/fixtures/config/settings.yml:
--------------------------------------------------------------------------------
1 | app:
2 |   name: TestApp
3 |   version: 1.0.0
4 | 
5 | database:
6 |   host: localhost
7 |   port: 5432
8 | 


--------------------------------------------------------------------------------
/tests/fixtures/db/schema.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE users (
2 |     id SERIAL PRIMARY KEY,
3 |     username VARCHAR(50) NOT NULL,
4 |     email VARCHAR(255) NOT NULL
5 | );
6 | 
7 | CREATE INDEX idx_username ON users(username);


--------------------------------------------------------------------------------
/tests/fixtures/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Test Project
 2 | 
 3 | This is a test project with various file types.
 4 | 
 5 | ## Structure
 6 | - src/
 7 |   - main.py
 8 |   - app.js
 9 |   - styles/
10 |   - utils/
11 | - docs/
12 | - tests/
13 | 


--------------------------------------------------------------------------------
/tests/fixtures/src/app.js:
--------------------------------------------------------------------------------
 1 | function App() {
 2 |     return (
 3 |         <div className="app">
 4 |             <h1>Hello World</h1>
 5 |             <p>This is a test component</p>
 6 |         </div>
 7 |     );
 8 | }
 9 | 
10 | export default App;
11 | 


--------------------------------------------------------------------------------
/tests/fixtures/src/main.py:
--------------------------------------------------------------------------------
1 | def main():
2 |     """Example main function."""
3 |     print("Hello from main!")
4 |     return True
5 | 
6 | 
7 | if __name__ == "__main__":
8 |     main()
9 | 


--------------------------------------------------------------------------------
/tests/fixtures/src/styles/main.css:
--------------------------------------------------------------------------------
 1 | .app {
 2 |   margin: 0;
 3 |   padding: 20px;
 4 |   font-family: sans-serif;
 5 | }
 6 | 
 7 | .header {
 8 |   color: #333;
 9 |   font-size: 24px;
10 | }
11 | 


--------------------------------------------------------------------------------
/tests/fixtures/src/types.ts:
--------------------------------------------------------------------------------
 1 | interface User {
 2 |   id: number;
 3 |   name: string;
 4 |   email: string;
 5 | }
 6 | 
 7 | type UserRole = "admin" | "user" | "guest";
 8 | 
 9 | export { User, UserRole };
10 | 


--------------------------------------------------------------------------------
/tests/fixtures/src/utils/helpers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | 
 4 | def format_string(value: Any) -> str:
 5 |     """Format any value as a string."""
 6 |     return str(value).strip()
 7 | 
 8 | 
 9 | def calculate_total(numbers: list[float]) -> float:
10 |     """Calculate sum of numbers."""
11 |     return sum(numbers)
12 | 


--------------------------------------------------------------------------------
/tests/test_ccignore.py:
--------------------------------------------------------------------------------
  1 | """Tests for .ccignore functionality."""
  2 | 
  3 | import pytest
  4 | from copychat.core import (
  5 |     find_ccignore_files,
  6 |     get_ccignore_spec,
  7 |     scan_directory,
  8 | )
  9 | 
 10 | 
 11 | @pytest.fixture
 12 | def ccignore_test_dir(tmp_path):
 13 |     """Create a test directory structure with .ccignore files."""
 14 |     # Root directory with .ccignore
 15 |     root_ccignore = tmp_path / ".ccignore"
 16 |     root_ccignore.write_text("*.log\n")
 17 | 
 18 |     # Create subdirectory with its own .ccignore
 19 |     subdir = tmp_path / "subdir"
 20 |     subdir.mkdir()
 21 |     subdir_ccignore = subdir / ".ccignore"
 22 |     subdir_ccignore.write_text("*.json\n")
 23 | 
 24 |     # Create nested subdirectory with its own .ccignore
 25 |     nested_subdir = subdir / "nested"
 26 |     nested_subdir.mkdir()
 27 |     nested_ccignore = nested_subdir / ".ccignore"
 28 |     nested_ccignore.write_text("*.md\n")
 29 | 
 30 |     # Create test files
 31 |     (tmp_path / "root.txt").write_text("root text file")
 32 |     (tmp_path / "root.log").write_text("root log file")
 33 |     (tmp_path / "root.json").write_text("root json file")
 34 |     (tmp_path / "root.md").write_text("root md file")
 35 | 
 36 |     (subdir / "subdir.txt").write_text("subdir text file")
 37 |     (subdir / "subdir.log").write_text("subdir log file")
 38 |     (subdir / "subdir.json").write_text("subdir json file")
 39 |     (subdir / "subdir.md").write_text("subdir md file")
 40 | 
 41 |     (nested_subdir / "nested.txt").write_text("nested text file")
 42 |     (nested_subdir / "nested.log").write_text("nested log file")
 43 |     (nested_subdir / "nested.json").write_text("nested json file")
 44 |     (nested_subdir / "nested.md").write_text("nested md file")
 45 | 
 46 |     return tmp_path
 47 | 
 48 | 
 49 | def test_find_ccignore_files(ccignore_test_dir):
 50 |     """Test finding all .ccignore files that apply to a path."""
 51 |     nested_dir = ccignore_test_dir / "subdir" / "nested"
 52 | 
 53 |     # Should find 3 .ccignore files, from most specific to most general
 54 |     result = find_ccignore_files(nested_dir)
 55 |     assert len(result) == 3
 56 | 
 57 |     # Check the order - should be from most specific to most general
 58 |     assert result[0][0] == nested_dir / ".ccignore"
 59 |     assert result[1][0] == ccignore_test_dir / "subdir" / ".ccignore"
 60 |     assert result[2][0] == ccignore_test_dir / ".ccignore"
 61 | 
 62 |     # Test with path that has no .ccignore
 63 |     empty_dir = ccignore_test_dir / "empty_dir"
 64 |     empty_dir.mkdir()
 65 |     result = find_ccignore_files(empty_dir)
 66 |     assert len(result) == 1
 67 |     assert result[0][0] == ccignore_test_dir / ".ccignore"
 68 | 
 69 | 
 70 | def test_get_ccignore_spec(ccignore_test_dir):
 71 |     """Test generating PathSpec from .ccignore files."""
 72 |     # Root directory should only exclude .log files
 73 |     root_spec = get_ccignore_spec(ccignore_test_dir)
 74 |     assert root_spec.match_file("test.log")
 75 |     assert not root_spec.match_file("test.json")
 76 |     assert not root_spec.match_file("test.md")
 77 | 
 78 |     # Subdirectory should exclude .log and .json files
 79 |     subdir_spec = get_ccignore_spec(ccignore_test_dir / "subdir")
 80 |     assert subdir_spec.match_file("test.log")
 81 |     assert subdir_spec.match_file("test.json")
 82 |     assert not subdir_spec.match_file("test.md")
 83 | 
 84 |     # Nested subdirectory should exclude .log, .json, and .md files
 85 |     nested_spec = get_ccignore_spec(ccignore_test_dir / "subdir" / "nested")
 86 |     assert nested_spec.match_file("test.log")
 87 |     assert nested_spec.match_file("test.json")
 88 |     assert nested_spec.match_file("test.md")
 89 | 
 90 | 
 91 | def test_scan_directory_with_ccignore(ccignore_test_dir):
 92 |     """Test that scan_directory respects .ccignore patterns."""
 93 |     # Scan the root directory - should exclude .log files
 94 |     files = scan_directory(ccignore_test_dir, include=["txt", "json", "md", "log"])
 95 |     paths = {str(f) for f in files}
 96 | 
 97 |     # Root dir - .log should be excluded, others included
 98 |     assert not any(p.endswith("root.log") for p in paths)
 99 |     assert any(p.endswith("root.txt") for p in paths)
100 |     assert any(p.endswith("root.json") for p in paths)
101 |     assert any(p.endswith("root.md") for p in paths)
102 | 
103 |     # Subdir - .log and .json should be excluded, others included
104 |     assert not any(p.endswith("subdir.log") for p in paths)
105 |     assert not any(p.endswith("subdir.json") for p in paths)
106 |     assert any(p.endswith("subdir.txt") for p in paths)
107 |     assert any(p.endswith("subdir.md") for p in paths)
108 | 
109 |     # Nested subdir - .log, .json, and .md should be excluded, others included
110 |     assert not any(p.endswith("nested.log") for p in paths)
111 |     assert not any(p.endswith("nested.json") for p in paths)
112 |     assert not any(p.endswith("nested.md") for p in paths)
113 |     assert any(p.endswith("nested.txt") for p in paths)
114 | 
115 | 
116 | def test_ccignore_with_extra_patterns(ccignore_test_dir):
117 |     """Test that extra exclude patterns work with .ccignore."""
118 |     # Add extra exclude pattern for .txt files
119 |     spec = get_ccignore_spec(ccignore_test_dir, extra_patterns=["*.txt"])
120 | 
121 |     # Should exclude both .log files (from .ccignore) and .txt files (from extra patterns)
122 |     assert spec.match_file("test.log")
123 |     assert spec.match_file("test.txt")
124 |     assert not spec.match_file("test.json")
125 | 


--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
  1 | from typer.testing import CliRunner
  2 | from copychat.cli import app
  3 | import pyperclip
  4 | import re
  5 | from pathlib import Path
  6 | 
  7 | runner = CliRunner()
  8 | 
  9 | 
 10 | def strip_ansi(text: str) -> str:
 11 |     """Remove ANSI escape codes from text."""
 12 |     ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
 13 |     return ansi_escape.sub("", text)
 14 | 
 15 | 
 16 | def test_cli_default_behavior(tmp_path, monkeypatch):
 17 |     """Test that default behavior copies to clipboard."""
 18 |     # Create a test file
 19 |     test_file = tmp_path / "test.py"
 20 |     test_file.write_text("print('hello')")
 21 | 
 22 |     # Mock pyperclip.copy
 23 |     copied_content = []
 24 | 
 25 |     def mock_copy(text):
 26 |         copied_content.append(text)
 27 | 
 28 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
 29 | 
 30 |     # Run CLI
 31 |     result = runner.invoke(app, [str(tmp_path)])
 32 | 
 33 |     assert result.exit_code == 0
 34 |     assert len(copied_content) == 1
 35 |     assert 'language="python"' in copied_content[0]
 36 |     assert "print('hello')" in copied_content[0]
 37 | 
 38 | 
 39 | def test_cli_output_file(tmp_path, monkeypatch):
 40 |     """Test writing output to file."""
 41 |     # Create a test file
 42 |     test_file = tmp_path / "test.py"
 43 |     test_file.write_text("print('hello')")
 44 | 
 45 |     # Create output file path
 46 |     out_file = tmp_path / "output.md"
 47 | 
 48 |     # Mock pyperclip.copy
 49 |     monkeypatch.setattr(pyperclip, "copy", lambda x: None)
 50 | 
 51 |     # Run CLI
 52 |     result = runner.invoke(app, [str(tmp_path), "--out", str(out_file)])
 53 | 
 54 |     assert result.exit_code == 0
 55 |     assert out_file.exists()
 56 |     content = out_file.read_text()
 57 |     assert 'language="python"' in content
 58 |     assert "print('hello')" in content
 59 | 
 60 | 
 61 | def test_cli_print_output(tmp_path, monkeypatch):
 62 |     """Test printing output to screen."""
 63 |     # Create a test file
 64 |     test_file = tmp_path / "test.py"
 65 |     test_file.write_text("print('hello')")
 66 | 
 67 |     # Mock pyperclip.copy
 68 |     monkeypatch.setattr(pyperclip, "copy", lambda x: None)
 69 | 
 70 |     # Run CLI
 71 |     result = runner.invoke(app, [str(tmp_path), "--print"])
 72 | 
 73 |     assert result.exit_code == 0
 74 |     assert 'language="python"' in result.stdout
 75 |     assert "print('hello')" in result.stdout
 76 | 
 77 | 
 78 | def test_cli_no_files_found(tmp_path):
 79 |     """Test behavior when no matching files are found."""
 80 |     # Create a non-matching file
 81 |     test_file = tmp_path / "test.txt"
 82 |     test_file.write_text("hello")
 83 | 
 84 |     # Run CLI with filter for .py files only
 85 |     result = runner.invoke(app, [str(tmp_path), "--include", "py"])
 86 | 
 87 |     # Since this is expected behavior, CLI should exit with code 0 rather than 1
 88 |     assert result.exit_code == 0
 89 |     assert "Found 0 matching files" in strip_ansi(result.stderr)
 90 | 
 91 | 
 92 | def test_cli_multiple_outputs(tmp_path, monkeypatch):
 93 |     """Test combining output options."""
 94 |     # Create a test file
 95 |     test_file = tmp_path / "test.py"
 96 |     test_file.write_text("print('hello')")
 97 | 
 98 |     # Create output file path
 99 |     out_file = tmp_path / "output.md"
100 | 
101 |     # Mock pyperclip.copy and paste
102 |     copied_content = []
103 | 
104 |     def mock_copy(text):
105 |         copied_content.append(text)
106 | 
107 |     # Since we're using output file, clipboard copy won't happen
108 |     # Instead just check the file output and stdout
109 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
110 | 
111 |     # Run CLI with both file output and print
112 |     result = runner.invoke(app, [str(tmp_path), "--out", str(out_file), "--print"])
113 | 
114 |     assert result.exit_code == 0
115 | 
116 |     # Check file
117 |     assert out_file.exists()
118 |     file_content = out_file.read_text()
119 |     assert 'language="python"' in file_content
120 | 
121 |     # Check stdout
122 |     assert 'language="python"' in result.stdout
123 | 
124 | 
125 | def test_cli_append_file(tmp_path, monkeypatch):
126 |     """Test appending output to an existing file."""
127 |     # Create a test file to scan
128 |     test_file = tmp_path / "test.py"
129 |     test_file.write_text("print('hello')")
130 | 
131 |     # Create existing output file with content
132 |     out_file = tmp_path / "output.md"
133 |     out_file.write_text("existing content\n")
134 | 
135 |     # Mock pyperclip.copy
136 |     monkeypatch.setattr(pyperclip, "copy", lambda x: None)
137 | 
138 |     # Run CLI with append flag
139 |     result = runner.invoke(app, [str(tmp_path), "--out", str(out_file), "--append"])
140 | 
141 |     assert result.exit_code == 0
142 |     content = out_file.read_text()
143 |     assert "existing content" in content
144 |     assert 'language="python"' in content
145 |     assert "print('hello')" in content
146 | 
147 | 
148 | def test_cli_append_clipboard(tmp_path, monkeypatch):
149 |     """Test appending output to clipboard content."""
150 |     # Create a test file
151 |     test_file = tmp_path / "test.py"
152 |     test_file.write_text("print('new content')")
153 | 
154 |     # Mock clipboard content and operations
155 |     clipboard_content = ["existing clipboard content"]
156 | 
157 |     def mock_copy(text):
158 |         clipboard_content[0] = text
159 | 
160 |     def mock_paste():
161 |         return clipboard_content[0]
162 | 
163 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
164 |     monkeypatch.setattr(pyperclip, "paste", mock_paste)
165 | 
166 |     # Run CLI with append flag
167 |     result = runner.invoke(app, [str(tmp_path), "--append"])
168 | 
169 |     assert result.exit_code == 0
170 |     assert "existing clipboard content" in clipboard_content[0]
171 |     assert 'language="python"' in clipboard_content[0]
172 |     assert "print('new content')" in clipboard_content[0]
173 | 
174 | 
175 | def test_cli_exclude_pattern(tmp_path, monkeypatch):
176 |     """Test excluding files with patterns."""
177 |     # Create test files
178 |     py_file = tmp_path / "code.py"
179 |     py_file.write_text("print('include me')")
180 | 
181 |     js_file = tmp_path / "script.js"
182 |     js_file.write_text("console.log('exclude me')")
183 | 
184 |     # Mock pyperclip.copy
185 |     copied_content = []
186 | 
187 |     def mock_copy(text):
188 |         copied_content.append(text)
189 | 
190 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
191 | 
192 |     # Run CLI with exclude pattern for JS files
193 |     result = runner.invoke(app, [str(tmp_path), "--exclude", "*.js"])
194 | 
195 |     assert result.exit_code == 0
196 |     assert len(copied_content) == 1
197 |     assert "print('include me')" in copied_content[0]
198 |     assert "console.log('exclude me')" not in copied_content[0]
199 | 
200 | 
201 | def test_cli_directory_depth(tmp_path, monkeypatch):
202 |     """Test limiting directory scan depth."""
203 |     # Create nested directory structure
204 |     level1 = tmp_path / "level1"
205 |     level1.mkdir()
206 |     level1_file = level1 / "level1.py"
207 |     level1_file.write_text("print('level1')")
208 | 
209 |     level2 = level1 / "level2"
210 |     level2.mkdir()
211 |     level2_file = level2 / "level2.py"
212 |     level2_file.write_text("print('level2')")
213 | 
214 |     # Mock pyperclip.copy
215 |     copied_content = []
216 | 
217 |     def mock_copy(text):
218 |         copied_content.append(text)
219 | 
220 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
221 | 
222 |     # Run CLI with depth=1 (should only include level1 directory)
223 |     result = runner.invoke(app, [str(tmp_path), "--depth", "1"])
224 | 
225 |     assert result.exit_code == 0
226 |     assert len(copied_content) == 1
227 |     assert "print('level1')" in copied_content[0]
228 |     assert "print('level2')" not in copied_content[0]
229 | 
230 | 
231 | def test_cli_verbose_output(tmp_path, monkeypatch):
232 |     """Test verbose output includes file metadata."""
233 |     # Create a test file
234 |     test_file = tmp_path / "test.py"
235 |     test_file.write_text("print('hello')")
236 | 
237 |     # Mock pyperclip.copy
238 |     copied_content = []
239 | 
240 |     def mock_copy(text):
241 |         copied_content.append(text)
242 | 
243 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
244 | 
245 |     # Run CLI with verbose flag
246 |     result = runner.invoke(app, [str(tmp_path), "--verbose"])
247 | 
248 |     assert result.exit_code == 0
249 |     assert len(copied_content) == 1
250 | 
251 |     # Verbose output should include file metadata header with summary
252 |     # header_content = copied_content[0].split("```")[0]
253 |     assert "File summary" in strip_ansi(result.stderr)
254 |     assert (
255 |         "Files: 1" in strip_ansi(result.stderr)
256 |         or "1 file" in strip_ansi(result.stderr).lower()
257 |     )
258 | 
259 | 
260 | def test_cli_github_item_basic(monkeypatch):
261 |     """Basic test for GitHub item handling that doesn't rely on internal implementation."""
262 |     runner = CliRunner()
263 | 
264 |     # Instead of mocking complex internals, just provide a simple mock for the scan_directory function
265 |     # so it returns a known result when the CLI processes a GitHub item
266 |     def mock_scan_empty(directory, **kwargs):
267 |         """Return empty dict to ensure our mock item is the only one processed."""
268 |         return {}
269 | 
270 |     # Mock clipboard operations
271 |     copied = []
272 |     monkeypatch.setattr(pyperclip, "copy", lambda x: copied.append(x))
273 | 
274 |     # Replace scan_directory with our mock to avoid file system dependencies
275 |     monkeypatch.setattr("copychat.cli.scan_directory", mock_scan_empty)
276 | 
277 |     # Run the CLI with a mocked item
278 |     # The exact format doesn't matter as we're not testing the GitHub API integration
279 |     result = runner.invoke(app, ["owner/repo#123"], catch_exceptions=False)
280 | 
281 |     # We expect either:
282 |     # 1. Success (exit_code=0) if the mock returns results, or
283 |     # 2. "Found 0 matching files" message (exit_code=0) if mocking couldn't succeed
284 |     # Either way, we've tested that the CLI can handle the GitHub item format
285 |     assert result.exit_code == 0 or "No module named 'requests'" in result.stderr
286 | 
287 |     # If we failed to fetch anything due to missing requests library
288 |     # at least make sure we attempted to parse the GitHub item format
289 |     if result.exit_code != 0:
290 |         assert "owner/repo#123" in result.stderr or "GitHub" in result.stderr
291 | 
292 | 
293 | def test_table_alignment_with_dot_path(tmp_path, monkeypatch):
294 |     """Test table alignment when path resolves to '.'"""
295 |     # Create a test file
296 |     test_file = tmp_path / "test.md"
297 |     test_file.write_text("# Test content")
298 | 
299 |     # Mock relative_to so it returns "." path
300 |     original_relative_to = Path.relative_to
301 | 
302 |     def mock_relative_to(self, other):
303 |         # Always return a path that is just "."
304 |         if str(self) == str(test_file):
305 |             return Path(".")
306 |         return original_relative_to(self, other)
307 | 
308 |     monkeypatch.setattr(Path, "relative_to", mock_relative_to)
309 | 
310 |     # Mock pyperclip.copy
311 |     copied_content = []
312 | 
313 |     def mock_copy(text):
314 |         copied_content.append(text)
315 | 
316 |     monkeypatch.setattr(pyperclip, "copy", mock_copy)
317 | 
318 |     # Run CLI with verbose flag
319 |     result = runner.invoke(app, [str(test_file), "--verbose"])
320 | 
321 |     assert result.exit_code == 0
322 | 
323 |     # Ensure table is properly aligned in the output
324 |     table_output = strip_ansi(result.stderr)
325 | 
326 |     # The "Path" header and first column content should be aligned
327 |     path_header_idx = table_output.find("│ Path")
328 |     assert path_header_idx > 0, "Path header not found in table"
329 | 
330 |     # Extract the table rows by looking for lines with │ characters
331 |     table_lines = [line for line in table_output.split("\n") if "│" in line]
332 | 
333 |     # Verify there are at least a header row and a data row
334 |     assert len(table_lines) >= 2, "Table should have header and data rows"
335 | 
336 |     # Check that columns align vertically - the first │ should be at the same position in each row
337 |     positions = [line.find("│") for line in table_lines]
338 |     assert len(set(positions)) == 1, "Misaligned table columns (first pipe)"
339 | 
340 |     # Check that second │ (after Path column) aligns in all rows
341 |     positions = [line.find("│", positions[0] + 1) for line in table_lines]
342 |     assert len(set(positions)) == 1, "Misaligned table columns (second pipe)"
343 | 
344 |     # Confirm test.md appears in the table with proper alignment
345 |     assert "test.md" in table_output, "Filename should appear in table output"
346 | 


--------------------------------------------------------------------------------
/tests/test_core.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from copychat.core import (
  3 |     find_gitignore,
  4 |     DiffMode,
  5 |     is_glob_pattern,
  6 |     resolve_paths,
  7 |     scan_directory,
  8 |     scan_files,
  9 | )
 10 | from pathlib import Path
 11 | 
 12 | 
 13 | def test_diff_mode_enum():
 14 |     """Test DiffMode enum values."""
 15 |     assert DiffMode.FULL.value == "full"
 16 |     assert DiffMode.FULL_WITH_DIFF.value == "full-with-diff"
 17 |     assert DiffMode.CHANGED_WITH_DIFF.value == "changed-with-diff"
 18 |     assert DiffMode.DIFF_ONLY.value == "diff-only"
 19 | 
 20 | 
 21 | def test_is_glob_pattern():
 22 |     """Test glob pattern detection."""
 23 |     assert is_glob_pattern("*.py")
 24 |     assert is_glob_pattern("src/**/*.js")
 25 |     assert is_glob_pattern("test/*")
 26 |     assert not is_glob_pattern("src/main.py")
 27 |     assert not is_glob_pattern("path/to/file")
 28 | 
 29 | 
 30 | def test_resolve_paths(tmp_path):
 31 |     """Test path resolution with glob patterns."""
 32 |     # Create test files
 33 |     (tmp_path / "test1.py").touch()
 34 |     (tmp_path / "test2.py").touch()
 35 |     (tmp_path / "src").mkdir()
 36 |     (tmp_path / "src" / "main.py").touch()
 37 |     (tmp_path / "src" / "util.js").touch()
 38 | 
 39 |     # Test glob resolution
 40 |     paths = resolve_paths(["*.py", "src/**/*.py"], base_path=tmp_path)
 41 |     assert len(paths) == 3
 42 |     assert tmp_path / "test1.py" in paths
 43 |     assert tmp_path / "test2.py" in paths
 44 |     assert tmp_path / "src" / "main.py" in paths
 45 | 
 46 |     # Test mixed glob and regular paths
 47 |     paths = resolve_paths(["src", "*.py"], base_path=tmp_path)
 48 |     assert len(paths) == 3
 49 |     assert tmp_path / "src" in paths
 50 | 
 51 | 
 52 | @pytest.fixture
 53 | def git_repo(tmp_path):
 54 |     """Create a temporary git repository with a .gitignore file."""
 55 |     gitignore = tmp_path / ".gitignore"
 56 |     gitignore.write_text("*.pyc\n__pycache__/\n")
 57 |     return tmp_path
 58 | 
 59 | 
 60 | def test_scan_with_glob_patterns():
 61 |     # Create test directory and files if they don't exist
 62 |     test_dir = Path("tests/data")
 63 |     test_dir.mkdir(parents=True, exist_ok=True)
 64 | 
 65 |     with open(test_dir / "test1.txt", "w") as f:
 66 |         f.write("This is a test file")
 67 |     with open(test_dir / "test2.md", "w") as f:
 68 |         f.write("This is another test file")
 69 | 
 70 |     files = scan_files(["*.txt", "*.md"], test_dir)
 71 |     assert len(files) == 2
 72 | 
 73 | 
 74 | def test_find_gitignore_exists(git_repo):
 75 |     """Test finding .gitignore in current directory."""
 76 |     result = find_gitignore(git_repo)
 77 |     assert result == git_repo / ".gitignore"
 78 | 
 79 | 
 80 | def test_find_gitignore_parent(git_repo):
 81 |     """Test finding .gitignore in parent directory."""
 82 |     child_dir = git_repo / "subdir"
 83 |     child_dir.mkdir()
 84 |     result = find_gitignore(child_dir)
 85 |     assert result == git_repo / ".gitignore"
 86 | 
 87 | 
 88 | def test_find_gitignore_not_found(tmp_path):
 89 |     """Test behavior when no .gitignore is found."""
 90 |     result = find_gitignore(tmp_path)
 91 |     assert result is None
 92 | 
 93 | 
 94 | def test_scan_with_recursive_glob(tmp_path):
 95 |     """Test scanning with recursive glob patterns."""
 96 |     # Create nested test files
 97 |     (tmp_path / "test1.py").write_text("print('test1')")
 98 |     deep_dir = tmp_path / "very" / "deep" / "nested"
 99 |     deep_dir.mkdir(parents=True)
100 |     (deep_dir / "test2.py").write_text("print('test2')")
101 |     (deep_dir / "test.js").write_text("console.log('test')")
102 | 
103 |     # Test recursive glob pattern
104 |     files = scan_directory(
105 |         tmp_path, include=["py"]
106 |     )  # Changed from tmp_path / "**/*.py"
107 |     assert len(files) == 2
108 |     assert any("test1.py" in str(p) for p in files)
109 |     assert any("test2.py" in str(p) for p in files)
110 | 
111 |     # Test from within subdirectory
112 |     subdir_files = scan_directory(
113 |         tmp_path / "very", include=["py"]
114 |     )  # Changed from tmp_path / "very" / "**/*.py"
115 |     assert len(subdir_files) == 1
116 |     assert any("test2.py" in str(p) for p in subdir_files)
117 | 
118 | 
119 | def test_scan_single_file(tmp_path):
120 |     """Test scanning a single file."""
121 |     # Create a test file
122 |     test_file = tmp_path / "test.py"
123 |     test_file.write_text("print('hello world')")
124 | 
125 |     # Create some other files that shouldn't be included
126 |     (tmp_path / "other.py").write_text("print('other')")
127 |     (tmp_path / "test.js").write_text("console.log('test')")
128 | 
129 |     # Test scanning just the single file
130 |     files = scan_directory(test_file, include=["py"])
131 | 
132 |     # Should only contain our specific file
133 |     assert len(files) == 1
134 |     assert test_file in files
135 |     assert files[test_file] == "print('hello world')"
136 | 
137 |     # Test with non-matching extension filter
138 |     files = scan_directory(test_file, include=["js"])
139 |     assert len(files) == 0
140 | 
141 |     # Test with non-existent file
142 |     files = scan_directory(tmp_path / "nonexistent.py", include=["py"])
143 |     assert len(files) == 0
144 | 


--------------------------------------------------------------------------------
/tests/test_format.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | import pytest
  3 | from copychat.format import (
  4 |     guess_language,
  5 |     format_file,
  6 |     create_header,
  7 |     estimate_tokens,
  8 |     format_files,
  9 | )
 10 | 
 11 | 
 12 | @pytest.fixture
 13 | def temp_files(tmp_path):
 14 |     """Create temporary test files."""
 15 |     # Create a python file
 16 |     py_file = tmp_path / "test.py"
 17 |     py_file.write_text("def hello():\n    print('world')")
 18 | 
 19 |     # Create a javascript file
 20 |     js_file = tmp_path / "test.js"
 21 |     js_file.write_text("function hello() {\n    console.log('world');\n}")
 22 | 
 23 |     return tmp_path, [py_file, js_file]
 24 | 
 25 | 
 26 | def test_guess_language():
 27 |     """Test language detection from file extensions."""
 28 |     assert guess_language(Path("test.py")) == "python"
 29 |     assert guess_language(Path("test.js")) == "javascript"
 30 |     assert guess_language(Path("test.tsx")) == "tsx"
 31 |     assert guess_language(Path("test.unknown")) is None
 32 | 
 33 | 
 34 | def test_format_file(temp_files):
 35 |     """Test single file formatting."""
 36 |     root_path, (py_file, _) = temp_files
 37 | 
 38 |     formatted_file = format_file(py_file, root_path)
 39 |     result = formatted_file.formatted_content
 40 | 
 41 |     assert "<file" in result
 42 |     assert 'path="test.py"' in result
 43 |     assert 'language="python"' in result
 44 |     assert "def hello():" in result
 45 |     assert "print('world')" in result
 46 | 
 47 | 
 48 | def test_create_header(temp_files):
 49 |     """Test header creation."""
 50 |     root_path, files = temp_files
 51 | 
 52 |     # Create a FormatResult to pass to create_header
 53 |     file_contents = [(f, f.read_text()) for f in files]
 54 |     format_result = format_files(file_contents)
 55 | 
 56 |     header = create_header(format_result)
 57 | 
 58 |     assert "Generated by copychat on" in header
 59 |     assert "Root path:" in header
 60 |     assert "Summary:" in header
 61 |     assert "test.py" in header
 62 |     assert "test.js" in header
 63 | 
 64 | 
 65 | def test_estimate_tokens():
 66 |     """Test token estimation."""
 67 |     text = "Hello, world! This is a test."
 68 |     tokens = estimate_tokens(text)
 69 |     assert tokens > 0
 70 |     assert isinstance(tokens, int)
 71 | 
 72 | 
 73 | def test_format_files(temp_files):
 74 |     """Test formatting multiple files."""
 75 |     root_path, files = temp_files
 76 | 
 77 |     # Pass a list of tuples (Path, str) to format_files
 78 |     file_contents = [(f, f.read_text()) for f in files]
 79 |     format_result = format_files(file_contents)
 80 |     result = str(format_result)
 81 | 
 82 |     # Check header
 83 |     assert "Generated by copychat" in result
 84 | 
 85 |     # Check both files are included
 86 |     assert 'path="test.py"' in result
 87 |     assert 'path="test.js"' in result
 88 | 
 89 |     # Check content
 90 |     assert "def hello():" in result
 91 |     assert "console.log('world');" in result
 92 | 
 93 | 
 94 | def test_format_files_empty():
 95 |     """Test formatting with no files."""
 96 |     format_result = format_files([])
 97 |     result = str(format_result)
 98 |     assert "No files found" in result
 99 | 
100 | 
101 | def test_format_file_error(tmp_path):
102 |     """Test handling of file read errors."""
103 |     non_existent = tmp_path / "does_not_exist.py"
104 |     formatted_file = format_file(non_existent, tmp_path)
105 |     result = formatted_file.formatted_content
106 |     assert "Error processing" in result
107 | 


--------------------------------------------------------------------------------
/tests/test_github_item.py:
--------------------------------------------------------------------------------
  1 | from copychat.sources import GitHubItem
  2 | 
  3 | 
  4 | class DummyResponse:
  5 |     def __init__(self, data, status=200, is_text=False):
  6 |         self._data = data
  7 |         self.status_code = status
  8 |         self.ok = status == 200
  9 |         self._is_text = is_text
 10 | 
 11 |     def raise_for_status(self):
 12 |         if not self.ok:
 13 |             raise Exception("status")
 14 | 
 15 |     def json(self):
 16 |         return self._data
 17 | 
 18 |     @property
 19 |     def text(self):
 20 |         return self._data if self._is_text else ""
 21 | 
 22 | 
 23 | def test_github_item_fetch(monkeypatch):
 24 |     """GitHubItem should format issue and comments."""
 25 | 
 26 |     issue_data = {
 27 |         "title": "Test issue",
 28 |         "body": "Body text",
 29 |         "comments_url": "http://example.com/comments",
 30 |         "pull_request": {},
 31 |         "html_url": "https://github.com/owner/repo/pull/1",
 32 |         "user": {"login": "testuser"},
 33 |         "created_at": "2024-01-01",
 34 |         "updated_at": "2024-01-02",
 35 |         "state": "open",
 36 |     }
 37 |     comments = [{"user": {"login": "alice"}, "created_at": "2024-01-01", "body": "hi"}]
 38 |     reviews = [
 39 |         {
 40 |             "user": {"login": "bob"},
 41 |             "created_at": "2024-01-02",
 42 |             "path": "file.py",
 43 |             "body": "looks good",
 44 |         }
 45 |     ]
 46 | 
 47 |     calls = []
 48 | 
 49 |     def fake_get(url, headers=None, timeout=0):
 50 |         calls.append(url)
 51 |         if "comments" in url and "pulls" in url:
 52 |             return DummyResponse(reviews)
 53 |         if "comments" in url:
 54 |             return DummyResponse(comments)
 55 |         return DummyResponse(issue_data)
 56 | 
 57 |     monkeypatch.setattr("requests.get", fake_get)
 58 | 
 59 |     item = GitHubItem("owner/repo", 1)
 60 |     path, content = item.fetch()
 61 | 
 62 |     assert path.name == "owner_repo_pr_1.md"
 63 |     assert "Test issue" in content
 64 |     assert "alice" in content
 65 |     assert "looks good" in content
 66 |     assert "**Pull Request**" in content
 67 |     assert "**Status**: OPEN" in content
 68 |     assert "**Author**: testuser" in content
 69 |     assert "https://github.com/owner/repo/pull/1" in content
 70 |     assert any("pulls" in c for c in calls)
 71 | 
 72 | 
 73 | def test_github_item_fetch_with_diff(monkeypatch):
 74 |     """GitHubItem should include PR diff when available."""
 75 | 
 76 |     issue_data = {
 77 |         "title": "Test PR",
 78 |         "body": "PR description",
 79 |         "comments_url": "http://example.com/comments",
 80 |         "pull_request": {},
 81 |         "html_url": "https://github.com/owner/repo/pull/2",
 82 |         "user": {"login": "testuser"},
 83 |         "created_at": "2024-01-01",
 84 |         "updated_at": "2024-01-02",
 85 |         "state": "open",
 86 |     }
 87 |     comments = []
 88 |     reviews = []
 89 |     diff_content = """diff --git a/file.txt b/file.txt
 90 | index abc123..def456 100644
 91 | --- a/file.txt
 92 | +++ b/file.txt
 93 | @@ -1,3 +1,3 @@
 94 |  Line 1
 95 | -Line 2
 96 | +Line 2 modified
 97 |  Line 3"""
 98 | 
 99 |     calls = []
100 |     headers_received = {}
101 | 
102 |     def fake_get(url, headers=None, timeout=0):
103 |         calls.append(url)
104 |         if headers:
105 |             headers_received[url] = headers
106 | 
107 |         if "diff" in headers.get("Accept", "") and "pulls" in url:
108 |             return DummyResponse(diff_content, is_text=True)
109 |         if "comments" in url and "pulls" in url:
110 |             return DummyResponse(reviews)
111 |         if "comments" in url:
112 |             return DummyResponse(comments)
113 |         return DummyResponse(issue_data)
114 | 
115 |     monkeypatch.setattr("requests.get", fake_get)
116 | 
117 |     item = GitHubItem("owner/repo", 2)
118 |     path, content = item.fetch()
119 | 
120 |     assert path.name == "owner_repo_pr_2.md"
121 |     assert "Test PR" in content
122 |     assert "PR description" in content
123 |     assert "**Pull Request**" in content
124 |     assert "## PR Diff" in content
125 |     assert "```diff" in content
126 |     assert "+Line 2 modified" in content
127 |     assert "application/vnd.github.diff" in headers_received.get(
128 |         "https://api.github.com/repos/owner/repo/pulls/2", {}
129 |     ).get("Accept", "")
130 | 


--------------------------------------------------------------------------------
/tests/test_integration.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from copychat.core import scan_directory, DiffMode
  3 | from copychat.format import format_files
  4 | 
  5 | 
  6 | def test_basic_scan(sample_project):
  7 |     """Test basic file scanning functionality."""
  8 |     files = scan_directory(
  9 |         sample_project,
 10 |         include=["py", "js", "css"],
 11 |     )
 12 | 
 13 |     # Check we found the expected file types
 14 |     extensions = {f.suffix.lstrip(".") for f in files}
 15 |     assert extensions == {"py", "js", "css"}
 16 | 
 17 |     # Check we found files in nested directories
 18 |     assert any("utils" in str(f) for f in files)
 19 |     assert any("styles" in str(f) for f in files)
 20 | 
 21 | 
 22 | def test_gitignore_handling(sample_project):
 23 |     """Test that .gitignore patterns are respected."""
 24 |     files = scan_directory(sample_project, include=["py", "env"])
 25 | 
 26 |     # These should be excluded by .gitignore
 27 |     paths = {str(f) for f in files}
 28 |     assert not any(f.endswith(".pyc") for f in paths)
 29 |     assert not any("__pycache__" in f for f in paths)
 30 |     assert not any(f.endswith(".env") for f in paths)
 31 | 
 32 | 
 33 | def test_formatting_output(sample_project):
 34 |     """Test that output is formatted correctly."""
 35 |     # Get files and format them
 36 |     files = scan_directory(sample_project, include=["py", "js"])
 37 |     format_result = format_files([(f, f.read_text()) for f in files])
 38 |     result = str(format_result)
 39 | 
 40 |     # Check for file content without line numbers
 41 |     assert "def main():" in result  # Remove the "1|" prefix
 42 |     assert 'print("Hello from main!")' in result
 43 |     assert "function App()" in result
 44 |     assert "def calculate_total" in result
 45 | 
 46 | 
 47 | def test_different_file_types(sample_project):
 48 |     """Test handling of different file types."""
 49 |     files = scan_directory(
 50 |         sample_project,
 51 |         include=["yml", "sql", "ts", "md"],
 52 |     )
 53 |     # Convert files to (path, content) tuples
 54 |     files_with_content = [(f, f.read_text()) for f in files]
 55 |     format_result = format_files(files_with_content)
 56 |     result = str(format_result)
 57 | 
 58 |     # Check various file types are properly formatted
 59 |     assert 'language="yaml"' in result
 60 |     assert 'language="sql"' in result
 61 |     assert 'language="typescript"' in result
 62 |     assert 'language="markdown"' in result
 63 | 
 64 |     # Check content snippets from each type
 65 |     assert "CREATE TABLE users" in result
 66 |     assert "interface User" in result
 67 |     assert "TestApp" in result
 68 |     assert "# Test Project" in result
 69 | 
 70 | 
 71 | def test_exclusion_patterns(sample_project):
 72 |     """Test explicit exclusion patterns."""
 73 |     files = scan_directory(
 74 |         sample_project,
 75 |         include=["py", "js"],
 76 |         exclude_patterns=["**/utils/*"],  # Exclude utils directory
 77 |     )
 78 | 
 79 |     paths = {str(f) for f in files}
 80 |     assert not any("utils" in p for p in paths)
 81 |     assert any("main.py" in p for p in paths)
 82 | 
 83 | 
 84 | def test_empty_directory(tmp_path):
 85 |     """Test handling of empty directories."""
 86 |     files = scan_directory(tmp_path)
 87 |     format_result = format_files([(f, f.read_text()) for f in list(files)])
 88 |     result = str(format_result)
 89 |     assert "No files found" in result
 90 | 
 91 | 
 92 | def test_header_metadata(sample_project):
 93 |     """Test header metadata in formatted output."""
 94 |     files = scan_directory(sample_project, include=["py"])
 95 |     # Convert files to (path, content) tuples
 96 |     files_with_content = [(f, f.read_text()) for f in files]
 97 |     format_result = format_files(files_with_content)
 98 |     result = str(format_result)
 99 | 
100 |     # Check header contains important metadata
101 |     assert "Generated by copychat on" in result
102 |     assert "Root path:" in result
103 |     assert "Summary:" in result  # Changed from Files:
104 | 
105 |     # Check file path info in header table format
106 |     assert "Path" in result
107 |     assert "Tokens" in result
108 |     assert "Lines" in result
109 |     # Paths will be in table rows instead of list format
110 | 
111 | 
112 | @pytest.mark.parametrize(
113 |     "diff_mode",
114 |     [
115 |         DiffMode.FULL,
116 |         DiffMode.FULL_WITH_DIFF,
117 |         # Removing these modes for now as they require git setup
118 |         # DiffMode.CHANGED_WITH_DIFF,
119 |         # DiffMode.DIFF_ONLY,
120 |     ],
121 | )
122 | def test_diff_modes(sample_project, diff_mode):
123 |     """Test different diff modes."""
124 |     files = scan_directory(
125 |         sample_project,
126 |         include=["py"],
127 |         diff_mode=diff_mode,
128 |     )
129 |     assert len(files) > 0
130 | 
131 | 
132 | def test_token_estimation(sample_project):
133 |     """Test token estimation functionality."""
134 |     files = scan_directory(sample_project, include=["py", "js"])
135 |     # Convert files to (path, content) tuples
136 |     files_with_content = [(f, f.read_text()) for f in files]
137 |     format_result = format_files(files_with_content)
138 |     result = str(format_result)
139 | 
140 |     # Result should include token info in header
141 |     assert "tokens" in result.lower()
142 | 
143 |     # Basic sanity check - content should be non-empty
144 |     assert len(result) > 0
145 | 
146 | 
147 | def test_error_handling(sample_project, tmp_path):
148 |     """Test error handling for problematic files."""
149 |     try:
150 |         # Create an unreadable file in the temporary directory
151 |         bad_file = tmp_path / "bad.py"
152 |         bad_file.write_text("def bad():\n    pass\n")
153 |         bad_file.chmod(0o000)  # Remove read permissions
154 | 
155 |         # Include both the sample project and the tmp directory
156 |         files = scan_directory(sample_project, include=["py"])
157 |         files = list(files)
158 |         files.append(bad_file)
159 | 
160 |         # Convert files to (path, content) tuples, handling potential read errors
161 |         files_with_content = []
162 |         for f in files:
163 |             try:
164 |                 content = f.read_text()
165 |                 files_with_content.append((f, content))
166 |             except (PermissionError, OSError):
167 |                 # Still include the file, but with empty content
168 |                 files_with_content.append((f, ""))
169 | 
170 |         format_result = format_files(files_with_content)
171 |         result = str(format_result)
172 | 
173 |         # Check that the bad file is mentioned in the result
174 |         assert "bad.py" in result
175 | 
176 |         # Should still process good files
177 |         assert "main.py" in result
178 |         assert "def main():" in result
179 |     finally:
180 |         # Cleanup
181 |         try:
182 |             bad_file.chmod(0o666)
183 |             bad_file.unlink()
184 |         except Exception:
185 |             pass
186 | 


--------------------------------------------------------------------------------
/tests/test_patterns.py:
--------------------------------------------------------------------------------
 1 | from copychat.patterns import (
 2 |     DEFAULT_EXTENSIONS,
 3 |     EXCLUDED_DIRS,
 4 |     EXCLUDED_PATTERNS,
 5 | )
 6 | 
 7 | 
 8 | def test_default_extensions():
 9 |     """Test default extensions are properly defined."""
10 |     assert isinstance(DEFAULT_EXTENSIONS, set)
11 |     assert "py" in DEFAULT_EXTENSIONS
12 |     assert "js" in DEFAULT_EXTENSIONS
13 |     assert "md" in DEFAULT_EXTENSIONS
14 | 
15 | 
16 | def test_excluded_dirs():
17 |     """Test excluded directories are properly defined."""
18 |     assert isinstance(EXCLUDED_DIRS, set)
19 |     assert ".git" in EXCLUDED_DIRS
20 |     assert "node_modules" in EXCLUDED_DIRS
21 |     assert "__pycache__" in EXCLUDED_DIRS
22 | 
23 | 
24 | def test_excluded_patterns():
25 |     """Test excluded patterns are properly defined."""
26 |     assert isinstance(EXCLUDED_PATTERNS, set)
27 |     assert "*.pyc" in EXCLUDED_PATTERNS
28 |     assert "*.log" in EXCLUDED_PATTERNS
29 |     assert ".env" in EXCLUDED_PATTERNS
30 | 


--------------------------------------------------------------------------------
/tests/test_sources.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import shutil
 3 | from copychat.sources import GitHubSource
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def temp_cache_dir(tmp_path):
 8 |     """Create temporary cache directory."""
 9 |     cache_dir = tmp_path / "cache"
10 |     cache_dir.mkdir()
11 |     yield cache_dir
12 |     # Cleanup
13 |     if cache_dir.exists():
14 |         shutil.rmtree(cache_dir)
15 | 
16 | 
17 | def test_github_source_init(temp_cache_dir):
18 |     """Test GitHubSource initialization."""
19 |     source = GitHubSource("owner/repo", cache_dir=temp_cache_dir)
20 |     assert source.repo_path == "owner/repo"
21 |     assert source.clone_url == "https://github.com/owner/repo.git"
22 |     assert source.repo_dir == temp_cache_dir / "owner_repo"
23 | 
24 | 
25 | def test_github_source_fetch(temp_cache_dir):
26 |     """Test fetching a real public repository."""
27 |     source = GitHubSource("prefecthq/prefect", cache_dir=temp_cache_dir)
28 |     repo_dir = source.fetch()
29 | 
30 |     assert repo_dir.exists()
31 |     assert (repo_dir / ".git").exists()
32 |     assert (repo_dir / "README.md").exists()
33 | 
34 |     # Test update of existing repo
35 |     repo_dir = source.fetch()  # Should use cached version
36 |     assert repo_dir.exists()
37 | 
38 | 
39 | def test_github_source_cleanup(temp_cache_dir):
40 |     """Test repository cleanup."""
41 |     source = GitHubSource("prefecthq/prefect", cache_dir=temp_cache_dir)
42 |     source.fetch()
43 |     assert source.repo_dir.exists()
44 | 
45 |     source.cleanup()
46 |     assert not source.repo_dir.exists()
47 | 


--------------------------------------------------------------------------------
/tests/tests/data/test1.txt:
--------------------------------------------------------------------------------
1 | This is a test file


--------------------------------------------------------------------------------
/tests/tests/data/test2.md:
--------------------------------------------------------------------------------
1 | This is another test file


--------------------------------------------------------------------------------