├── docs ├── source │ ├── CNAME │ ├── contributors.rst │ ├── index.rst │ ├── project.rst │ ├── repository.rst │ ├── usecases.rst │ └── cache.rst ├── requirements.txt ├── AUDIT_REPORT.md ├── Makefile └── make.bat ├── examples ├── __init__.py ├── img │ ├── lifeline.png │ ├── githubblame.png │ ├── cumulative_blame.png │ ├── cumulative_project_blame.png │ └── stravalib_cumulative_blame.png ├── definitions.py ├── file_change_rates.py ├── cloud_repo.py ├── project_blame.py ├── repo_file_detail.py ├── punchcard.py ├── bus_analysis.py ├── cumulative_blame.py ├── attributes.py ├── hours_estimate.py ├── parallel_blame.py ├── lifeline.py ├── commit_history.py ├── release_analytics.py └── cache_timestamps.py ├── img ├── lifeline.png ├── punchcard.png └── cumulative_blame.png ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── ISSUE_TEMPLATE.md └── workflows │ ├── test-docs-build.yml │ ├── docs.yml │ ├── test-suite.yml │ └── pypi-publish.yml ├── tests ├── __init__.py ├── test_Repository │ ├── __init__.py │ ├── test_properties.py │ ├── test_punchcard.py │ ├── test_file_operations.py │ ├── test_file_detail.py │ └── test_bus_factor.py ├── test_Project │ ├── __init__.py │ └── test_properties.py ├── test_utilities │ ├── test_init.py │ └── test_check_api.py ├── test_examples.py ├── conftest.py ├── test_cache_key_consistency.py └── test_repository_cache.py ├── gitpandas ├── __init__.py ├── utilities │ ├── __init__.py │ ├── check_api.py │ └── plotting.py └── logging.py ├── .gitignore ├── mcp_server └── README.md ├── LICENSE.md ├── .cursor └── rules │ ├── testing.mdc │ ├── pytest_standards.mdc │ ├── python_standards.mdc │ ├── sphinx_docs_standards.mdc │ └── project_overview.mdc ├── CONTRIBUTING.md ├── pyproject.toml ├── Makefile └── CHANGELOG.md /docs/source/CNAME: -------------------------------------------------------------------------------- 1 | gitpandas.mcginniscommawill.com -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "willmcginnis" 2 | -------------------------------------------------------------------------------- /img/lifeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/lifeline.png -------------------------------------------------------------------------------- /img/punchcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/punchcard.png -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - -------------------------------------------------------------------------------- /img/cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/cumulative_blame.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for git-pandas. 
3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /examples/img/lifeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/lifeline.png -------------------------------------------------------------------------------- /examples/img/githubblame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/githubblame.png -------------------------------------------------------------------------------- /examples/definitions.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | GIT_PANDAS_DIR = Path(__file__).resolve().parent.parent 4 | -------------------------------------------------------------------------------- /examples/img/cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/cumulative_blame.png -------------------------------------------------------------------------------- /tests/test_Repository/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the Repository class. 3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /tests/test_Project/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the ProjectDirectory class. 3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /examples/img/cumulative_project_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/cumulative_project_blame.png -------------------------------------------------------------------------------- /examples/img/stravalib_cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/stravalib_cumulative_blame.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0.0 2 | sphinx-rtd-theme>=1.0.0 3 | sphinx-rtd-dark-mode>=1.0.0 4 | sphinxcontrib-googleanalytics>=0.3 5 | gitpython>=1.0.0 6 | numpy>=1.9.0 7 | pandas>=2.0.0 8 | requests 9 | redis -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 
12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: 17 | - Subsystem: -------------------------------------------------------------------------------- /tests/test_utilities/test_init.py: -------------------------------------------------------------------------------- 1 | import gitpandas 2 | 3 | 4 | def test_version(): 5 | """Test that the version is defined.""" 6 | assert isinstance(gitpandas.__version__, str) 7 | assert len(gitpandas.__version__.split(".")) >= 2 # Should have at least major.minor version 8 | -------------------------------------------------------------------------------- /examples/file_change_rates.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import Repository 4 | 5 | __author__ = "willmcginnis" 6 | 7 | 8 | if __name__ == "__main__": 9 | repo = Repository(working_dir=GIT_PANDAS_DIR) 10 | fc = repo.file_change_rates(include_globs=["*.py"], coverage=True) 11 | print(fc) 12 | -------------------------------------------------------------------------------- /examples/cloud_repo.py: -------------------------------------------------------------------------------- 1 | from gitpandas import Repository 2 | 3 | __author__ = "willmcginnis" 4 | 5 | 6 | if __name__ == "__main__": 7 | repo = Repository(working_dir="https://github.com/CamDavidsonPilon/lifelines.git", verbose=True) 8 | shared_blame = repo.blame(include_globs=["*.py"], committer=False, by="file") 9 | 10 | print(shared_blame) 11 | -------------------------------------------------------------------------------- /examples/project_blame.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import ProjectDirectory 4 | 5 | __author__ = "willmcginnis" 6 | 7 | if __name__ == "__main__": 8 | g = ProjectDirectory(working_dir=GIT_PANDAS_DIR) 9 | 10 | b = g.blame(include_globs=["*.py"], ignore_globs=["lib/*", "docs/*"], by="file") 11 | print(b.head(5)) 12 | -------------------------------------------------------------------------------- /examples/repo_file_detail.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import ProjectDirectory 4 | 5 | __author__ = "willmcginnis" 6 | 7 | if __name__ == "__main__": 8 | g = ProjectDirectory(working_dir=GIT_PANDAS_DIR) 9 | 10 | b = g.file_detail(include_globs=["*.py"], ignore_globs=["lib/*", "docs/*"]) 11 | print(b.head(25)) 12 | -------------------------------------------------------------------------------- /gitpandas/__init__.py: -------------------------------------------------------------------------------- 1 | from gitpandas.project import GitHubProfile, ProjectDirectory 2 | from gitpandas.repository import Repository 3 | 4 | try: 5 | from importlib.metadata import version 6 | 7 | __version__ = version("git-pandas") 8 | except ImportError: 9 | # Fallback for Python < 3.8 10 | from importlib_metadata import version 11 | 12 | __version__ = version("git-pandas") 13 | 14 | __author__ = "willmcginnis" 15 | 16 | __all__ = ["Repository", "ProjectDirectory", "GitHubProfile"] 17 | -------------------------------------------------------------------------------- /gitpandas/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
module:: utilities 3 | :platform: Unix, Windows 4 | :synopsis: Helper methods for plotting or otherwise manipulating output from gitpandas objects 5 | 6 | .. moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | __author__ = "willmcginnis" 12 | # Version is now managed centrally in gitpandas.__init__ 13 | 14 | try: 15 | import joblib # noqa: F401 16 | 17 | _has_joblib = True 18 | except ImportError: 19 | _has_joblib = False 20 | -------------------------------------------------------------------------------- /examples/punchcard.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import sys 4 | 5 | import matplotlib 6 | 7 | matplotlib.use("Agg") # Set the backend to Agg before importing pyplot 8 | 9 | from definitions import GIT_PANDAS_DIR 10 | 11 | from gitpandas import ProjectDirectory 12 | from gitpandas.utilities.plotting import plot_punchcard 13 | 14 | g = ProjectDirectory(working_dir=[str(GIT_PANDAS_DIR)], verbose=True) 15 | 16 | by = None 17 | punchcard = g.punchcard(include_globs=["*.py"], by=by, normalize=2500) 18 | 19 | if punchcard.empty: 20 | print("No commit data available for punchcard analysis.") 21 | sys.exit(0) 22 | 23 | plot_punchcard(punchcard, metric="lines", title="punchcard", by=by) 24 | -------------------------------------------------------------------------------- /.github/workflows/test-docs-build.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Docs Check" 2 | on: 3 | - pull_request 4 | 5 | jobs: 6 | docs: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v1 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | 17 | - name: Install uv 18 | run: | 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 21 | 22 | - name: Install Dependencies 23 | run: | 24 | uv pip install --system -e ".[dev]" 25 | 26 | - name: Build Docs 27 | uses: ammaraskar/sphinx-action@master 28 | with: 29 | docs-folder: "docs/" -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: "Master Docs Publication" 2 | on: 3 | push: 4 | branches: [ master ] 5 | 6 | jobs: 7 | docs: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Clone 11 | uses: actions/checkout@v1 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install uv 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 22 | 23 | - name: Install Dependencies 24 | run: | 25 | uv pip install --system -e ".[dev]" 26 | 27 | - name: Build Docs 28 | uses: ammaraskar/sphinx-action@master 29 | with: 30 | docs-folder: "./docs/" 31 | 32 | - name: Deploy Docs 33 | uses: peaceiris/actions-gh-pages@v3 34 | with: 35 | github_token: ${{ secrets.GITHUB_TOKEN }} 36 | publish_dir: "./docs/build/html/" -------------------------------------------------------------------------------- /gitpandas/utilities/check_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | A little utility to figure out all of the parameter names in git-pandas, and make sure we aren't mixing up language 3 | in different functions. 
4 | 
5 | """
6 | 
7 | import inspect
8 | 
9 | __author__ = "willmcginnis"
10 | 
11 | 
12 | def extract_objects(m, classes=True, functions=False):
13 |     # add in the classes at this level
14 |     out = {}
15 |     if classes:
16 |         m_dict = {k: v for k, v in m.__dict__.items() if inspect.isclass(v)}
17 |         out.update(m_dict)
18 |     if functions:
19 |         m_dict = {k: v for k, v in m.__dict__.items() if inspect.isfunction(v)}
20 |         out.update(m_dict)
21 | 
22 |     return out
23 | 
24 | 
25 | def parse_docstring(ds):
26 |     ds = [x.strip() for x in ds.split("\n")]
27 |     ds = [x.split(":") for x in ds if x.startswith(":param")]
28 |     ds = [{x[1].replace("param", "").strip(): x[2].strip()} for x in ds]
29 |     return ds
30 | 
31 | 
32 | def get_distinct_params(m):
33 |     out = set()
34 |     for k in m:
35 |         out.update(m[k]["args"])
36 |     return out
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     print("Development utilities for analyzing the git-pandas API")
41 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | 
3 | # tests
4 | # these should be torn down after every test, but if a test is interrupted,
5 | # they won't be until the next test
6 | tests/**/repos
7 | uv.lock
8 | # scratch stuff
9 | scratch
10 | 
11 | ### Python template
12 | # Byte-compiled / optimized / DLL files
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 | .idea
17 | .idea/
18 | .DS_Store
19 | # C extensions
20 | *.so
21 | gitnoc_desktop/.venv/
22 | # Distribution / packaging
23 | .Python
24 | env/
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | 
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 | 
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 | 
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *,cover
59 | 
60 | # Translations
61 | *.mo
62 | *.pot
63 | 
64 | # Django stuff:
65 | *.log
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | 
74 | 
-------------------------------------------------------------------------------- /mcp_server/README.md: --------------------------------------------------------------------------------
1 | # GitPandas MCP Server
2 | 
3 | This directory contains an experimental Model Context Protocol (MCP) server that acts as a wrapper around the `git-pandas` Python library. It allows MCP-compatible clients (like AI assistants or IDE extensions) to interact with git repository data using structured tools.
4 | 
5 | ## Prerequisites
6 | 
7 | * Python 3.8+
8 | * `uv` (Python package installer and virtual environment manager). See [uv installation guide](https://github.com/astral-sh/uv#installation). You can often install it via pip: `pip install uv`.
9 | 
10 | ## Setup and Installation (using uv)
11 | 
12 | 1. **Clone the Repository:** If you haven't already, clone the main `git-pandas` repository.
13 | 2. **Install dependencies:** from the repository root, install `git-pandas` and its development extras into a `uv`-managed environment, for example: `uv pip install -e ".[dev]"`.
14 | 3. **Construct the run command:** the server process is started with `uv run` pointed at this directory, e.g. `uv run --cwd /absolute/path/to/git-pandas/mcp_server python server.py`.
15 | 
16 | ## Connecting with Claude Desktop (or similar clients)
17 | 
18 | 1. Open Claude Desktop's settings or preferences.
19 | 2. Look for a section related to "Tools", "MCP Servers", "External Tools", or similar.
20 | 3. 
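Add a new server configuration. For clients that read a JSON configuration file (Claude Desktop uses `claude_desktop_config.json`), a minimal entry might look like the following sketch (the `git-pandas` server name is arbitrary and the path is a placeholder for your local checkout):

   ```json
   {
     "mcpServers": {
       "git-pandas": {
         "command": "uv",
         "args": ["run", "--cwd", "/absolute/path/to/git-pandas/mcp_server", "python", "server.py"]
       }
     }
   }
   ```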
21 | 4. When prompted for the server command or path, paste the **full `uv run` command** you constructed in the setup section above (e.g., `uv run --cwd /absolute/path/to/git-pandas/mcp_server python server.py`).
22 | 5. Save the configuration.
23 | 
24 | Claude Desktop should now be able to launch this server process using `uv` when needed and communicate with it to use the defined tools.
-------------------------------------------------------------------------------- /LICENSE.md: --------------------------------------------------------------------------------
1 | Copyright (c) 2025, Will McGinnis
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | 
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | 
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 | 
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: "Test Suite and Linting" 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install uv 28 | run: | 29 | curl -LsSf https://astral.sh/uv/install.sh | sh 30 | 31 | - name: Add uv to PATH 32 | run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH 33 | 34 | - name: Check uv version 35 | run: uv --version 36 | 37 | - name: Install dependencies 38 | run: | 39 | uv pip install --system -e ".[dev,datasets]" 40 | 41 | - name: Configure Git 42 | run: | 43 | git config --global user.email "github-actions@github.com" 44 | git config --global user.name "GitHub Actions" 45 | 46 | - name: Run tests 47 | run: | 48 | MPLBACKEND=Agg uv run pytest -v -m "not slow" 49 | 50 | - name: Run linting 51 | run: | 52 | uv run ruff check . 53 | -------------------------------------------------------------------------------- /examples/bus_analysis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing the "bus factor" of a repository. 3 | 4 | The bus factor is a measure of risk based on how concentrated the codebase knowledge is 5 | among contributors. A low bus factor (e.g. 1-2) indicates high risk as knowledge is 6 | concentrated among few contributors. 7 | 8 | This example demonstrates: 9 | 1. Creating a repository instance 10 | 2. Analyzing commit history with limits 11 | 3. Calculating bus factor 12 | 4. 
Viewing contributor statistics 13 | """ 14 | 15 | import time 16 | 17 | from gitpandas import Repository 18 | 19 | __author__ = "willmcginnis" 20 | 21 | 22 | if __name__ == "__main__": 23 | # Use a smaller repository for faster analysis 24 | # This is a small Python package that's good for examples 25 | repo = Repository(working_dir="https://github.com/wdm0006/cookiecutter-pipproject.git") 26 | 27 | print("Analyzing repository...") 28 | start_time = time.time() 29 | 30 | # Get commit history with a reasonable limit 31 | print("\nGetting commit history (limited to last 100 commits)...") 32 | ch = repo.commit_history("master", limit=100, include_globs=["*.py"]) 33 | 34 | # Calculate unique committers 35 | committers = set(ch["committer"].values) 36 | print(f"\nFound {len(committers)} unique committers:") 37 | for committer in sorted(committers): 38 | print(f" - {committer}") 39 | 40 | # Calculate bus factor 41 | print("\nCalculating bus factor...") 42 | bus_factor = repo.bus_factor(include_globs=["*.py"]) 43 | print("\nBus factor analysis:") 44 | print(bus_factor) 45 | 46 | end_time = time.time() 47 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 48 | -------------------------------------------------------------------------------- /.cursor/rules/testing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: how to use the makefile to run tests 3 | globs: 4 | --- 5 | # Gitpandas Testing Standards 6 | 7 | ## Running Tests 8 | 9 | ### Using Make Commands 10 | - Always use `make test` to run the test suite 11 | - Use `make test-cov` to run tests with coverage reports 12 | - Use `make test-all` to run tests on all supported Python versions using tox 13 | 14 | ### Test Commands 15 | ```bash 16 | # Run the standard test suite 17 | make test 18 | 19 | # Run tests with coverage 20 | make test-cov 21 | 22 | # Run tests on all supported Python versions 23 | make test-all 24 | 25 | # Run a specific test file 26 | make test PYTEST_ARGS="tests/test_unified_interface.py" 27 | 28 | # Run a specific test class 29 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface" 30 | 31 | # Run a specific test method 32 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface::test_base_methods_elo" 33 | ``` 34 | 35 | ### Test Environment 36 | - Tests are run using `uv run pytest` through the Makefile 37 | - The test environment is automatically set up with the correct dependencies 38 | - Always run tests in a clean environment to avoid dependency conflicts 39 | 40 | ### Continuous Integration 41 | - All tests must pass in CI before merging 42 | - Coverage should not decrease with new code 43 | - New features should include corresponding tests 44 | 45 | ### Benchmarks 46 | - Use `make benchmark` to run performance benchmarks 47 | - Benchmark results are stored in the `benchmark_results` directory 48 | - Performance regressions should be addressed before merging 49 | 50 | ### Linting and Formatting 51 | - Use `make lint` to check code quality 52 | - Use `make lint-fix` to automatically fix linting issues 53 | - Use `make format` to format code according to project standards -------------------------------------------------------------------------------- /examples/cumulative_blame.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of visualizing cumulative blame information. 3 | 4 | This example demonstrates: 5 | 1. 
Creating a project directory instance 6 | 2. Analyzing blame information across multiple repositories 7 | 3. Visualizing the results using matplotlib 8 | """ 9 | 10 | import os 11 | import time 12 | 13 | from gitpandas import ProjectDirectory 14 | from gitpandas.utilities.plotting import plot_cumulative_blame 15 | 16 | __author__ = "willmcginnis" 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Initializing project directory...") 21 | start_time = time.time() 22 | 23 | # Use pygeohash repository - a good size for examples 24 | g = ProjectDirectory(working_dir=["https://github.com/wdm0006/pygeohash.git"]) 25 | 26 | print("\nAnalyzing blame information...") 27 | print("Using a limit of 20 commits and skipping every 2nd commit for faster analysis") 28 | 29 | try: 30 | blame = g.cumulative_blame( 31 | branch="master", # Use master instead of main 32 | include_globs=["*.py"], # Focus on Python files only 33 | by="committer", 34 | limit=20, # Limit to 20 commits 35 | skip=2, # Skip every other commit 36 | ) 37 | 38 | print("\nGenerating visualization...") 39 | # Create the plot and save it 40 | fig = plot_cumulative_blame(blame) 41 | output_path = os.path.join("img", "cumulative_blame.png") 42 | fig.savefig(output_path) 43 | print(f"Plot saved to {output_path}") 44 | 45 | except Exception as e: 46 | print(f"\nError during analysis: {str(e)}") 47 | print("This might happen if the repository is not accessible or if there are no commits to analyze.") 48 | 49 | end_time = time.time() 50 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 51 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: "PyPI Packaging" 2 | on: 3 | release: 4 | types: created 5 | 6 | jobs: 7 | build_and_publish: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | # IMPORTANT: this permission is mandatory for Trusted Publishing 11 | id-token: write 12 | steps: 13 | - name: Clone 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.11' 20 | 21 | - name: Install uv 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install uv 25 | 26 | - name: Setup virtual environment 27 | run: | 28 | uv venv 29 | echo "VIRTUAL_ENV=$(pwd)/.venv" >> $GITHUB_ENV 30 | echo "$(pwd)/.venv/bin" >> $GITHUB_PATH 31 | 32 | - name: Install dependencies 33 | run: | 34 | uv pip install -e ".[dev]" 35 | uv pip install build 36 | 37 | - name: Build source distribution 38 | run: | 39 | # Clean the build directories first 40 | rm -rf build/ 41 | rm -rf dist/ 42 | rm -rf .eggs/ 43 | 44 | # Find and remove egg-info directories, excluding the virtual environment 45 | find . -path ./.venv -prune -o -name '*.egg-info' -type d -exec rm -rf {} \; 2>/dev/null || true 46 | find . -path ./.venv -prune -o -name '*.egg' -type f -exec rm -f {} \; 2>/dev/null || true 47 | find . -path ./.venv -prune -o -name '__pycache__' -type d -exec rm -rf {} \; 2>/dev/null || true 48 | 49 | # Build only the source distribution 50 | python -m build --sdist 51 | 52 | - name: Publish package to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | with: 55 | packages-dir: dist/ -------------------------------------------------------------------------------- /examples/attributes.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
module:: attributes.py 3 | :platform: Unix, Windows 4 | :synopsis: An example showing basic attribute access to gitpandas objects. 5 | 6 | .. moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | from pandas import set_option 12 | 13 | from gitpandas.project import ProjectDirectory 14 | from gitpandas.repository import Repository 15 | 16 | __author__ = "willmcginnis" 17 | 18 | 19 | def repository(): 20 | # build an example repository object then check the attributes 21 | r = Repository("https://github.com/wdm0006/git-pandas.git") 22 | print("\nRepository Name") 23 | print(r.repo_name) 24 | print("\nRepository Branches:") 25 | print(r.branches()) 26 | print("\nRepository Tags:") 27 | print(r.tags()) 28 | print("\nRepository Revisions:") 29 | print(r.revs()) 30 | print("\nRepository Blame:") 31 | print(r.blame(include_globs=["*.py"])) 32 | print("\nRepository Is Bare:") 33 | print(r.is_bare()) 34 | 35 | 36 | def project(): 37 | # build an example project directory object then check the attributes 38 | p = ProjectDirectory( 39 | [ 40 | "https://github.com/wdm0006/git-pandas.git", 41 | "https://github.com/CamDavidsonPilon/lifelines.git", 42 | ] 43 | ) 44 | print("\nProject Directory Name") 45 | print(p.repo_name()) 46 | print("\nProject Directory Branches:") 47 | print(p.branches()) 48 | print("\nProject Directory Tags:") 49 | print(p.tags()) 50 | print("\nProject Directory Revisions:") 51 | print(p.revs()) 52 | print("\nProject Directory Blame:") 53 | print(p.blame()) 54 | print("\nProject Directory Is Bare:") 55 | print(p.is_bare()) 56 | 57 | 58 | if __name__ == "__main__": 59 | set_option("display.max_rows", 500) 60 | set_option("display.max_columns", 500) 61 | set_option("display.width", 1000) 62 | repository() 63 | project() 64 | -------------------------------------------------------------------------------- /examples/hours_estimate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of estimating development hours from commit history. 3 | 4 | This example demonstrates: 5 | 1. Creating a repository instance 6 | 2. Analyzing commit history 7 | 3. Estimating development hours based on commit patterns 8 | 4. 
Visualizing the results 9 | """ 10 | 11 | import time 12 | 13 | from gitpandas import Repository 14 | 15 | __author__ = "willmcginnis" 16 | 17 | 18 | if __name__ == "__main__": 19 | print("Initializing repository...") 20 | start_time = time.time() 21 | 22 | # Use pygeohash repository - a good size for examples 23 | repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git") 24 | 25 | print("\nAnalyzing commit history...") 26 | print("Using a limit of 20 commits for faster analysis") 27 | 28 | # Get commit history with limits 29 | commits = repo.commit_history( 30 | branch="master", # Use master instead of main 31 | limit=20, # Limit to 20 commits 32 | include_globs=["*.py"], # Focus on Python files only 33 | ) 34 | 35 | print("\nEstimating development hours...") 36 | # Group commits by day and estimate hours 37 | daily_hours = commits.groupby(commits.index.date).agg({"lines": "sum", "insertions": "sum", "deletions": "sum"}) 38 | 39 | # Estimate hours based on commit patterns 40 | # Assuming average of 10 lines per hour of development 41 | daily_hours["estimated_hours"] = daily_hours["lines"] / 10 42 | 43 | print("\nResults:") 44 | print("\nDaily Development Hours:") 45 | print(daily_hours["estimated_hours"].round(2)) 46 | 47 | print("\nSummary Statistics:") 48 | print(f"Total commits analyzed: {len(commits)}") 49 | print(f"Total days with commits: {len(daily_hours)}") 50 | print(f"Total estimated hours: {daily_hours['estimated_hours'].sum():.2f}") 51 | print(f"Average hours per day: {daily_hours['estimated_hours'].mean():.2f}") 52 | 53 | end_time = time.time() 54 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 55 | -------------------------------------------------------------------------------- /.cursor/rules/pytest_standards.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: tests/* 4 | --- 5 | # Pytest Standards 6 | 7 | ## Test Organization 8 | - All test files should be named with the prefix `test_` (e.g., `test_module.py`) 9 | - Test classes should be named with the prefix `Test` (e.g., `TestClassName`) 10 | - Test methods should be named with the prefix `test_` (e.g., `test_function_name`) 11 | - Tests should be organized in the same directory structure as the source code they test 12 | 13 | ## Test Structure 14 | - Each test should focus on testing a single functionality 15 | - Use descriptive test names that explain what is being tested 16 | - Group related tests in test classes 17 | - Use appropriate fixtures for test setup and teardown 18 | - Avoid test interdependence - tests should be able to run in any order 19 | 20 | ## Assertions 21 | - Use pytest's built-in assertions instead of Python's `assert` statement when possible 22 | - Use appropriate assertion methods for the type being tested (e.g., `assert_almost_equal` for floats) 23 | - Include descriptive error messages in assertions to aid debugging 24 | 25 | ## Fixtures 26 | - Use fixtures for common setup and teardown operations 27 | - Define fixtures at the appropriate scope (function, class, module, or session) 28 | - Use parameterized fixtures for testing multiple inputs 29 | - Use conftest.py for sharing fixtures across multiple test files 30 | 31 | ## Coverage 32 | - Aim for at least 80% code coverage 33 | - Write tests for both success and failure cases 34 | - Test edge cases and boundary conditions 35 | - Use `pytest-cov` to generate coverage reports 36 | 37 | ## Best Practices 38 | - Keep tests fast and 
independent 39 | - Avoid mocking unless necessary 40 | - Use markers to categorize tests (e.g., `@pytest.mark.slow`) 41 | - Use parametrize for testing multiple inputs 42 | - Write tests before or alongside code (TDD approach) 43 | - Run the full test suite before committing changes 44 | 45 | ## Commands 46 | - Run tests with `pytest` 47 | - Generate coverage reports with `pytest --cov=gitpandas` 48 | - Run specific tests with `pytest path/to/test_file.py::TestClass::test_method` -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests to verify that all example scripts run without errors. 3 | """ 4 | 5 | import subprocess 6 | import sys 7 | from pathlib import Path 8 | 9 | import pytest 10 | 11 | # Get the examples directory 12 | EXAMPLES_DIR = Path(__file__).parent.parent / "examples" 13 | 14 | # List of example scripts to test 15 | EXAMPLE_SCRIPTS = [ 16 | "attributes.py", 17 | "bus_analysis.py", 18 | "cloud_repo.py", 19 | "commit_history.py", 20 | "cumulative_blame.py", 21 | "definitions.py", 22 | "file_change_rates.py", 23 | "hours_estimate.py", 24 | "lifeline.py", 25 | "parallel_blame.py", 26 | "project_blame.py", 27 | "punchcard.py", 28 | "remote_fetch_and_cache_warming.py", # Added new example 29 | "repo_file_detail.py", 30 | "release_analytics.py", # Added new example 31 | ] 32 | 33 | 34 | @pytest.mark.slow 35 | def test_example_scripts(): 36 | """Test that all example scripts run without errors.""" 37 | for script in EXAMPLE_SCRIPTS: 38 | script_path = EXAMPLES_DIR / script 39 | assert script_path.exists(), f"Example script {script} not found" 40 | 41 | # Run the script with Python 42 | try: 43 | # Use the same Python interpreter that's running the tests 44 | python_executable = sys.executable 45 | result = subprocess.run( 46 | [python_executable, str(script_path)], 47 | cwd=EXAMPLES_DIR, 48 | capture_output=True, 49 | text=True, 50 | timeout=300, # 5 minute timeout per script 51 | ) 52 | 53 | # Check if the script ran successfully 54 | assert result.returncode == 0, ( 55 | f"Script {script} failed with return code {result.returncode}\n" 56 | f"stdout: {result.stdout}\n" 57 | f"stderr: {result.stderr}" 58 | ) 59 | 60 | except subprocess.TimeoutExpired: 61 | pytest.fail(f"Script {script} timed out after 5 minutes") 62 | except Exception as e: 63 | pytest.fail(f"Error running script {script}: {str(e)}") 64 | -------------------------------------------------------------------------------- /tests/test_utilities/test_check_api.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import pytest 4 | 5 | from gitpandas.utilities.check_api import ( 6 | extract_objects, 7 | get_distinct_params, 8 | parse_docstring, 9 | ) 10 | 11 | 12 | class TestCheckAPI: 13 | def test_extract_objects(self): 14 | """Test extracting objects from a module.""" 15 | # Test with classes only 16 | objects = extract_objects(pytest, classes=True, functions=False) 17 | assert isinstance(objects, dict) 18 | assert all(inspect.isclass(v) for v in objects.values()) 19 | 20 | # Test with functions only 21 | objects = extract_objects(pytest, classes=False, functions=True) 22 | assert isinstance(objects, dict) 23 | assert all(inspect.isfunction(v) for v in objects.values()) 24 | 25 | # Test with both 26 | objects = extract_objects(pytest, classes=True, functions=True) 27 | assert isinstance(objects, dict) 28 | assert 
any(inspect.isclass(v) for v in objects.values())
29 |         assert any(inspect.isfunction(v) for v in objects.values())
30 | 
31 |     def test_parse_docstring(self):
32 |         """Test parsing docstrings to extract parameter information."""
33 |         # Test with a simple docstring
34 |         doc = """
35 |         Test function.
36 | 
37 |         :param arg1: First argument
38 |         :param arg2: Second argument
39 |         """
40 |         params = parse_docstring(doc)
41 |         assert len(params) == 2
42 |         assert params[0]["arg1"] == "First argument"
43 |         assert params[1]["arg2"] == "Second argument"
44 | 
45 |         # Test with no parameters
46 |         doc = "Test function with no parameters."
47 |         params = parse_docstring(doc)
48 |         assert len(params) == 0
49 | 
50 |     def test_get_distinct_params(self):
51 |         """Test getting distinct parameters from signatures."""
52 |         # Create test signatures
53 |         sigs = {
54 |             "func1": {"args": ["arg1", "arg2"]},
55 |             "func2": {"args": ["arg2", "arg3"]},
56 |             "func3": {"args": ["arg1", "arg3"]},
57 |         }
58 | 
59 |         # Get distinct parameters
60 |         params = get_distinct_params(sigs)
61 | 
62 |         # Should have all unique parameters
63 |         assert params == {"arg1", "arg2", "arg3"}
64 | 
-------------------------------------------------------------------------------- /.cursor/rules/python_standards.mdc: --------------------------------------------------------------------------------
1 | ---
2 | description: 
3 | globs: *.py
4 | ---
5 | # Python Coding Standards
6 | 
7 | ## Important Notes
8 | - Do not ever ever add test-case specific handling into the library code. Do not add handling for "special cases", "test cases" or any other logic branches that attempt to skip the normal business logic to get a test to pass.
9 | 
10 | ## Code Style
11 | - Follow PEP 8 style guide for Python code
12 | - Use 4 spaces for indentation (no tabs)
13 | - Maximum line length of 120 characters (as configured in pyproject.toml)
14 | - Use snake_case for variables, functions, and methods
15 | - Use CamelCase for classes
16 | - Use UPPER_CASE for constants
17 | - Add a blank line at the end of each file
18 | 
19 | ## Imports
20 | - Group imports in the following order:
21 |   1. Standard library imports
22 |   2. Related third-party imports
23 |   3. 
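Local application/library specific imports

  For example, a module's imports grouped per this rule (the module names are illustrative):

  ```python
  # 1. Standard library imports
  import os
  from pathlib import Path

  # 2. Related third-party imports
  import pandas as pd

  # 3. Local application/library specific imports
  from gitpandas import Repository
  ```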
24 | - Use absolute imports when possible
25 | - Avoid wildcard imports (`from module import *`)
26 | - Use import aliases for long module names
27 | 
28 | ## Documentation
29 | - Document all public modules, classes, methods, and functions
30 | - Use docstrings that follow the Google style guide
31 | - Include type hints for function parameters and return values
32 | - Document parameters, return values, and exceptions raised
33 | 
34 | ## Error Handling
35 | - Use specific exception types instead of generic ones
36 | - Handle exceptions at the appropriate level
37 | - Use context managers (`with` statements) for resource management
38 | - Avoid catching exceptions without proper handling
39 | 
40 | ## Code Organization
41 | - Keep functions and methods short and focused
42 | - Follow the Single Responsibility Principle
43 | - Use classes to encapsulate related functionality
44 | - Separate concerns into different modules
45 | 
46 | ## Testing
47 | - Write unit tests for all code
48 | - Use meaningful test names
49 | - Test both normal and edge cases
50 | - Mock external dependencies in tests
51 | 
52 | ## Performance
53 | - Prefer list/dict/set comprehensions over loops when appropriate
54 | - Use generators for large data sets
55 | - Profile code before optimizing
56 | - Consider using NumPy/Pandas for numerical operations
57 | 
58 | ## Tools
59 | - Use Ruff for code formatting (`make format`)
60 | - Use Ruff for linting and static analysis
61 | - Use mypy for type checking
62 | - Use isort for import sorting
63 | 
64 | ## Version Control
65 | - Write meaningful commit messages
66 | - Keep commits focused on a single change
67 | - Use feature branches for development
68 | - Review code before merging
-------------------------------------------------------------------------------- /examples/parallel_blame.py: --------------------------------------------------------------------------------
1 | """
2 | Example of analyzing blame information in parallel.
3 | 
4 | This example demonstrates:
5 | 1. Creating a repository instance
6 | 2. Analyzing blame information sequentially
7 | 3. Analyzing blame information in parallel (if joblib is available)
8 | 4. 
Comparing performance between sequential and parallel analysis
9 | """
10 | 
11 | import sys
12 | import time
13 | 
14 | from gitpandas import Repository
15 | 
16 | __author__ = "willmcginnis"
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     print("Initializing repository...")
21 | 
22 |     # Use pygeohash repository - a good size for examples
23 |     repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git")
24 | 
25 |     # Define analysis parameters
26 |     branch = "master"  # Use master instead of main
27 |     include_globs = ["*.py"]  # Focus on Python files only
28 |     limit = 20  # Limit to 20 commits for faster analysis
29 | 
30 |     print(f"\nAnalyzing blame information for {branch} branch")
31 |     print(f"Including files: {', '.join(include_globs)}")
32 |     print(f"Analyzing {limit} commits")
33 | 
34 |     # Sequential analysis
35 |     print("\nRunning sequential analysis...")
36 |     start_time = time.time()
37 |     blame = repo.cumulative_blame(branch=branch, include_globs=include_globs, limit=limit)
38 |     seq_time = time.time() - start_time
39 |     print(f"Sequential analysis completed in {seq_time:.2f} seconds")
40 |     print("\nSample of results:")
41 |     print(blame.head())
42 | 
43 |     # Try parallel analysis if joblib is available
44 |     try:
45 |         import joblib  # noqa: F401
46 | 
47 |         print("\nRunning parallel analysis with 4 workers...")
48 |         start_time = time.time()
49 |         blame = repo.parallel_cumulative_blame(branch=branch, include_globs=include_globs, limit=limit, workers=4)
50 |         par_time = time.time() - start_time
51 |         print(f"Parallel analysis completed in {par_time:.2f} seconds")
52 |         print("\nSample of results:")
53 |         print(blame.head())
54 | 
55 |         # Compare performance
56 |         print("\nPerformance comparison:")
57 |         print(f"  Sequential time: {seq_time:.2f} seconds")
58 |         print(f"  Parallel time: {par_time:.2f} seconds")
59 |         print(f"  Speedup: {seq_time / par_time:.2f}x")
60 |     except ImportError:
61 |         print("\nParallel analysis skipped: joblib package not installed.")
62 |         print("To enable parallel analysis, install joblib:")
63 |         print("  pip install joblib")
64 |         sys.exit(0)  # Exit with success since this is an expected case
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | Contributing Code
2 | =================
3 | 
4 | Git-pandas is a Python library for analyzing git repositories with pandas. Our goal is twofold:
5 | 
6 | * To make the analysis of git repositories or collections of repositories simple, fast, and pleasant.
7 | * To give developers interested in data analysis a simple interface to datasets that they already understand and have on hand (git data)
8 | 
9 | With that in mind, we welcome and in fact would love some help.
10 | 
11 | How to Contribute
12 | =================
13 | 
14 | The preferred workflow to contribute to git-pandas is:
15 | 
16 | 1. Fork this repository into your own GitHub account.
17 | 2. Clone the fork on your account onto your local disk:
18 | 
19 |     $ git clone git@github.com:YourLogin/git-pandas.git
20 |     $ cd git-pandas
21 | 
22 | 3. Create a branch for your new awesome feature, do not work in the master branch:
23 | 
24 |     $ git checkout -b new-awesome-feature
25 | 
26 | 4. Write some code, or docs, or tests.
27 | 5. When you are done, submit a pull request.
28 | 
29 | Guidelines
30 | ==========
31 | 
32 | Git-pandas is still a very young project, but we do have a few guiding principles:
33 | 
34 | 1. 
Maintain feature and API parity between Repository and ProjectDirectory
35 | 2. Write detailed docstrings in sphinx format
36 | 3. Slow or potentially memory intensive functions should have a limit option
37 | 
38 | Running Tests
39 | =============
40 | 
41 | Test coverage is admittedly pretty bad right now, so help out by writing tests for new code. To run the tests, use:
42 | 
43 |     $ make test
44 |     $ make test-all
45 | 
46 | Easy Issues / Getting Started
47 | =============================
48 | 
49 | There are a number of issues on the near-term horizon that would be great to have help with.
50 | 
51 | 1. Diff: it would be really nice to be able to call a function with 2 revs and return the diff as a dataframe. So columns for line number, filename, path, change, author, timestamp, etc.
52 | 2. File-level tracking: it would be really cool to be able to get a dataframe of one file's history in great detail. It likely would be a subset of the existing file change history function.
53 | 3. Cross-Branch Analytics: finding differences between different branches of a single repository. Or aggregating the results of other functions across multiple branches.
54 | 4. Verbose Mode: add logging in more functions when verbose is set to True.
55 | 5. Hierarchical bus factor: what's the bus factor of a file, directory, repo, project directory, etc.
56 | 6. Language analytics: what languages do we use most, what are bus factors of those languages? Who should we hire next?
-------------------------------------------------------------------------------- /.cursor/rules/sphinx_docs_standards.mdc: --------------------------------------------------------------------------------
1 | ---
2 | description: 
3 | globs: docs/*
4 | ---
5 | # Sphinx Documentation Standards
6 | 
7 | ## Project Setup
8 | - Use `sphinx-quickstart` to initialize the documentation structure
9 | - Configure `conf.py` with appropriate project information
10 | - Use the ReadTheDocs theme for consistent styling
11 | - Enable necessary extensions (e.g., `autodoc`, `napoleon`, `viewcode`)
12 | 
13 | ## Directory Structure
14 | - Keep documentation source files in the `docs/` directory
15 | - Organize documentation into logical sections (e.g., user guide, API reference)
16 | - Use a clear and consistent file naming convention
17 | - Include an `index.rst` file as the documentation entry point
18 | 
19 | ## Documentation Style
20 | - Write in clear, concise language
21 | - Use present tense and active voice
22 | - Be consistent with terminology
23 | - Include examples where appropriate
24 | - Target the appropriate audience (users, developers, etc.)
25 | 26 | ## reStructuredText Formatting 27 | - Use proper heading hierarchy (=, -, ~, ^, ") 28 | - Use bullet lists for unordered items and numbered lists for sequences 29 | - Use code blocks with appropriate language for syntax highlighting 30 | - Use cross-references to link between documentation sections 31 | - Include images and diagrams where they add value 32 | 33 | ## API Documentation 34 | - Use autodoc to generate API documentation from docstrings 35 | - Document all public modules, classes, methods, and functions 36 | - Follow Google or NumPy docstring style consistently 37 | - Include type information for parameters and return values 38 | - Document exceptions that may be raised 39 | 40 | ## Examples and Tutorials 41 | - Include practical examples for common use cases 42 | - Provide step-by-step tutorials for complex operations 43 | - Ensure all examples are tested and working 44 | - Use `literalinclude` to include code examples from actual source files 45 | 46 | ## Building and Testing 47 | - Build documentation locally before committing changes 48 | - Check for and fix all warnings during the build process 49 | - Verify that cross-references work correctly 50 | - Test documentation on different screen sizes 51 | 52 | ## Deployment 53 | - Configure automatic documentation builds on ReadTheDocs or GitHub Pages 54 | - Include a link to the documentation in the project README 55 | - Version documentation to match software releases 56 | - Provide a changelog or release notes section 57 | 58 | ## Maintenance 59 | - Keep documentation up-to-date with code changes 60 | - Review and update documentation during each release cycle 61 | - Address user feedback and questions in the documentation 62 | - Remove outdated or deprecated information -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "git-pandas" 7 | version = "2.5.0" 8 | description = "A utility for interacting with data from git repositories as Pandas dataframes" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | license = {text = "BSD"} 12 | authors = [ 13 | {name = "Will McGinnis", email = "will@mcginniscommawill.com"}, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Developers", 18 | "Programming Language :: Python :: 3", 19 | ] 20 | keywords = ["git", "pandas", "data", "analysis"] 21 | 22 | dependencies = [ 23 | "gitpython>=1.0.0", 24 | "numpy>=1.9.0", 25 | "pandas>=2.0.0", 26 | "requests", 27 | "redis", 28 | "coverage>=5.0.0", 29 | "importlib-metadata>=1.0; python_version<'3.8'", 30 | ] 31 | 32 | [project.urls] 33 | Documentation = "https://gitpandas.mcginniscommawill.com" 34 | Source = "https://github.com/wdm0006/git-pandas" 35 | Issues = "https://github.com/wdm0006/git-pandas/issues" 36 | 37 | [project.optional-dependencies] 38 | examples = [ 39 | "matplotlib", 40 | "lifelines", 41 | ] 42 | dev = [ 43 | "pytest>=7.0.0", 44 | "pytest-cov>=4.0.0", 45 | "sphinx>=7.0.0", 46 | "sphinx-rtd-theme>=1.0.0", 47 | "ruff>=0.1.0", 48 | "matplotlib", 49 | "joblib", 50 | "pytest-mock" 51 | ] 52 | all = [ 53 | "matplotlib", 54 | "lifelines", 55 | "pytest>=7.0.0", 56 | "pytest-cov>=4.0.0", 57 | "sphinx>=7.0.0", 58 | "sphinx-rtd-theme>=1.0.0", 59 | "ruff>=0.1.0", 60 | ] 61 | 62 | [tool.hatch.build.targets.wheel] 63 | packages = ["gitpandas"] 
64 | 65 | [tool.pytest.ini_options] 66 | testpaths = ["tests"] 67 | python_files = ["test_*.py"] 68 | python_functions = ["test_*"] 69 | python_classes = ["Test*"] 70 | addopts = "--verbose --strict-markers" 71 | markers = [ 72 | "slow: marks tests as slow (run with pytest -m slow)", 73 | "remote: marks tests that require remote access", 74 | "redis: marks tests that require Redis", 75 | ] 76 | 77 | [tool.coverage.run] 78 | source = ["gitpandas"] 79 | 80 | [tool.ruff] 81 | target-version = "py311" 82 | line-length = 120 83 | exclude=["docs/**"] 84 | 85 | [tool.ruff.lint] 86 | select = ["E", "F", "B", "I", "N", "UP", "C4", "SIM", "ERA"] 87 | ignore = ["UP017"] 88 | 89 | [tool.ruff.lint.isort] 90 | known-first-party = ["gitpandas"] 91 | 92 | [tool.ruff.lint.flake8-annotations] 93 | allow-star-arg-any = true 94 | suppress-none-returning = true 95 | 96 | [tool.ruff.format] 97 | quote-style = "double" 98 | indent-style = "space" 99 | skip-magic-trailing-comma = false 100 | line-ending = "auto" -------------------------------------------------------------------------------- /examples/lifeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing file lifelines and ownership changes. 3 | 4 | This example demonstrates: 5 | 1. Creating a repository instance 6 | 2. Analyzing file change history 7 | 3. Identifying ownership changes and refactoring events 8 | 4. Visualizing survival curves for file owners 9 | """ 10 | 11 | import os 12 | import time 13 | 14 | import matplotlib 15 | 16 | matplotlib.use("Agg") # Set the backend to Agg before importing pyplot 17 | 18 | from gitpandas import Repository 19 | from gitpandas.utilities.plotting import plot_lifeline 20 | 21 | __author__ = "willmcginnis" 22 | 23 | 24 | if __name__ == "__main__": 25 | print("Initializing repository...") 26 | start_time = time.time() 27 | 28 | # Use pygeohash repository - a good size for examples 29 | repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git") 30 | 31 | print("\nAnalyzing file change history...") 32 | print("Using a limit of 20 commits for faster analysis") 33 | 34 | # Get file change history with limits 35 | changes = repo.file_change_history( 36 | branch="master", # Use master instead of main 37 | limit=20, # Limit to 20 commits 38 | include_globs=["*.py"], # Focus on Python files only 39 | ) 40 | 41 | print("\nDataFrame structure:") 42 | print("\nColumns:", changes.columns.tolist()) 43 | print("\nSample data:") 44 | print(changes.head()) 45 | 46 | print("\nIdentifying ownership changes...") 47 | # Identify ownership changes 48 | ownership_changes = changes.groupby("filename").filter(lambda x: len(x["committer"].unique()) > 1) 49 | 50 | print("\nIdentifying refactoring events...") 51 | # Identify refactoring events (significant changes to files) 52 | # Consider changes with >50 total lines changed (insertions + deletions) as refactoring 53 | changes["total_changes"] = changes["insertions"] + changes["deletions"] 54 | refactoring = changes[changes["total_changes"] > 50] 55 | 56 | print("\nGenerating visualization...") 57 | # Create the plot and save it 58 | fig = plot_lifeline(changes, ownership_changes, refactoring) 59 | output_path = os.path.join("img", "lifeline.png") 60 | fig.savefig(output_path) 61 | print(f"Plot saved to {output_path}") 62 | 63 | end_time = time.time() 64 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 65 | 66 | # Print summary statistics 67 | print("\nSummary:") 68 | print(f"Total files 
analyzed: {len(changes.filename.unique())}") 69 | print(f"Total ownership changes: {len(ownership_changes)}") 70 | print(f"Total refactoring events: {len(refactoring)}") 71 | print("\nRefactoring events details:") 72 | print(refactoring[["filename", "total_changes", "message"]].to_string()) 73 | -------------------------------------------------------------------------------- /docs/source/contributors.rst: -------------------------------------------------------------------------------- 1 | Contributing to Git-Pandas 2 | ========================== 3 | 4 | We welcome contributions to Git-Pandas! Our goal is to make Git repository analysis simple, fast, and accessible to developers interested in data analysis. 5 | 6 | Getting Started 7 | --------------- 8 | 9 | 1. Fork the repository on GitHub 10 | 2. Clone your fork locally: 11 | 12 | .. code-block:: shell 13 | 14 | git clone git@github.com:YourLogin/git-pandas.git 15 | cd git-pandas 16 | 17 | 3. Create a feature branch: 18 | 19 | .. code-block:: shell 20 | 21 | git checkout -b feature/your-awesome-feature 22 | 23 | 4. Make your changes 24 | 5. Submit a pull request 25 | 26 | Development Guidelines 27 | ---------------------- 28 | 29 | Code Style 30 | ~~~~~~~~~~ 31 | 32 | * Follow PEP 8 style guide 33 | * Use 4 spaces for indentation 34 | * Maximum line length of 88 characters (Black default) 35 | * Use snake_case for variables and functions 36 | * Use CamelCase for classes 37 | * Add docstrings following Google style guide 38 | 39 | Documentation 40 | ~~~~~~~~~~~~~ 41 | 42 | * Write detailed docstrings for all public APIs 43 | * Include type hints for function parameters and return values 44 | * Document exceptions that may be raised 45 | * Add examples where appropriate 46 | * Update the documentation for any new features 47 | 48 | Testing 49 | ~~~~~~~ 50 | 51 | * Write unit tests for new code 52 | * Maintain or improve test coverage 53 | * Run tests before submitting PR: 54 | 55 | .. code-block:: shell 56 | 57 | make test 58 | make coverage 59 | 60 | API Design Principles 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | * Maintain feature parity between Repository and ProjectDirectory 64 | * Include limit options for memory-intensive functions 65 | * Keep the API simple and intuitive 66 | * Consider performance implications 67 | 68 | Current Development Focus 69 | ------------------------- 70 | 71 | High Priority 72 | ~~~~~~~~~~~~~ 73 | 74 | * Improve test coverage with proper unit tests 75 | * Add diff functionality between revisions 76 | * Enhance documentation with more examples and visualizations 77 | * Streamline documentation deployment 78 | 79 | Recently Completed (v2.5.0) 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 81 | 82 | * ✅ File-wise bus factor analysis (completed) 83 | * ✅ Cache management methods (invalidate_cache, get_cache_stats) 84 | * ✅ Performance documentation and optimization guide 85 | * ✅ Remote operations (safe_fetch_remote, warm_cache, bulk operations) 86 | * ✅ Enhanced caching system with timestamps and metadata 87 | 88 | Feature Ideas 89 | ~~~~~~~~~~~~~ 90 | 91 | * Cross-branch analytics 92 | * Enhanced verbose logging 93 | * Language analytics and insights 94 | * Code complexity metrics 95 | * Contributor network analysis 96 | 97 | Development Setup 98 | ----------------- 99 | 100 | 1. Install development dependencies: 101 | 102 | .. code-block:: shell 103 | 104 | make install-dev 105 | 106 | 2. Set up pre-commit hooks: 107 | 108 | .. code-block:: shell 109 | 110 | make pre-commit 111 | 112 | 3. Run tests: 113 | 114 | .. 
code-block:: shell 115 | 116 | make test 117 | 118 | 4. Build documentation: 119 | 120 | .. code-block:: shell 121 | 122 | make docs 123 | 124 | Questions? 125 | ---------- 126 | 127 | * Open an issue for bug reports or feature requests 128 | * Join our discussions on GitHub 129 | * Check existing issues for similar problems 130 | 131 | Thank you for contributing to Git-Pandas! 132 | 133 | -------------------------------------------------------------------------------- /gitpandas/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | # Setup library logging 5 | logger = logging.getLogger("gitpandas") # Use a consistent name 6 | logger.addHandler(logging.NullHandler()) 7 | 8 | 9 | def get_logger(name: str | None = None) -> logging.Logger: 10 | """Get a logger instance for the specified name. 11 | 12 | Args: 13 | name: The name of the logger to get. If None, returns the main gitpandas logger. 14 | If specified, returns a child logger of the main gitpandas logger. 15 | 16 | Returns: 17 | logging.Logger: The requested logger instance. 18 | """ 19 | if name is None: 20 | return logger 21 | return logger.getChild(name) 22 | 23 | 24 | def set_log_level(level: int | str) -> None: 25 | """Set the logging level for the gitpandas library. 26 | 27 | Args: 28 | level: The logging level to set. Can be either a string (e.g., 'INFO') 29 | or an integer (e.g., logging.INFO). 30 | """ 31 | logger.setLevel(level) 32 | 33 | 34 | def add_stream_handler( 35 | level: int | str = logging.INFO, 36 | format_string: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 37 | **handler_kwargs: Any, 38 | ) -> None: 39 | """Add a stream handler to the gitpandas logger. 40 | 41 | Args: 42 | level: The logging level for the handler. Defaults to INFO. 43 | format_string: The format string for log messages. 44 | **handler_kwargs: Additional keyword arguments to pass to StreamHandler. 45 | """ 46 | # Avoid adding duplicate handlers 47 | if any(isinstance(h, logging.StreamHandler) for h in logger.handlers): 48 | logger.warning("StreamHandler already exists for gitpandas logger.") 49 | return 50 | 51 | handler = logging.StreamHandler(**handler_kwargs) 52 | handler.setLevel(level) 53 | formatter = logging.Formatter(format_string) 54 | handler.setFormatter(formatter) 55 | logger.addHandler(handler) 56 | 57 | 58 | def add_file_handler( 59 | filename: str, 60 | level: int | str = logging.INFO, 61 | format_string: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 62 | **handler_kwargs: Any, 63 | ) -> None: 64 | """Add a file handler to the gitpandas logger. 65 | 66 | Args: 67 | filename: The name of the file to log to. 68 | level: The logging level for the handler. Defaults to INFO. 69 | format_string: The format string for log messages. 70 | **handler_kwargs: Additional keyword arguments to pass to FileHandler. 
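    Example:
        A minimal sketch (``app.log`` is a placeholder path)::

            from gitpandas.logging import set_log_level, add_file_handler

            set_log_level("DEBUG")
            add_file_handler("app.log", level="DEBUG")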
71 | """ 72 | # Avoid adding duplicate file handlers for the same file 73 | if any(isinstance(h, logging.FileHandler) and h.baseFilename == filename for h in logger.handlers): 74 | logger.warning(f"FileHandler for {filename} already exists for gitpandas logger.") 75 | return 76 | 77 | handler = logging.FileHandler(filename, **handler_kwargs) 78 | handler.setLevel(level) 79 | formatter = logging.Formatter(format_string) 80 | handler.setFormatter(formatter) 81 | logger.addHandler(handler) 82 | 83 | 84 | def remove_all_handlers() -> None: 85 | """Remove all handlers from the gitpandas logger (except the default NullHandler).""" 86 | for handler in logger.handlers[:]: 87 | if not isinstance(handler, logging.NullHandler): 88 | logger.removeHandler(handler) 89 | 90 | 91 | __all__ = [ 92 | "logger", 93 | "get_logger", 94 | "set_log_level", 95 | "add_stream_handler", 96 | "add_file_handler", 97 | "remove_all_handlers", 98 | ] 99 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup test test-all lint format clean docs build run-example test-single mcp gitnoc 2 | 3 | # Use uv for all Python operations 4 | PYTHON = python 5 | UV = uv 6 | 7 | # Project settings 8 | PACKAGE_NAME = gitpandas 9 | TESTS_DIR = tests 10 | DOCS_DIR = docs 11 | BUILD_DIR = dist 12 | EXAMPLES_DIR = examples 13 | 14 | setup: 15 | $(UV) pip install -e ".[dev]" 16 | 17 | setup-examples: 18 | $(UV) pip install -e ".[examples]" 19 | 20 | setup-all: 21 | $(UV) pip install -e ".[all]" 22 | 23 | test: setup-all 24 | MPLBACKEND=Agg $(UV) run pytest $(TESTS_DIR) --cov=$(PACKAGE_NAME) --cov-report=term-missing -m "not slow" 25 | 26 | test-single: setup-all 27 | @if [ "$(test)" = "" ]; then \ 28 | echo "Error: Please specify a test using test="; \ 29 | echo "Example: make test-single test=tests/test_Repository/test_advanced.py::TestRepositoryAdvanced::test_parallel_cumulative_blame"; \ 30 | exit 1; \ 31 | fi 32 | MPLBACKEND=Agg $(UV) run pytest $(test) -v 33 | 34 | test-all: 35 | MPLBACKEmND=Agg $(UV) run pytest $(TESTS_DIR) --cov=$(PACKAGE_NAME) --cov-report=term-missing 36 | 37 | lint: 38 | $(UV) run ruff check --fix --unsafe-fixes . 39 | 40 | format: 41 | $(UV) run ruff format . 42 | 43 | docs: 44 | $(MAKE) -C $(DOCS_DIR) html 45 | 46 | docs-serve: 47 | cd $(DOCS_DIR)/build/html && $(PYTHON) -m http.server 48 | 49 | clean: 50 | rm -rf $(BUILD_DIR) 51 | rm -rf $(DOCS_DIR)/build 52 | rm -rf .pytest_cache 53 | rm -rf .ruff_cache 54 | rm -rf .coverage 55 | find . -type d -name __pycache__ -exec rm -rf {} + 56 | find . -type f -name "*.pyc" -delete 57 | 58 | build: 59 | $(UV) pip build 60 | 61 | publish: 62 | $(UV) pip publish 63 | 64 | env-export: 65 | $(UV) pip freeze > requirements.txt 66 | 67 | run-example: 68 | @if [ "$(example)" = "" ]; then \ 69 | echo "Error: Please specify an example to run using example="; \ 70 | echo "Available examples:"; \ 71 | ls $(EXAMPLES_DIR)/*.py | sed 's/$(EXAMPLES_DIR)\///' | sed 's/\.py$$//'; \ 72 | exit 1; \ 73 | fi 74 | @if [ ! 
-f "$(EXAMPLES_DIR)/$(example).py" ]; then \ 75 | echo "Error: Example '$(example)' not found in $(EXAMPLES_DIR)"; \ 76 | exit 1; \ 77 | fi 78 | MPLBACKEND=Agg $(UV) run python $(EXAMPLES_DIR)/$(example).py 79 | 80 | mcp: 81 | $(UV) run python mcp_server/server.py 82 | 83 | gitnoc: 84 | $(UV) run python gitnoc/app.py 85 | 86 | help: 87 | @echo "Available commands:" 88 | @echo " setup Install the package in development mode" 89 | @echo " setup-examples Install the package with examples dependencies" 90 | @echo " setup-all Install the package with all dependencies" 91 | @echo " setup-gitnoc Install GitNOC dependencies" 92 | @echo " test Run tests with pytest (excluding slow tests)" 93 | @echo " test-single Run a single test (usage: make test-single test=)" 94 | @echo " test-all Run all tests including slow tests" 95 | @echo " lint Run ruff linter" 96 | @echo " format Format code with ruff" 97 | @echo " docs Build documentation with Sphinx" 98 | @echo " docs-serve Serve documentation locally" 99 | @echo " clean Remove build artifacts" 100 | @echo " build Build distribution packages" 101 | @echo " publish Publish package to PyPI" 102 | @echo " env-export Export dependencies to requirements.txt" 103 | @echo " run-example Run a specific example (usage: make run-example example=)" 104 | @echo " mcp Run the MCP server" 105 | @echo " gitnoc Run the GitNOC Streamlit app" -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared pytest fixtures for git-pandas tests. 3 | """ 4 | 5 | import subprocess 6 | import pytest 7 | 8 | __author__ = "willmcginnis" 9 | 10 | 11 | def get_default_branch(): 12 | """Get the system's default branch name for new repositories.""" 13 | try: 14 | # Try to get the configured default branch 15 | result = subprocess.run( 16 | ["git", "config", "--global", "init.defaultBranch"], 17 | capture_output=True, 18 | text=True, 19 | check=False 20 | ) 21 | if result.returncode == 0 and result.stdout.strip(): 22 | return result.stdout.strip() 23 | except Exception: 24 | pass 25 | 26 | # If no default branch is configured, create a temporary repo to see what git actually creates 27 | import tempfile 28 | import os 29 | 30 | try: 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | test_repo_path = os.path.join(temp_dir, "test_repo") 33 | 34 | # Initialize a repo and make an initial commit to see what branch git creates 35 | init_result = subprocess.run( 36 | ["git", "init", test_repo_path], 37 | capture_output=True, 38 | text=True, 39 | check=False 40 | ) 41 | 42 | if init_result.returncode == 0: 43 | # Configure user for the test repo 44 | subprocess.run( 45 | ["git", "-C", test_repo_path, "config", "user.name", "Test"], 46 | capture_output=True, 47 | check=False 48 | ) 49 | subprocess.run( 50 | ["git", "-C", test_repo_path, "config", "user.email", "test@example.com"], 51 | capture_output=True, 52 | check=False 53 | ) 54 | 55 | # Create a file and commit to establish a branch 56 | test_file = os.path.join(test_repo_path, "test.txt") 57 | with open(test_file, "w") as f: 58 | f.write("test") 59 | 60 | subprocess.run( 61 | ["git", "-C", test_repo_path, "add", "test.txt"], 62 | capture_output=True, 63 | check=False 64 | ) 65 | subprocess.run( 66 | ["git", "-C", test_repo_path, "commit", "-m", "initial"], 67 | capture_output=True, 68 | check=False 69 | ) 70 | 71 | # Check what branch was created 72 | branch_result = subprocess.run( 73 | ["git", 
"-C", test_repo_path, "branch", "--show-current"], 74 | capture_output=True, 75 | text=True, 76 | check=False 77 | ) 78 | 79 | if branch_result.returncode == 0 and branch_result.stdout.strip(): 80 | return branch_result.stdout.strip() 81 | except Exception: 82 | pass 83 | 84 | # Final fallback to 'master' for maximum compatibility 85 | return "master" 86 | 87 | 88 | @pytest.fixture(scope="session") 89 | def default_branch(): 90 | """Pytest fixture to get the default branch name.""" 91 | return get_default_branch() 92 | 93 | 94 | def pytest_configure(config): 95 | """Register custom markers.""" 96 | config.addinivalue_line("markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')") 97 | -------------------------------------------------------------------------------- /examples/commit_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing commit history in a repository. 3 | 4 | This example demonstrates: 5 | 1. Creating repository and project directory instances 6 | 2. Analyzing commit history with reasonable limits 7 | 3. Viewing committer statistics 8 | 4. Analyzing file changes by extension 9 | """ 10 | 11 | import os 12 | import time 13 | 14 | import numpy as np 15 | from definitions import GIT_PANDAS_DIR 16 | from pandas import set_option 17 | 18 | from gitpandas import ProjectDirectory, Repository 19 | 20 | __author__ = "willmcginnis" 21 | 22 | 23 | def project(path): 24 | """Analyze commit history for a project directory.""" 25 | print("\nAnalyzing project directory...") 26 | start_time = time.time() 27 | 28 | p = ProjectDirectory(working_dir=path) 29 | 30 | # Get commit history with reasonable limits 31 | print("\nGetting commit history (last 7 days, limited to 100 commits)...") 32 | ch = p.commit_history( 33 | "master", 34 | limit=100, 35 | include_globs=["*.py"], 36 | ignore_globs=["lib/*", "docs/*", "test/*", "tests/*", "tests_t/*"], 37 | days=7, 38 | ) 39 | print("\nRecent commits:") 40 | print(ch.head()) 41 | 42 | # Get committer statistics 43 | committers = set(ch["committer"].values) 44 | print(f"\nFound {len(committers)} unique committers:") 45 | for committer in sorted(committers): 46 | print(f" - {committer}") 47 | 48 | # Calculate contributions 49 | print("\nContributions by committer:") 50 | attr = ch.reindex(columns=["committer", "lines", "insertions", "deletions", "net"]).groupby(["committer"]) 51 | attr = attr.agg({"lines": np.sum, "insertions": np.sum, "deletions": np.sum, "net": np.sum}) 52 | print(attr) 53 | 54 | print(f"\nProject analysis completed in {time.time() - start_time:.2f} seconds") 55 | 56 | 57 | def repository(path): 58 | """Analyze commit history for a single repository.""" 59 | print("\nAnalyzing repository...") 60 | start_time = time.time() 61 | 62 | # Build repository object 63 | ignore_dirs = ["docs/*", "tests/*", "Data/*"] 64 | r = Repository(path, default_branch="main") 65 | 66 | # Check if bare 67 | print("\nRepository type:") 68 | print(f" Bare repository: {r.is_bare()}") 69 | 70 | # Get commit history with limits 71 | print("\nGetting commit history (limited to 50 commits)...") 72 | ch = r.commit_history("HEAD", limit=50, include_globs=["*.py"], ignore_globs=ignore_dirs) 73 | print("\nRecent commits:") 74 | print(ch.head(5)) 75 | 76 | # Get committer statistics 77 | committers = set(ch["committer"].values) 78 | print(f"\nFound {len(committers)} unique committers:") 79 | for committer in sorted(committers): 80 | print(f" - {committer}") 81 | 82 | # Calculate 
contributions 83 | print("\nContributions by committer:") 84 | attr = ch.reindex(columns=["committer", "lines", "insertions", "deletions"]).groupby(["committer"]) 85 | attr = attr.agg({"lines": np.sum, "insertions": np.sum, "deletions": np.sum}) 86 | print(attr) 87 | 88 | # Get file change history with limits 89 | print("\nAnalyzing file changes (limited to 50 commits)...") 90 | fh = r.file_change_history("HEAD", limit=50, ignore_globs=ignore_dirs) 91 | fh["ext"] = fh["filename"].map(lambda x: x.split(".")[-1]) 92 | print("\nRecent file changes:") 93 | print(fh.head(10)) 94 | 95 | # Analyze by extension 96 | print("\nChanges by file extension:") 97 | etns = fh.reindex(columns=["ext", "insertions", "deletions"]).groupby(["ext"]) 98 | etns = etns.agg({"insertions": np.sum, "deletions": np.sum}) 99 | print(etns) 100 | 101 | print(f"\nRepository analysis completed in {time.time() - start_time:.2f} seconds") 102 | 103 | 104 | if __name__ == "__main__": 105 | # Configure pandas display options 106 | set_option("display.max_rows", 500) 107 | set_option("display.max_columns", 500) 108 | set_option("display.width", 1000) 109 | 110 | path = os.path.abspath(GIT_PANDAS_DIR) 111 | project(path) 112 | repository(path) 113 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. git-pandas documentation master file, created by 2 | sphinx-quickstart on Sun Nov 8 21:21:04 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Git-Pandas Documentation 7 | =================================== 8 | 9 | Git-Pandas is a Python library that transforms Git repository data into pandas DataFrames, making it easy to analyze your codebase's history and development patterns. Built on top of GitPython, it provides a simple interface for extracting insights from your Git repositories. 10 | 11 | .. image:: https://raw.githubusercontent.com/wdm0006/git-pandas/master/examples/img/githubblame.png 12 | :alt: Cumulative Blame Visualization 13 | :align: center 14 | 15 | Quick Start 16 | ----------- 17 | 18 | Install Git-Pandas using pip: 19 | 20 | .. code-block:: bash 21 | 22 | pip install git-pandas 23 | 24 | Basic Usage 25 | ~~~~~~~~~~~ 26 | 27 | Analyze a single repository: 28 | 29 | .. code-block:: python 30 | 31 | from gitpandas import Repository 32 | 33 | # Create a repository instance 34 | repo = Repository('/path/to/repo') 35 | 36 | # Get commit history with filtering 37 | commits_df = repo.commit_history( 38 | branch='main', 39 | ignore_globs=['*.pyc'], 40 | include_globs=['*.py'] 41 | ) 42 | 43 | # Analyze blame information 44 | blame_df = repo.blame(by='repository') 45 | 46 | # Calculate bus factor 47 | bus_factor_df = repo.bus_factor() 48 | 49 | Analyze multiple repositories: 50 | 51 | .. 
code-block:: python 52 | 53 | from gitpandas import ProjectDirectory 54 | project = ProjectDirectory('/path/to/project') 55 | 56 | Key Features 57 | ------------ 58 | 59 | * **Repository Analysis**: Extract commit history, file changes, and blame information 60 | * **Project Insights**: Calculate bus factor and analyze repository metrics 61 | * **Multi-Repository Support**: Analyze multiple repositories together 62 | * **Remote Operations**: Safely fetch changes from remote repositories 63 | * **Cache Warming**: Pre-populate caches for improved performance 64 | * **Bulk Operations**: Efficiently process multiple repositories in parallel 65 | * **Performance Optimization**: Advanced caching support and glob-based filtering 66 | 67 | Core Components 68 | --------------- 69 | 70 | The library is built around two main components: 71 | 72 | Repository 73 | ~~~~~~~~~~ 74 | A wrapper around a single Git repository that provides: 75 | 76 | * Commit history analysis with filtering options 77 | * File change tracking and blame information 78 | * Branch existence checking and repository status 79 | * Bus factor calculation and repository metrics 80 | * Punchcard statistics generation 81 | 82 | ProjectDirectory 83 | ~~~~~~~~~~~~~~~~ 84 | A collection of Git repositories that enables: 85 | 86 | * Analysis across multiple repositories 87 | * Aggregated metrics and statistics 88 | * Project-level insights 89 | 90 | Common Parameters 91 | ----------------- 92 | 93 | Most analysis methods support these filtering parameters: 94 | 95 | * **branch**: Branch to analyze (defaults to repository's default branch) 96 | * **limit**: Maximum number of commits to analyze 97 | * **days**: Limit analysis to last N days 98 | * **ignore_globs**: List of glob patterns for files to ignore 99 | * **include_globs**: List of glob patterns for files to include 100 | * **by**: How to group results (usually 'repository' or 'file') 101 | 102 | Documentation 103 | ------------- 104 | 105 | For detailed information about the components and their usage, see: 106 | 107 | .. toctree:: 108 | :maxdepth: 1 109 | :caption: Contents: 110 | 111 | repository 112 | project 113 | cache 114 | remote_operations 115 | performance 116 | usecases 117 | contributors 118 | 119 | Additional Resources 120 | -------------------- 121 | 122 | * :ref:`genindex` - Complete API reference 123 | * :ref:`modindex` - Module index 124 | * :ref:`search` - Search the documentation 125 | 126 | License 127 | ------- 128 | 129 | This project is BSD licensed (see LICENSE.md) 130 | 131 | -------------------------------------------------------------------------------- /docs/source/project.rst: -------------------------------------------------------------------------------- 1 | Project Directory 2 | ================= 3 | 4 | The ProjectDirectory class enables analysis across multiple Git repositories. It can aggregate metrics and insights from multiple repositories into a single output.
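For example, a minimal sketch of aggregated analysis (the repository paths below are placeholders):

.. code-block:: python

    from gitpandas import ProjectDirectory

    # Two repositories analyzed as a single project
    project = ProjectDirectory(working_dir=['/path/to/repo1/', '/path/to/repo2/'])

    # Commit history from every repository, in one DataFrame
    commits = project.commit_history(branch='main', limit=100)

    # Bus factor aggregated over the whole project
    print(project.bus_factor(by='projectd'))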
5 | 6 | Overview 7 | -------- 8 | 9 | The ProjectDirectory class provides: 10 | 11 | * Analysis across multiple repositories 12 | * Aggregated metrics and statistics 13 | * Project-level insights 14 | * Multi-repository bus factor analysis 15 | * Consolidated commit history and blame information 16 | 17 | Creating a ProjectDirectory 18 | --------------------------- 19 | 20 | You can create a ProjectDirectory object in three ways: 21 | 22 | Directory of Repositories 23 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 24 | 25 | Create a ProjectDirectory from a directory containing multiple repositories: 26 | 27 | .. code-block:: python 28 | 29 | from gitpandas import ProjectDirectory 30 | project = ProjectDirectory( 31 | working_dir='/path/to/dir/', 32 | ignore_repos=['repo_to_ignore'], 33 | verbose=True, 34 | default_branch='main' # Optional, will auto-detect if not specified 35 | ) 36 | 37 | The `ignore_repos` parameter can be a list of repository names to exclude. This method uses `os.walk` to search for `.git` directories recursively. 38 | 39 | Explicit Local Repositories 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | Create a ProjectDirectory from a list of local repositories: 43 | 44 | .. code-block:: python 45 | 46 | from gitpandas import ProjectDirectory 47 | project = ProjectDirectory( 48 | working_dir=['/path/to/repo1/', '/path/to/repo2/'], 49 | ignore_repos=['repo_to_ignore'], 50 | verbose=True, 51 | default_branch='main' # Optional, will auto-detect if not specified 52 | ) 53 | 54 | Explicit Remote Repositories 55 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | Create a ProjectDirectory from a list of remote repositories: 58 | 59 | .. code-block:: python 60 | 61 | from gitpandas import ProjectDirectory 62 | project = ProjectDirectory( 63 | working_dir=['git://github.com/user/repo1.git', 'git://github.com/user/repo2.git'], 64 | ignore_repos=['repo_to_ignore'], 65 | verbose=True, 66 | default_branch='main' # Optional, will auto-detect if not specified 67 | ) 68 | 69 | Available Methods 70 | ----------------- 71 | 72 | Core Analysis 73 | ~~~~~~~~~~~~~ 74 | 75 | ..
code-block:: python 76 | 77 | # Commit history across repositories 78 | project.commit_history( 79 | branch=None, # Branch to analyze 80 | limit=None, # Maximum number of commits 81 | days=None, # Limit to last N days 82 | ignore_globs=None, # Files to ignore 83 | include_globs=None # Files to include 84 | ) 85 | 86 | # File change history across repositories 87 | project.file_change_history( 88 | branch=None, 89 | limit=None, 90 | days=None, 91 | ignore_globs=None, 92 | include_globs=None 93 | ) 94 | 95 | # Blame analysis across repositories 96 | project.blame( 97 | rev="HEAD", # Revision to analyze 98 | committer=True, # Group by committer (False for author) 99 | by="repository", # Group by 'repository' or 'file' 100 | ignore_globs=None, 101 | include_globs=None 102 | ) 103 | 104 | # Bus factor analysis across repositories 105 | project.bus_factor( 106 | by="repository", # How to group results ('projectd', 'repository', or 'file') 107 | ignore_globs=None, 108 | include_globs=None 109 | ) 110 | 111 | Common Parameters 112 | ----------------- 113 | 114 | Most analysis methods support these filtering parameters: 115 | 116 | * **branch**: Branch to analyze (defaults to repository's default branch) 117 | * **limit**: Maximum number of commits to analyze 118 | * **days**: Limit analysis to last N days 119 | * **ignore_globs**: List of glob patterns for files to ignore 120 | * **include_globs**: List of glob patterns for files to include 121 | * **by**: How to group results (usually 'repository' or 'file') 122 | 123 | API Reference 124 | ------------- 125 | 126 | .. currentmodule:: gitpandas.project 127 | 128 | .. autoclass:: ProjectDirectory 129 | :members: 130 | :undoc-members: 131 | :show-inheritance: 132 | :inherited-members: 133 | :special-members: __init__, __str__, __repr__ 134 | 135 | .. rubric:: Methods -------------------------------------------------------------------------------- /docs/source/repository.rst: -------------------------------------------------------------------------------- 1 | Repository 2 | ========== 3 | 4 | The Repository class provides an interface for analyzing a single Git repository. It can be created from either a local or remote repository. 5 | 6 | Overview 7 | -------- 8 | 9 | The Repository class offers methods for: 10 | 11 | * Commit history analysis with filtering options 12 | * File change tracking and blame information 13 | * Branch existence checking and repository status 14 | * Bus factor calculation and repository metrics 15 | * Punchcard statistics generation 16 | 17 | Creating a Repository 18 | --------------------- 19 | 20 | You can create a Repository object in two ways: 21 | 22 | Local Repository 23 | ~~~~~~~~~~~~~~~~ 24 | 25 | Create a Repository from a local Git repository: 26 | 27 | .. code-block:: python 28 | 29 | from gitpandas import Repository 30 | repo = Repository( 31 | working_dir='/path/to/repo/', 32 | verbose=True, 33 | default_branch='main' # Optional, will auto-detect if not specified 34 | ) 35 | 36 | The directory must contain a `.git` directory. Subdirectories are not searched. 37 | 38 | Remote Repository 39 | ~~~~~~~~~~~~~~~~~ 40 | 41 | Create a Repository from a remote Git repository: 42 | 43 | .. code-block:: python 44 | 45 | from gitpandas import Repository 46 | repo = Repository( 47 | working_dir='git://github.com/user/repo.git', 48 | verbose=True, 49 | default_branch='main' # Optional, will auto-detect if not specified 50 | ) 51 | 52 | The repository will be cloned locally into a temporary directory.
This can be slow for large repositories. 53 | 54 | Available Methods 55 | ----------------- 56 | 57 | Core Analysis 58 | ~~~~~~~~~~~~~ 59 | 60 | .. code-block:: python 61 | 62 | # Commit history analysis 63 | repo.commit_history( 64 | branch=None, # Branch to analyze 65 | limit=None, # Maximum number of commits 66 | days=None, # Limit to last N days 67 | ignore_globs=None, # Files to ignore 68 | include_globs=None # Files to include 69 | ) 70 | 71 | # File change history 72 | repo.file_change_history( 73 | branch=None, 74 | limit=None, 75 | days=None, 76 | ignore_globs=None, 77 | include_globs=None 78 | ) 79 | 80 | # Blame analysis 81 | repo.blame( 82 | rev="HEAD", # Revision to analyze 83 | committer=True, # Group by committer (False for author) 84 | by="repository", # Group by 'repository' or 'file' 85 | ignore_globs=None, 86 | include_globs=None 87 | ) 88 | 89 | # Bus factor analysis 90 | repo.bus_factor( 91 | by="repository", # How to group results ('repository' or 'file') 92 | ignore_globs=None, 93 | include_globs=None 94 | ) 95 | 96 | # Commit pattern analysis 97 | repo.punchcard( 98 | branch=None, 99 | limit=None, 100 | days=None, 101 | by=None, # Additional grouping 102 | normalize=None, # Normalize values 103 | ignore_globs=None, 104 | include_globs=None 105 | ) 106 | 107 | Repository Information 108 | ~~~~~~~~~~~~~~~~~~~~~~ 109 | 110 | .. code-block:: python 111 | 112 | # List files in repository 113 | repo.list_files(rev="HEAD") 114 | 115 | # Check branch existence 116 | repo.has_branch(branch) 117 | 118 | # Check if repository is bare 119 | repo.is_bare() 120 | 121 | # Check for coverage information 122 | repo.has_coverage() 123 | repo.coverage() 124 | 125 | # Get specific commit content 126 | repo.get_commit_content( 127 | rev, # Revision to analyze 128 | ignore_globs=None, 129 | include_globs=None 130 | ) 131 | 132 | Common Parameters 133 | ----------------- 134 | 135 | Most analysis methods support these filtering parameters: 136 | 137 | * **branch**: Branch to analyze (defaults to repository's default branch) 138 | * **limit**: Maximum number of commits to analyze 139 | * **days**: Limit analysis to last N days 140 | * **ignore_globs**: List of glob patterns for files to ignore 141 | * **include_globs**: List of glob patterns for files to include 142 | * **by**: How to group results (usually 'repository' or 'file') 143 | 144 | API Reference 145 | ------------- 146 | 147 | .. currentmodule:: gitpandas.repository 148 | 149 | .. autoclass:: Repository 150 | :members: 151 | :undoc-members: 152 | :show-inheritance: 153 | :inherited-members: 154 | :special-members: __init__, __str__, __repr__ 155 | 156 | ..
autoclass:: GitFlowRepository 157 | :members: 158 | :undoc-members: 159 | :show-inheritance: 160 | :inherited-members: -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v2.5.0 2 | ====== 3 | 4 | ## New Features 5 | 6 | ### Remote Operations & Cache Warming 7 | * **NEW**: `Repository.safe_fetch_remote()` - Safely fetch changes from remote repositories without modifying working directory 8 | - Read-only operation with comprehensive error handling 9 | - Support for dry-run preview and remote validation 10 | - Configurable remote names and pruning options 11 | * **NEW**: `Repository.warm_cache()` - Pre-populate repository cache for improved performance 12 | - Configurable method selection with intelligent parameter handling 13 | - Performance metrics and cache entry tracking 14 | - Significant performance improvements (1.5-10x speedup demonstrated) 15 | * **NEW**: `ProjectDirectory.bulk_fetch_and_warm()` - Efficiently process multiple repositories 16 | - Parallel processing support when joblib is available 17 | - Error isolation (failures in one repo don't affect others) 18 | - Comprehensive summary statistics and progress tracking 19 | 20 | ### Enhanced Caching System 21 | * **NEW**: `CacheEntry` class with metadata tracking (timestamps, age calculation) 22 | * **ENHANCED**: Thread-safe cache operations with proper locking mechanisms 23 | * **ENHANCED**: Cache key consistency improvements using `||` delimiter format 24 | * **ENHANCED**: Cache timestamp and metadata access methods (`get_cache_info()`, `list_cached_keys()`) 25 | 26 | ### Documentation & Examples 27 | * **NEW**: Comprehensive remote operations documentation (`docs/source/remote_operations.rst`) 28 | * **NEW**: Cache warming and remote fetch example (`examples/remote_fetch_and_cache_warming.py`) 29 | * **NEW**: Cache timestamp usage example (`examples/cache_timestamps.py`) 30 | * **NEW**: Release analytics example (`examples/release_analytics.py`) 31 | 32 | ## Testing & Quality 33 | * **NEW**: 38 comprehensive tests for remote operations and cache warming 34 | * **NEW**: Thread safety tests for cache operations 35 | * **NEW**: Edge case and error handling test coverage 36 | * **IMPROVED**: Overall test coverage and reliability 37 | * **FIXED**: Various minor bugs and future warnings 38 | 39 | ## Backward Compatibility 40 | * All new features are fully backward compatible 41 | * No breaking changes to existing APIs 42 | * Existing cache backends work seamlessly with new features 43 | 44 | v2.4.0 45 | ====== 46 | 47 | * Significant caching bugfixes and updates 48 | * Added a DiskCache that persists across runs 49 | * Added release analytics 50 | 51 | v2.3.0 52 | ====== 53 | 54 | * Updated coverage file parsing to use coverage.py API instead of direct file parsing 55 | * Added coverage>=5.0.0 as a core dependency 56 | * Added a basic MCP server 57 | * Added methods to `Repository` for getting files in repo, getting content of a file, and getting diffs of a revision 58 | 59 | 60 | v2.2.1 61 | ====== 62 | 63 | * Docs CI bugfix 64 | 65 | v2.2.0 66 | ====== 67 | 68 | * Support for default branch setting instead of assuming master; inferred if not passed 69 | * Better handling of ignore repos in project directory setup 70 | * Added a branch exists helper in repository 71 | * Docs corrections 72 | 73 | v2.1.0 74 | ====== 75 | 76 | * Improved test suite 77 | * Many bugfixes 78 | * Updates for pandas v2 79 |
80 | v2.0.0 81 | ====== 82 | 83 | * Fully transitioned to ignore_globs and include_globs style syntax 84 | * Parallelized cumulative blame support with joblib threading backend 85 | * Added threading parallelism to many project directory functions. 86 | * Added a caching module for optional redis or memory backed caching of certain result sets 87 | 88 | v1.2.0 89 | ====== 90 | 91 | * Added ignore_globs option alongside all methods with ignore_dir and extensions; this will be the only method for filtering files in v2.0.0 92 | 93 | v1.1.0 94 | ====== 95 | 96 | * _repo_name changed to repo_name in project directories (old method left with deprecation warning) 97 | * repo_name property added to repositories 98 | 99 | v1.0.3 100 | ====== 101 | 102 | * Support for estimating time spent developing on projects. 103 | 104 | v1.0.2 105 | ====== 106 | 107 | * bugfix in ignore_dir option for root level directories 108 | 109 | v1.0.1 110 | ====== 111 | 112 | * file details function 113 | 114 | v1.0.0 115 | ====== 116 | 117 | * Stable API 118 | * Punchcard dataframe added 119 | * Plotting helpers added to library under utilities module 120 | * Added github.com profile object 121 | 122 | v0.0.6 123 | ====== 124 | 125 | * Added file owner utility 126 | * Added lifelines example 127 | * Added rev to file change history table 128 | * Added file-wise blame using by='file' parameter 129 | * Bus Factor returns a dataframe 130 | * Now supporting python 2.7+ and 3.3+ 131 | 132 | v0.0.5 133 | ====== 134 | 135 | * Added file change rates table with risk metrics 136 | * Added basic functionality with coverage files 137 | * Added limited time window based dataset functionality 138 | * Expanded docs 139 | 140 | v0.0.4 141 | ====== 142 | 143 | * Added cumulative blame and revision history 144 | 145 | v0.0.3 146 | ====== 147 | 148 | * Added approximate bus factor analysis 149 | 150 | v0.0.2 151 | ====== 152 | 153 | * Added blame 154 | 155 | v0.0.1 156 | ====== 157 | 158 | * Initial release, basic interface to commit history and descriptors -------------------------------------------------------------------------------- /tests/test_Repository/test_properties.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pytest 3 | from pandas import DataFrame 4 | 5 | from gitpandas import Repository 6 | 7 | __author__ = "willmcginnis" 8 | 9 | 10 | @pytest.fixture 11 | def remote_repo(): 12 | """Fixture for a remote repository.""" 13 | repo = Repository(working_dir="https://github.com/wdm0006/git-pandas.git", verbose=True) 14 | yield repo 15 | repo.__del__() 16 | 17 | 18 | @pytest.fixture 19 | def local_repo(tmp_path, default_branch): 20 | """Create a local git repository for testing.""" 21 | repo_path = tmp_path / "repository1" 22 | repo_path.mkdir() 23 | repo = git.Repo.init(repo_path) 24 | 25 | # Configure git user 26 | repo.config_writer().set_value("user", "name", "Test User").release() 27 | repo.config_writer().set_value("user", "email", "test@example.com").release() 28 | 29 | # Create and checkout default branch 30 | repo.git.checkout("-b", default_branch) 31 | 32 | # Create initial commit 33 | (repo_path / "README.md").write_text("# Test Repository") 34 | repo.index.add(["README.md"]) 35 | repo.index.commit("Initial commit") 36 | 37 | # Create test files 38 | py_content = """import os 39 | import sys 40 | import json 41 | def main(): 42 | print('Hello, World!') 43 | return True 44 | def helper(): 45 | return True 46 | if __name__ == '__main__': 47 | main()""" 48 |
(repo_path / "test.py").write_text(py_content) 49 | (repo_path / "test.js").write_text("console.log('Hello, World!');") 50 | repo.index.add(["test.py", "test.js"]) 51 | repo.index.commit("Add test files") 52 | 53 | return repo_path 54 | 55 | 56 | # Remote repository tests 57 | class TestRemoteProperties: 58 | @pytest.mark.remote 59 | def test_repo_name(self, remote_repo): 60 | assert remote_repo.repo_name == "git-pandas" 61 | 62 | @pytest.mark.remote 63 | def test_branches(self, remote_repo, default_branch): 64 | branches = list(remote_repo.branches()["branch"].values) 65 | # Check for the default branch (could be master or main) 66 | assert default_branch in branches or "master" in branches or "main" in branches 67 | assert "gh-pages" in branches 68 | 69 | @pytest.mark.remote 70 | def test_tags(self, remote_repo): 71 | tags = list(remote_repo.tags()["tag"].values) 72 | assert "0.0.1" in tags 73 | assert "0.0.2" in tags 74 | 75 | @pytest.mark.remote 76 | def test_is_bare(self, remote_repo): 77 | assert not remote_repo.is_bare() 78 | 79 | 80 | # Local repository tests 81 | class TestLocalProperties: 82 | def test_repo_name(self, local_repo, default_branch): 83 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 84 | assert repo.repo_name == "repository1" 85 | 86 | def test_branches(self, local_repo, default_branch): 87 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 88 | branches = list(repo.branches()["branch"].values) 89 | assert default_branch in branches 90 | 91 | def test_tags(self, local_repo, default_branch): 92 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 93 | tags = repo.tags() 94 | assert len(tags) == 0 95 | 96 | def test_is_bare(self, local_repo, default_branch): 97 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 98 | assert not repo.is_bare() 99 | 100 | def test_commit_history(self, local_repo, default_branch): 101 | """Test commit history retrieval.""" 102 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 103 | history = repo.commit_history(branch=default_branch) 104 | assert isinstance(history, DataFrame) 105 | assert "repository" in history.columns 106 | assert len(history) > 0 107 | 108 | def test_file_change_history(self, local_repo, default_branch): 109 | """Test file change history retrieval.""" 110 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 111 | history = repo.file_change_history(branch=default_branch) 112 | assert isinstance(history, DataFrame) 113 | assert "repository" in history.columns 114 | assert len(history) > 0 115 | 116 | def test_file_change_rates(self, local_repo, default_branch): 117 | """Test file change rates calculation.""" 118 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 119 | rates = repo.file_change_rates(branch=default_branch) 120 | assert isinstance(rates, DataFrame) 121 | assert "repository" in rates.columns 122 | assert len(rates) > 0 123 | 124 | def test_has_coverage(self, local_repo, default_branch): 125 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 126 | # We know this repo doesn't have coverage 127 | assert not repo.has_coverage() 128 | 129 | def test_bus_factor(self, local_repo, default_branch): 130 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 131 | # We know this repo only has one committer 132 | assert repo.bus_factor(by="repository")["bus 
factor"].values[0] == 1 133 | 134 | def test_blame(self, local_repo, default_branch): 135 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 136 | blame = repo.blame(ignore_globs=["*.[!p][!y]"]) 137 | assert blame["loc"].sum() == 10 138 | assert blame.shape[0] == 1 139 | 140 | def test_cumulative_blame(self, local_repo, default_branch): 141 | """Test cumulative blame calculation.""" 142 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 143 | blame = repo.cumulative_blame(branch=default_branch) 144 | assert isinstance(blame, DataFrame) 145 | assert len(blame) > 0 146 | 147 | def test_revs(self, local_repo, default_branch): 148 | """Test revision history retrieval.""" 149 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 150 | revs = repo.revs(branch=default_branch) 151 | assert isinstance(revs, DataFrame) 152 | assert "repository" in revs.columns 153 | assert len(revs) > 0 154 | -------------------------------------------------------------------------------- /docs/source/usecases.rst: -------------------------------------------------------------------------------- 1 | Use Cases and Examples 2 | ====================== 3 | 4 | Git-Pandas provides a powerful interface for analyzing Git repositories using pandas DataFrames. This guide demonstrates common use cases and provides practical examples. 5 | 6 | Basic Repository Analysis 7 | ------------------------- 8 | 9 | Repository Attributes 10 | ~~~~~~~~~~~~~~~~~~~~~ 11 | 12 | Get basic information about a repository: 13 | 14 | .. code-block:: python 15 | 16 | from gitpandas import Repository 17 | repo = Repository('/path/to/repo') 18 | 19 | # Get repository name 20 | print(repo.repo_name) 21 | 22 | # Check if repository is bare 23 | print(repo.is_bare()) 24 | 25 | # Get all tags 26 | print(repo.tags()) 27 | 28 | # Get all branches 29 | print(repo.branches()) 30 | 31 | # Get all revisions 32 | print(repo.revs()) 33 | 34 | # Get blame information 35 | print(repo.blame(include_globs=['*.py'])) 36 | 37 | Commit History Analysis 38 | ~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | Analyze commit patterns and history: 41 | 42 | .. code-block:: python 43 | 44 | # Get commit history 45 | commits_df = repo.commit_history() 46 | 47 | # Get file change history 48 | changes_df = repo.file_change_history() 49 | 50 | # Filter by file extension 51 | python_changes = repo.file_change_history(include_globs=['*.py']) 52 | 53 | # Filter by directory 54 | src_changes = repo.file_change_history(include_globs=['src/*']) 55 | 56 | # Get commits in tags 57 | tag_commits = repo.commits_in_tags() 58 | 59 | Project-Level Analysis 60 | ---------------------- 61 | 62 | Multiple Repository Analysis 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | Analyze multiple repositories simultaneously: 66 | 67 | .. code-block:: python 68 | 69 | from gitpandas import ProjectDirectory 70 | 71 | # Create project from multiple repositories 72 | project = ProjectDirectory([ 73 | 'git://github.com/user/repo1.git', 74 | 'git://github.com/user/repo2.git' 75 | ]) 76 | 77 | # Get repository information 78 | print(project.repo_information()) 79 | 80 | # Calculate bus factor 81 | print(project.bus_factor()) 82 | 83 | # Get file change history 84 | print(project.file_change_history()) 85 | 86 | # Get blame information 87 | print(project.blame()) 88 | 89 | Advanced Analysis 90 | ----------------- 91 | 92 | Cumulative Blame Analysis 93 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 94 | 95 | Track code ownership over time: 96 | 97 | .. 
code-block:: python 98 | 99 | # Get cumulative blame 100 | blame_df = repo.cumulative_blame() 101 | 102 | # Plot cumulative blame using pandas plotting 103 | import matplotlib.pyplot as plt 104 | blame_df.plot(x='date', y='loc', title='Cumulative Blame Over Time') 105 | plt.show() 106 | 107 | Bus Factor Analysis 108 | ~~~~~~~~~~~~~~~~~~~ 109 | 110 | Analyze project sustainability: 111 | 112 | .. code-block:: python 113 | 114 | # Calculate bus factor for repository 115 | bus_factor = repo.bus_factor() 116 | 117 | # Calculate file-wise bus factor (new in v2.5.0) 118 | file_bus_factor = repo.bus_factor(by='file') 119 | 120 | # Get detailed blame information 121 | blame_df = repo.blame(by='file') # Get file-level blame details 122 | 123 | # Analyze ownership patterns 124 | ownership_patterns = repo.blame(committer=True, by='repository') 125 | 126 | Performance Optimization 127 | ------------------------ 128 | 129 | Using Caching 130 | ~~~~~~~~~~~~~ 131 | 132 | Optimize performance with caching: 133 | 134 | .. code-block:: python 135 | 136 | from gitpandas import Repository 137 | from gitpandas.cache import EphemeralCache, DiskCache, RedisDFCache 138 | 139 | # Use in-memory caching 140 | cache = EphemeralCache(max_keys=1000) 141 | repo = Repository('/path/to/repo', cache_backend=cache) 142 | 143 | # Use persistent disk caching (new in v2.5.0) 144 | disk_cache = DiskCache('/tmp/gitpandas_cache.gz', max_keys=500) 145 | repo = Repository('/path/to/repo', cache_backend=disk_cache) 146 | 147 | # Or use Redis for distributed caching 148 | redis_cache = RedisDFCache( 149 | host='localhost', 150 | port=6379, 151 | db=12, 152 | ttl=3600 # Cache entries expire after 1 hour 153 | ) 154 | repo = Repository('/path/to/repo', cache_backend=redis_cache) 155 | 156 | Cache Management (New in v2.5.0) 157 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 158 | 159 | Manage cache performance and contents: 160 | 161 | .. code-block:: python 162 | 163 | # Get cache statistics 164 | stats = repo.get_cache_stats() 165 | print(f"Cache usage: {stats['global_cache_stats']['cache_usage_percent']:.1f}%") 166 | 167 | # Invalidate specific cache entries 168 | repo.invalidate_cache(keys=['commit_history']) 169 | 170 | # Clear all cache for this repository 171 | repo.invalidate_cache() 172 | 173 | # Warm cache for better performance 174 | result = repo.warm_cache(methods=['commit_history', 'blame'], limit=100) 175 | print(f"Created {result['cache_entries_created']} cache entries") 176 | 177 | Visualization Examples 178 | ---------------------- 179 | 180 | Commit Analysis 181 | ~~~~~~~~~~~~~~~ 182 | 183 | Visualize commit patterns: 184 | 185 | .. code-block:: python 186 | 187 | # Get commit history 188 | commit_df = repo.commit_history() 189 | 190 | # Plot commits over time using pandas 191 | commit_df.resample('D').size().plot( 192 | kind='bar', 193 | title='Commits per Day' 194 | ) 195 | plt.show() 196 | 197 | File Change Analysis 198 | ~~~~~~~~~~~~~~~~~~~~ 199 | 200 | Visualize file changes: 201 | 202 | .. 
code-block:: python 203 | 204 | # Get file change history 205 | changes_df = repo.file_change_history() 206 | 207 | # Plot changes over time using pandas 208 | changes_df.groupby('filename')['insertions'].sum().plot( 209 | kind='bar', 210 | title='Lines Added by File' 211 | ) 212 | plt.show() 213 | 214 | Best Practices 215 | -------------- 216 | 217 | * Use caching for expensive operations like blame analysis 218 | * Filter data early using include_globs/ignore_globs 219 | * Leverage pandas operations for analysis 220 | * Consider memory usage with large repositories 221 | * Use appropriate branch names (main/master) 222 | * Handle repository cleanup properly when using remote repositories 223 | 224 | For more examples and detailed API documentation, see the :doc:`repository` and :doc:`project` pages. 225 | -------------------------------------------------------------------------------- /tests/test_cache_key_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pandas as pd 5 | import pytest 6 | 7 | from gitpandas.cache import DiskCache, multicache 8 | 9 | 10 | class RepositoryMock: 11 | """Mock Repository class to test cache key generation consistency""" 12 | 13 | def __init__(self, working_dir="/mock/repo/path", cache_backend=None): 14 | self.working_dir = working_dir 15 | self.repo_name = working_dir # Simulate how Repository sets repo_name 16 | self.cache_backend = cache_backend 17 | self.execution_count = 0 18 | 19 | @multicache(key_prefix="list_files", key_list=["directory", "filter_regex"]) 20 | def list_files(self, directory=None, filter_regex=None, force_refresh=False): 21 | """Mock list_files method that demonstrates the caching behavior""" 22 | self.execution_count += 1 23 | return pd.DataFrame( 24 | { 25 | "file": [f"file{i}.txt" for i in range(3)], 26 | "directory": [directory] * 3, 27 | "call_number": [self.execution_count] * 3, 28 | } 29 | ) 30 | 31 | @multicache(key_prefix="complex_method", key_list=["param1", "param2", "param3"]) 32 | def complex_method(self, param1=None, param2=None, param3=None, force_refresh=False): 33 | """Method with multiple parameters to test key generation with many parameters""" 34 | self.execution_count += 1 35 | return pd.DataFrame( 36 | { 37 | "result": [f"result{i}" for i in range(2)], 38 | "param_values": [f"{param1}_{param2}_{param3}"] * 2, 39 | "call_number": [self.execution_count] * 2, 40 | } 41 | ) 42 | 43 | 44 | class TestCacheKeyConsistency: 45 | """Tests specifically focused on the cache key consistency issue""" 46 | 47 | @pytest.fixture 48 | def temp_cache_path(self): 49 | """Create temporary file path for cache""" 50 | fd, path = tempfile.mkstemp(suffix=".gz") 51 | os.close(fd) 52 | yield path 53 | if os.path.exists(path): 54 | os.unlink(path) 55 | 56 | def test_consistent_cache_keys(self, temp_cache_path): 57 | """Test that cache keys are consistent between method calls""" 58 | cache = DiskCache(filepath=temp_cache_path) 59 | repo = RepositoryMock(cache_backend=cache) 60 | 61 | # Capture cache keys generated 62 | original_set = cache.set 63 | captured_keys = [] 64 | 65 | def mock_set(k, v): 66 | captured_keys.append(k) 67 | return original_set(k, v) 68 | 69 | cache.set = mock_set 70 | 71 | # First call 72 | repo.list_files(directory="src", filter_regex="*.py") 73 | first_key = captured_keys[0] 74 | 75 | # Clear captured keys 76 | captured_keys.clear() 77 | 78 | # Second call with identical parameters 79 | repo.list_files(directory="src", 
filter_regex="*.py") 80 | 81 | # No key should be captured on second call (cache hit) 82 | assert len(captured_keys) == 0 83 | 84 | # Force refresh should use the same key 85 | repo.list_files(directory="src", filter_regex="*.py", force_refresh=True) 86 | assert len(captured_keys) == 1 87 | assert captured_keys[0] == first_key 88 | 89 | def test_fix_resolves_reported_issue(self, temp_cache_path): 90 | """Test specifically addressing the reported issue""" 91 | cache = DiskCache(filepath=temp_cache_path) 92 | repo = RepositoryMock(working_dir="/absolute/path/to/repo", cache_backend=cache) 93 | 94 | # First call 95 | result1 = repo.list_files() 96 | assert repo.execution_count == 1 97 | 98 | # Second call to the same method (should use cache) 99 | result2 = repo.list_files() 100 | assert repo.execution_count == 1 # Should NOT increment 101 | 102 | # Results should match 103 | pd.testing.assert_frame_equal(result1, result2) 104 | 105 | def test_varied_path_formats(self, temp_cache_path): 106 | """Test with different path formats to ensure key consistency""" 107 | cache = DiskCache(filepath=temp_cache_path) 108 | 109 | # Different repo path formats 110 | repo1 = RepositoryMock(working_dir="/path/to/repo", cache_backend=cache) 111 | repo2 = RepositoryMock(working_dir="/path/to/repo/", cache_backend=cache) # Extra slash 112 | 113 | # Capture all keys set in the cache 114 | original_set = cache.set 115 | captured_keys = [] 116 | 117 | def mock_set(k, v): 118 | captured_keys.append(k) 119 | return original_set(k, v) 120 | 121 | cache.set = mock_set 122 | 123 | # Call method on first repo 124 | repo1.list_files(directory="src") 125 | assert repo1.execution_count == 1 126 | key1 = captured_keys[0] 127 | 128 | # Clear keys 129 | captured_keys.clear() 130 | 131 | # Call on second repo with same parameters - should generate a different key 132 | # due to different repo_name ("/path/to/repo" vs "/path/to/repo/") 133 | repo2.list_files(directory="src") 134 | assert repo2.execution_count == 1 # Should increment for repo2 135 | key2 = captured_keys[0] 136 | 137 | # Keys should be different because repo_name is different 138 | assert key1 != key2 139 | assert "||/path/to/repo||" in key1 140 | assert "||/path/to/repo/||" in key2 141 | 142 | def test_complex_key_generation(self, temp_cache_path): 143 | """Test key generation with complex parameters""" 144 | cache = DiskCache(filepath=temp_cache_path) 145 | repo = RepositoryMock(cache_backend=cache) 146 | 147 | # Capture keys 148 | original_set = cache.set 149 | captured_keys = [] 150 | 151 | def mock_set(k, v): 152 | captured_keys.append(k) 153 | return original_set(k, v) 154 | 155 | cache.set = mock_set 156 | 157 | # Call with complex parameters 158 | repo.complex_method(param1="value1", param2="value2", param3="value3") 159 | 160 | # Check key format 161 | key = captured_keys[0] 162 | assert key.startswith("complex_method||") 163 | assert "value1_" in key 164 | assert "value2_" in key 165 | assert "value3" in key 166 | 167 | # Call again with different order of parameters in the call 168 | # Python should normalize kwargs, so the key should be the same 169 | captured_keys.clear() 170 | repo.complex_method(param3="value3", param1="value1", param2="value2", force_refresh=True) 171 | 172 | # Key should be the same despite different parameter order 173 | assert captured_keys[0] == key 174 | -------------------------------------------------------------------------------- /examples/release_analytics.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Example demonstrating release analytics features. 3 | 4 | This example shows how to use gitpandas to analyze changes between release tags. 5 | """ 6 | 7 | import pandas as pd 8 | 9 | from gitpandas import Repository 10 | 11 | # --- Instantiate Repository --- 12 | # Using the gitpandas repository URI as an example 13 | # Replace with your repository URI or local path 14 | repo_uri = "https://github.com/wdm0006/git-pandas.git" 15 | print(f"Attempting to instantiate Repository for: {repo_uri}") 16 | 17 | try: 18 | # Instantiate the Repository object. 19 | # For remote repositories, gitpandas will clone it to a temporary directory. 20 | # verbose=True can be helpful for debugging, but is optional here. 21 | repo = Repository(working_dir=repo_uri, verbose=False) 22 | print(f"Successfully instantiated Repository for: {repo.repo_name}") 23 | print(f"Cloned to temporary directory: {repo.git_dir}") 24 | except Exception as e: 25 | print(f"Error instantiating repository: {e}") 26 | repo = None # Ensure repo is None if instantiation fails 27 | 28 | # --- Call release_tag_summary --- 29 | if repo: 30 | print("\n--- Release Tag Summary ---") 31 | try: 32 | # This method analyzes the repository's tags to provide a summary of 33 | # changes between each tagged release. 34 | # It looks at the differences from the previous tag to the current one. 35 | # Output includes diff statistics (insertions, deletions), 36 | # committers, authors, and files changed during that period. 37 | # You can use tag_glob to filter for specific tag patterns (e.g., 'v*.*'). 38 | release_summary_df = repo.release_tag_summary() 39 | 40 | if not release_summary_df.empty: 41 | print("Release summary retrieved successfully:") 42 | # Display the DataFrame. Pandas default display might be wide, 43 | # but for an example, direct print is usually fine. 44 | # For better display in production, consider setting pandas display options 45 | print(release_summary_df) 46 | 47 | # Example of how to access specific information: 48 | if "tag" in release_summary_df.columns and len(release_summary_df) > 1: 49 | # Show files changed between the first two listed tags (if available) 50 | # Note: The first tag in the summary won't have "previous tag" data. 51 | second_tag_entry = release_summary_df.iloc[1] # Second tag in the sorted list 52 | print(f"\nExample: Files changed for tag '{second_tag_entry['tag']}' (since previous tag):") 53 | if isinstance(second_tag_entry["files"], list) and second_tag_entry["files"]: 54 | for file_path in second_tag_entry["files"]: 55 | print(f" - {file_path}") 56 | else: 57 | print(" No files listed or files column is not a list.") 58 | 59 | elif release_summary_df is not None: # Empty DataFrame 60 | print("No release summary data returned. The repository might not have tags, or no tags match the glob.") 61 | else: # None was returned, indicating an issue 62 | print("Failed to retrieve release summary (method returned None).") 63 | 64 | except Exception as e: 65 | print(f"Error calling release_tag_summary: {e}") 66 | 67 | # --- Optional: Demonstrate get_commit_content --- 68 | # The release_tag_summary gives you information about *what* changed (files, stats). 69 | # If you need to see the *actual content* of a specific commit that was part of a 70 | # release (perhaps a commit listed by commits_in_tags, or the commit SHA 71 | # directly associated with a tag), you can use get_commit_content. 
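# As a hypothetical illustration of the call shape (the SHA here is a
# placeholder; the code below looks up a real SHA from the release summary):
#
#     content_df = repo.get_commit_content(rev="<commit sha>")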
72 | 73 | print("\n--- Optional: Get Content of a Specific Commit ---") 74 | # Note: For a real scenario, you'd get a relevant commit SHA from your analysis, 75 | # for example, from the 'commit_sha' column in release_summary_df or from repo.commits_in_tags(). 76 | # As this is a brief example, we'll try to pick one from the summary if possible, 77 | # otherwise, we'll use a placeholder. 78 | target_commit_sha = None 79 | if "release_summary_df" in locals() and not release_summary_df.empty and "commit_sha" in release_summary_df.columns: 80 | # Let's try to get the commit SHA of the first tag listed (if any) 81 | # This commit is what the tag points to. 82 | potential_sha = release_summary_df["commit_sha"].iloc[0] 83 | if pd.notna(potential_sha): # Check if the SHA is not NaN or None 84 | target_commit_sha = potential_sha 85 | print(f"Attempting to get content for commit SHA (from first tag's commit_sha): {target_commit_sha}") 86 | else: 87 | print("Could not get a valid commit SHA from the release_summary_df's first entry.") 88 | 89 | if not target_commit_sha: 90 | target_commit_sha = "PLACEHOLDER_COMMIT_SHA" # Replace with an actual commit SHA from the repo 91 | print(f"Using placeholder commit SHA: {target_commit_sha}. Replace with a real one for actual output.") 92 | 93 | if target_commit_sha != "PLACEHOLDER_COMMIT_SHA": 94 | try: 95 | # The 'rev' parameter takes the commit SHA. 96 | commit_content_df = repo.get_commit_content(rev=target_commit_sha) 97 | 98 | if commit_content_df is not None and not commit_content_df.empty: 99 | print(f"Content changes for commit {target_commit_sha} (showing first 5 lines):") 100 | # Displaying only a part of the DataFrame for brevity. 101 | # Columns typically include: 'file_path', 'change_type', 'diff', 'old_blob_sha', 'new_blob_sha' 102 | print(commit_content_df.head()) 103 | elif commit_content_df is not None: # Empty DataFrame 104 | print( 105 | f"No content changes (e.g. diffs) found for commit {target_commit_sha}. " 106 | f"This can be normal for merge commits with no textual changes, " 107 | f"or if the commit only modified tree structure." 108 | ) 109 | else: # None was returned 110 | print( 111 | f"Failed to get content for commit {target_commit_sha} (method returned None). " 112 | f"Could be an invalid SHA or repository issue." 113 | ) 114 | except Exception as e: 115 | print(f"Error calling get_commit_content for {target_commit_sha}: {e}") 116 | else: 117 | print("Skipping get_commit_content due to placeholder SHA or if no valid SHA was found.") 118 | 119 | else: 120 | print("\nSkipping release_tag_summary and get_commit_content because repository instantiation failed.") 121 | -------------------------------------------------------------------------------- /docs/AUDIT_REPORT.md: -------------------------------------------------------------------------------- 1 | # Git-Pandas Documentation Audit Report 2 | 3 | ## Executive Summary 4 | 5 | The documentation audit has been completed for git-pandas v2.5.0. Overall, the documentation is comprehensive and well-structured, but several inconsistencies and outdated information were identified and corrected. 6 | 7 | ## Issues Found and Corrected 8 | 9 | ### ✅ FIXED - Critical Issues 10 | 11 | 1. **Version Information Inconsistency** 12 | - **Issue**: conf.py showed version "2.2.1" while current version is "2.5.0" 13 | - **Fix**: Updated conf.py to reflect correct version "2.5.0" 14 | - **Files**: `docs/source/conf.py` 15 | 16 | 2. 
**Missing Performance Documentation** 17 | - **Issue**: performance.rst was created but not included in main table of contents 18 | - **Fix**: Added performance.rst to index.rst toctree 19 | - **Files**: `docs/source/index.rst` 20 | 21 | 3. **Incomplete Bus Factor Documentation** 22 | - **Issue**: Bus factor docs didn't mention file-wise analysis option 23 | - **Fix**: Updated repository.rst and project.rst to show 'file' option 24 | - **Files**: `docs/source/repository.rst`, `docs/source/project.rst` 25 | 26 | ### ✅ FIXED - Content Updates 27 | 28 | 4. **Cache System Documentation** 29 | - **Issue**: Overview didn't mention cache management features 30 | - **Fix**: Added cache management and statistics to feature list 31 | - **Files**: `docs/source/cache.rst` 32 | 33 | 5. **Use Cases Missing New Features** 34 | - **Issue**: No examples of file-wise bus factor or cache management 35 | - **Fix**: Added comprehensive examples for v2.5.0 features 36 | - **Files**: `docs/source/usecases.rst` 37 | 38 | 6. **Contributors Page Outdated** 39 | - **Issue**: Didn't reflect recently completed features 40 | - **Fix**: Added "Recently Completed (v2.5.0)" section 41 | - **Files**: `docs/source/contributors.rst` 42 | 43 | ## Documentation Quality Assessment 44 | 45 | ### ✅ EXCELLENT - Well Documented Areas 46 | 47 | 1. **Cache System** (`cache.rst`) 48 | - Comprehensive coverage of all cache backends 49 | - Clear examples for each cache type 50 | - Good coverage of cache management features 51 | - Proper API reference with autodoc 52 | 53 | 2. **Remote Operations** (`remote_operations.rst`) 54 | - Detailed documentation of safe_fetch_remote 55 | - Complete coverage of cache warming features 56 | - Good examples and error handling documentation 57 | - Comprehensive return value documentation 58 | 59 | 3. **Performance Guide** (`performance.rst`) 60 | - Thorough performance optimization strategies 61 | - Benchmark data and real-world examples 62 | - Best practices and anti-patterns 63 | - Memory management guidance 64 | 65 | ### ✅ GOOD - Generally Well Documented 66 | 67 | 4. **Repository API** (`repository.rst`) 68 | - Good coverage of core methods 69 | - Clear parameter documentation 70 | - Could benefit from more advanced examples 71 | 72 | 5. **Project Directory** (`project.rst`) 73 | - Adequate coverage of multi-repository features 74 | - Good examples of different initialization methods 75 | 76 | 6. **Use Cases** (`usecases.rst`) 77 | - Good practical examples 78 | - Now includes v2.5.0 features 79 | - Could use more visualization examples 80 | 81 | ### ✅ ADEQUATE - Basic Documentation 82 | 83 | 7. **Index Page** (`index.rst`) 84 | - Clear quick start examples 85 | - Good feature overview 86 | - Proper navigation structure 87 | 88 | 8. **Contributors Guide** (`contributors.rst`) 89 | - Standard contribution guidelines 90 | - Now reflects current development status 91 | 92 | ## Remaining Recommendations 93 | 94 | ### High Priority 95 | 96 | 1. **README.md Synchronization** 97 | - Update Python version requirements (currently claims 2.7+ support) 98 | - Add examples of new v2.5.0 features 99 | - Update installation instructions for optional dependencies 100 | 101 | 2. **API Documentation Verification** 102 | - Ensure all public methods have proper docstrings 103 | - Verify autodoc is picking up all new methods 104 | - Check that method signatures in docs match implementation 105 | 106 | ### Medium Priority 107 | 108 | 3. 
**Cross-Reference Verification** 109 | - Verify all internal links work correctly 110 | - Check that all referenced examples exist 111 | - Ensure consistent terminology across documents 112 | 113 | 4. **Example Code Testing** 114 | - Systematically test all code examples in documentation 115 | - Add automated testing for documentation examples 116 | - Ensure examples use realistic file paths and parameters 117 | 118 | ### Low Priority 119 | 120 | 5. **Enhancement Opportunities** 121 | - Add more visualization examples using matplotlib/seaborn 122 | - Include performance benchmarks in appropriate sections 123 | - Add troubleshooting section for common issues 124 | 125 | ## Testing Performed 126 | 127 | ### ✅ Verified Working 128 | - Basic imports work correctly 129 | - Cache management methods exist and are callable 130 | - New features are accessible through public APIs 131 | - Documentation structure builds correctly 132 | 133 | ### Manual Verification Needed 134 | - All code examples execute without errors 135 | - External links are valid and accessible 136 | - Cross-references resolve correctly 137 | 138 | ## Files Modified in This Audit 139 | 140 | 1. `docs/source/conf.py` - Version update to 2.5.0 141 | 2. `docs/source/index.rst` - Added performance.rst to toctree 142 | 3. `docs/source/repository.rst` - Updated bus_factor documentation 143 | 4. `docs/source/project.rst` - Updated bus_factor options 144 | 5. `docs/source/cache.rst` - Enhanced feature overview 145 | 6. `docs/source/usecases.rst` - Added v2.5.0 feature examples 146 | 7. `docs/source/contributors.rst` - Added recently completed features section 147 | 148 | ## Overall Assessment 149 | 150 | **Grade: B+ (Good with room for improvement)** 151 | 152 | The documentation is comprehensive and covers all major features well. The recent additions for v2.5.0 are well-documented, particularly the cache management and remote operations features. The main areas for improvement are: 153 | 154 | 1. Synchronizing README.md with current documentation 155 | 2. Ensuring all code examples are tested and working 156 | 3. Verifying cross-references and links 157 | 158 | The documentation successfully serves its purpose of helping users understand and use git-pandas effectively, with clear examples and comprehensive API coverage. 159 | 160 | ## Next Steps 161 | 162 | 1. **Immediate**: Update README.md to match documentation 163 | 2. **Short-term**: Test all documentation examples 164 | 3. **Medium-term**: Add automated testing for documentation examples 165 | 4. **Long-term**: Consider adding more advanced use case examples 166 | 167 | --- 168 | 169 | *Audit completed: January 2025* 170 | *Documentation version: 2.5.0* 171 | *Status: Ready for release with minor README updates needed* -------------------------------------------------------------------------------- /examples/cache_timestamps.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of accessing cache timestamp information. 3 | 4 | This example demonstrates how users can check when cache entries were populated 5 | without any changes to the Repository or ProjectDirectory API. 
6 | """ 7 | 8 | import os 9 | import time 10 | from datetime import datetime, timezone 11 | 12 | from definitions import GIT_PANDAS_DIR 13 | 14 | from gitpandas import Repository 15 | from gitpandas.cache import DiskCache, EphemeralCache 16 | 17 | 18 | def demonstrate_cache_timestamps(): 19 | """Demonstrate accessing cache timestamp information.""" 20 | print("Cache Timestamp Information Demo") 21 | print("=" * 40) 22 | 23 | # Create a repository with a cache backend 24 | cache = EphemeralCache(max_keys=100) 25 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 26 | 27 | print(f"Repository: {repo.repo_name}") 28 | print(f"Cache backend: {type(cache).__name__}") 29 | print() 30 | 31 | # Call some methods to populate the cache 32 | print("Populating cache with repository data...") 33 | 34 | print(" - Getting commit history...") 35 | repo.commit_history(limit=10) 36 | 37 | print(" - Getting file list...") 38 | repo.list_files() 39 | 40 | print(" - Getting blame information...") 41 | repo.blame() 42 | 43 | print(f"Cache now contains {len(cache._cache)} entries") 44 | print() 45 | 46 | # Show cache information 47 | print("Cache Contents and Timestamps:") 48 | print("-" * 40) 49 | 50 | cached_keys = cache.list_cached_keys() 51 | for entry in cached_keys: 52 | print(f"Key: {entry['key']}") 53 | print(f" Cached at: {entry['cached_at'].strftime('%Y-%m-%d %H:%M:%S UTC')}") 54 | print(f" Age: {entry['age_seconds']:.1f} seconds") 55 | print() 56 | 57 | # Wait a moment and call one method again 58 | print("Waiting 2 seconds and refreshing commit history...") 59 | time.sleep(2) 60 | 61 | # This should hit the cache 62 | repo.commit_history(limit=10) 63 | 64 | # This should create a new cache entry 65 | repo.commit_history(limit=20) 66 | 67 | print("\nUpdated cache contents:") 68 | print("-" * 40) 69 | 70 | cached_keys = cache.list_cached_keys() 71 | for entry in cached_keys: 72 | print(f"Key: {entry['key']}") 73 | print(f" Cached at: {entry['cached_at'].strftime('%Y-%m-%d %H:%M:%S UTC')}") 74 | print(f" Age: {entry['age_seconds']:.1f} seconds") 75 | print() 76 | 77 | # Demonstrate getting specific cache info 78 | print("Getting specific cache information:") 79 | print("-" * 40) 80 | 81 | # Find a commit_history cache key 82 | commit_keys = [k for k in cached_keys if "commit_history" in k["key"]] 83 | if commit_keys: 84 | key = commit_keys[0]["key"] 85 | info = cache.get_cache_info(key) 86 | if info: 87 | print(f"Cache info for key '{key}':") 88 | print(f" Cached at: {info['cached_at']}") 89 | print(f" Age: {info['age_minutes']:.2f} minutes") 90 | print(f" Age: {info['age_hours']:.4f} hours") 91 | 92 | 93 | def demonstrate_disk_cache_persistence(): 94 | """Demonstrate cache persistence with DiskCache.""" 95 | print("\n" + "=" * 50) 96 | print("Disk Cache Persistence Demo") 97 | print("=" * 50) 98 | 99 | cache_file = "/tmp/gitpandas_demo_cache.gz" 100 | 101 | # Clean up any existing cache file 102 | if os.path.exists(cache_file): 103 | os.remove(cache_file) 104 | 105 | print("Creating repository with DiskCache...") 106 | cache = DiskCache(filepath=cache_file, max_keys=50) 107 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 108 | 109 | # Populate cache 110 | print("Populating cache...") 111 | repo.commit_history(limit=5) 112 | repo.list_files() 113 | 114 | print(f"Cache file created: {cache_file}") 115 | print(f"Cache contains {len(cache._cache)} entries") 116 | 117 | # Show initial cache info 118 | cached_keys = 
cache.list_cached_keys() 119 | print("\nInitial cache entries:") 120 | for entry in cached_keys: 121 | print(f" {entry['key']}: {entry['cached_at'].strftime('%H:%M:%S')}") 122 | 123 | # Create a new cache instance from the same file 124 | print("\nCreating new cache instance from saved file...") 125 | cache2 = DiskCache(filepath=cache_file, max_keys=50) 126 | 127 | print(f"Loaded cache contains {len(cache2._cache)} entries") 128 | 129 | # Show loaded cache info 130 | cached_keys2 = cache2.list_cached_keys() 131 | print("\nLoaded cache entries:") 132 | for entry in cached_keys2: 133 | print(f" {entry['key']}: {entry['cached_at'].strftime('%H:%M:%S')} (age: {entry['age_seconds']:.1f}s)") 134 | 135 | # Clean up 136 | if os.path.exists(cache_file): 137 | os.remove(cache_file) 138 | print(f"\nCleaned up cache file: {cache_file}") 139 | 140 | 141 | def demonstrate_cache_with_force_refresh(): 142 | """Demonstrate cache behavior with force_refresh.""" 143 | print("\n" + "=" * 50) 144 | print("Force Refresh Demo") 145 | print("=" * 50) 146 | 147 | cache = EphemeralCache(max_keys=10) 148 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 149 | 150 | print("Getting commit history (first time)...") 151 | start_time = datetime.now(timezone.utc) 152 | repo.commit_history(limit=5) 153 | 154 | time.sleep(1) 155 | 156 | print("Getting commit history (should use cache)...") 157 | repo.commit_history(limit=5) 158 | 159 | time.sleep(1) 160 | 161 | print("Getting commit history with force_refresh=True...") 162 | repo.commit_history(limit=5, force_refresh=True) 163 | 164 | print("\nCache timeline:") 165 | cached_keys = cache.list_cached_keys() 166 | for entry in cached_keys: 167 | if "commit_history" in entry["key"]: 168 | age_from_start = (entry["cached_at"] - start_time).total_seconds() 169 | print(f" Commit history cached at: +{age_from_start:.1f}s from start") 170 | print(f" Current age: {entry['age_seconds']:.1f}s") 171 | 172 | 173 | if __name__ == "__main__": 174 | try: 175 | demonstrate_cache_timestamps() 176 | demonstrate_disk_cache_persistence() 177 | demonstrate_cache_with_force_refresh() 178 | 179 | print("\n" + "=" * 50) 180 | print("Summary:") 181 | print("- Cache backends now track when entries were created") 182 | print("- No changes to Repository or ProjectDirectory API") 183 | print("- Users can access cache info via cache_backend.get_cache_info()") 184 | print("- Users can list all cached keys via cache_backend.list_cached_keys()") 185 | print("- Backward compatibility maintained with existing caches") 186 | 187 | except Exception as e: 188 | print(f"Error running demo: {e}") 189 | print("Make sure you're running this from the git-pandas directory") 190 | -------------------------------------------------------------------------------- /tests/test_Repository/test_punchcard.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def local_repo(tmp_path, default_branch): 10 | """Fixture for a local repository with commits at different times.""" 11 | # Create a temporary directory 12 | repo_dir = tmp_path / "repository1" 13 | repo_dir.mkdir() 14 | 15 | # Initialize a git repo 16 | grepo = git.Repo.init(str(repo_dir)) 17 | 18 | # Configure git user 19 | grepo.git.config("user.name", "Test User") 20 | grepo.git.config("user.email", "test@example.com") 21 | 22 | # Create and checkout default branch 23 
| grepo.git.checkout("-b", default_branch) 24 | 25 | # Add a README file 26 | readme_path = repo_dir / "README.md" 27 | readme_path.write_text("Sample README for a sample project\n") 28 | 29 | # Commit it 30 | grepo.git.add("README.md") 31 | 32 | # Set the environment variables for the commit time 33 | # Monday morning 34 | env = { 35 | "GIT_AUTHOR_DATE": "2023-01-02T09:30:00", 36 | "GIT_COMMITTER_DATE": "2023-01-02T09:30:00", 37 | } 38 | grepo.git.commit(m="first commit", env=env) 39 | 40 | # Add some Python files at different times 41 | for idx, (day, hour) in enumerate( 42 | [ 43 | # Tuesday afternoon 44 | (3, 14), 45 | # Wednesday evening 46 | (4, 19), 47 | # Thursday night 48 | (5, 22), 49 | # Friday morning 50 | (6, 8), 51 | # Saturday afternoon 52 | (7, 15), 53 | ] 54 | ): 55 | py_file = repo_dir / f"file_{idx}.py" 56 | py_file.write_text("import sys\nimport os\n") 57 | 58 | grepo.git.add(all=True) 59 | 60 | # Set the environment variables for the commit time 61 | env = { 62 | "GIT_AUTHOR_DATE": f"2023-01-{day:02d}T{hour:02d}:00:00", 63 | "GIT_COMMITTER_DATE": f"2023-01-{day:02d}T{hour:02d}:00:00", 64 | } 65 | grepo.git.commit(m=f"adding file_{idx}.py", env=env) 66 | 67 | # Create the Repository object 68 | git_pandas_repo = Repository(working_dir=str(repo_dir), verbose=True, default_branch=default_branch) 69 | 70 | yield git_pandas_repo 71 | 72 | # Cleanup 73 | git_pandas_repo.__del__() 74 | 75 | 76 | class TestPunchcard: 77 | def test_punchcard_basic(self, local_repo, default_branch): 78 | """Test basic functionality of the punchcard method.""" 79 | punchcard = local_repo.punchcard(branch=default_branch) 80 | 81 | # Check the shape and columns 82 | assert isinstance(punchcard, pd.DataFrame) 83 | assert punchcard.shape[0] > 0 84 | assert "hour_of_day" in punchcard.columns 85 | assert "day_of_week" in punchcard.columns 86 | assert "lines" in punchcard.columns 87 | assert "insertions" in punchcard.columns 88 | assert "deletions" in punchcard.columns 89 | assert "net" in punchcard.columns 90 | 91 | # Check that we have data for the days and hours we committed 92 | days_hours = [ 93 | (0, 9), # Monday 9am 94 | (1, 14), # Tuesday 2pm 95 | (2, 19), # Wednesday 7pm 96 | (3, 22), # Thursday 10pm 97 | (4, 8), # Friday 8am 98 | (5, 15), # Saturday 3pm 99 | ] 100 | 101 | for day, hour in days_hours: 102 | matching_rows = punchcard[(punchcard["day_of_week"] == day) & (punchcard["hour_of_day"] == hour)] 103 | if len(matching_rows) > 0: 104 | assert matching_rows["net"].values[0] > 0 105 | 106 | def test_punchcard_normalize(self, local_repo, default_branch): 107 | """Test the normalize parameter of the punchcard method.""" 108 | # Get punchcard without normalization 109 | local_repo.punchcard(branch=default_branch) 110 | 111 | # Get punchcard with normalization by value 112 | punchcard_norm = local_repo.punchcard(branch=default_branch, normalize=1.0) 113 | 114 | # Check that the normalized values are between 0 and 1 115 | assert punchcard_norm["net"].max() <= 1.0 116 | assert punchcard_norm["net"].min() >= 0.0 117 | 118 | # Check that the row normalization works correctly 119 | for day in range(7): 120 | day_rows = punchcard_norm[punchcard_norm["day_of_week"] == day] 121 | if len(day_rows) > 0 and day_rows["net"].sum() > 0: 122 | # If there are commits on this day, the max value should be 1.0 or close to it 123 | assert day_rows["net"].max() <= 1.0 124 |
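# Aside: outside the test suite, a punchcard DataFrame like the ones asserted
# on above can be rendered with the bundled helper in
# gitpandas/utilities/plotting.py. A minimal sketch (assuming a local clone
# with a 'master' branch and matplotlib installed):
#
#     from gitpandas import Repository
#     from gitpandas.utilities.plotting import plot_punchcard
#
#     pc = Repository(working_dir=".", default_branch="master").punchcard(branch="master")
#     plot_punchcard(pc, metric="lines", title="punchcard")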
125 | def test_punchcard_by_parameter(self, local_repo, default_branch): 126 | """Test the 'by' parameter of the punchcard method.""" 127 | # Test with by='committer' 128 | punchcard_committer = local_repo.punchcard(branch=default_branch, by="committer") 129 | 130 | # Check that we have the committer column 131 | assert "committer" in punchcard_committer.columns 132 | 133 | # Test with by='repository' 134 | punchcard_repo = local_repo.punchcard(branch=default_branch, by="repository") 135 | 136 | # Check that we have the repository column 137 | assert "repository" in punchcard_repo.columns 138 | 139 | def test_punchcard_with_globs(self, local_repo, default_branch): 140 | """Test the ignore_globs and include_globs parameters.""" 141 | # Get punchcard for all files 142 | punchcard_all = local_repo.punchcard(branch=default_branch) 143 | 144 | # Get punchcard ignoring Python files 145 | punchcard_no_py = local_repo.punchcard(branch=default_branch, ignore_globs=["*.py"]) 146 | 147 | # Check that we have fewer lines in the filtered punchcard 148 | assert punchcard_no_py["lines"].sum() < punchcard_all["lines"].sum() 149 | 150 | # Get punchcard including only Python files 151 | punchcard_only_py = local_repo.punchcard(branch=default_branch, include_globs=["*.py"]) 152 | 153 | # Check that we have fewer lines than the full punchcard 154 | assert punchcard_only_py["lines"].sum() < punchcard_all["lines"].sum() 155 | 156 | # Check that the sum of the filtered punchcards equals the total 157 | assert punchcard_no_py["lines"].sum() + punchcard_only_py["lines"].sum() == punchcard_all["lines"].sum() 158 | 159 | def test_punchcard_with_limit(self, local_repo, default_branch): 160 | """Test the limit parameter of the punchcard method.""" 161 | # Get punchcard with all commits 162 | punchcard_all = local_repo.punchcard(branch=default_branch) 163 | 164 | # Get punchcard with limited commits 165 | punchcard_limited = local_repo.punchcard(branch=default_branch, limit=3) 166 | 167 | # Check that we have fewer lines in the limited punchcard 168 | assert punchcard_limited["lines"].sum() <= punchcard_all["lines"].sum() 169 | 170 | def test_punchcard_with_days(self, local_repo, default_branch): 171 | """Test the days parameter of the punchcard method.""" 172 | # Get punchcard with all commits 173 | local_repo.punchcard(branch=default_branch) 174 | 175 | # Get punchcard with commits from the last 2 days 176 | # Since our test data is from 2023, this should return an empty DataFrame 177 | punchcard_recent = local_repo.punchcard(branch=default_branch, days=2) 178 | 179 | # Check that we have no lines in the recent punchcard 180 | assert punchcard_recent["lines"].sum() == 0 181 | -------------------------------------------------------------------------------- /tests/test_Repository/test_file_operations.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def local_repo(tmp_path, default_branch): 10 | """Create a local git repository with various file types and structures.""" 11 | repo_path = tmp_path / "test_repo" 12 | repo_path.mkdir() 13 | repo = git.Repo.init(repo_path) 14 | 15 | # Configure git user 16 | repo.config_writer().set_value("user", "name", "Test User").release() 17 | repo.config_writer().set_value("user", "email", "test@example.com").release() 18 | 19 | # Create and checkout default branch 20 | repo.git.checkout("-b", default_branch) 21 | 22 | # Create and checkout master branch 23 | repo.git.checkout("-b", "master") 24 | 25 | # Create initial structure 26 | (repo_path / "src").mkdir() 27 | (repo_path / "docs").mkdir() 28 | (repo_path / "tests").mkdir() 29 | 30 | # Create various files 31 | files
= { 32 | "README.md": "# Test Repository\nA test repository for gitpandas.", 33 | "src/main.py": "def main():\n print('Hello, World!')\n return True", 34 | "src/utils.py": "def helper():\n return 'helper'", 35 | "docs/index.md": "# Documentation\nThis is the documentation.", 36 | "tests/test_main.py": "def test_main():\n assert True", 37 | ".gitignore": "*.pyc\n__pycache__/\n.DS_Store", 38 | } 39 | 40 | # Create and commit files 41 | for path, content in files.items(): 42 | file_path = repo_path / path 43 | file_path.write_text(content) 44 | repo.index.add([str(file_path)]) 45 | 46 | repo.index.commit("Initial commit") 47 | 48 | # Create some ignored files 49 | (repo_path / "src/main.pyc").write_text("compiled python") 50 | (repo_path / "src/__pycache__").mkdir() 51 | (repo_path / "src/__pycache__/main.cpython-39.pyc").write_text("cached python") 52 | 53 | # Make a change to test commit content 54 | main_py = repo_path / "src/main.py" 55 | main_py.write_text("def main():\n print('Hello, Universe!')\n return True") 56 | repo.index.add([str(main_py)]) 57 | commit = repo.index.commit("Update greeting") 58 | 59 | return {"repo_path": repo_path, "repo": Repository(working_dir=str(repo_path), default_branch=default_branch), "last_commit": commit.hexsha} 60 | 61 | 62 | class TestFileOperations: 63 | def test_list_files(self, local_repo, default_branch): 64 | """Test listing files in the repository.""" 65 | repo = local_repo["repo"] 66 | 67 | # Get all files 68 | files = repo.list_files() 69 | 70 | # Check basic DataFrame properties 71 | assert isinstance(files, pd.DataFrame) 72 | assert "file" in files.columns 73 | assert "mode" in files.columns 74 | assert "type" in files.columns 75 | assert "sha" in files.columns 76 | assert "repository" in files.columns 77 | 78 | # Check that we have the expected files 79 | file_paths = set(files["file"].values) 80 | expected_files = { 81 | "README.md", 82 | "src/main.py", 83 | "src/utils.py", 84 | "docs/index.md", 85 | "tests/test_main.py", 86 | ".gitignore", 87 | } 88 | assert file_paths == expected_files 89 | 90 | # Check that ignored files are not included 91 | assert "src/main.pyc" not in file_paths 92 | assert "src/__pycache__/main.cpython-39.pyc" not in file_paths 93 | 94 | # Check file types 95 | assert all(files["type"] == "blob") # All should be files, not trees 96 | 97 | # Check file modes (should be regular files) 98 | assert all(files["mode"].isin(["100644"])) 99 | 100 | def test_get_file_content(self, local_repo, default_branch): 101 | """Test getting file content from the repository.""" 102 | repo = local_repo["repo"] 103 | 104 | # Test getting content of an existing file 105 | content = repo.get_file_content("src/main.py") 106 | assert content == "def main():\n print('Hello, Universe!')\n return True" 107 | 108 | # Test getting content at a specific revision (first commit) 109 | first_content = repo.get_file_content("src/main.py", rev="HEAD^") 110 | assert first_content == "def main():\n print('Hello, World!')\n return True" 111 | 112 | # Test getting content of a non-existent file 113 | assert repo.get_file_content("nonexistent.txt") is None 114 | 115 | # Test getting content of an ignored file 116 | assert repo.get_file_content("src/main.pyc") is None 117 | 118 | # Test getting content with invalid revision 119 | assert repo.get_file_content("src/main.py", rev="invalid_rev") is None 120 | 121 | def test_get_commit_content(self, local_repo, default_branch): 122 | """Test getting detailed content changes from a commit.""" 123 | repo = 
local_repo["repo"] 124 | commit_sha = local_repo["last_commit"] 125 | 126 | # Get changes from the last commit 127 | changes = repo.get_commit_content(commit_sha) 128 | 129 | # Check basic DataFrame properties 130 | assert isinstance(changes, pd.DataFrame) 131 | assert "file" in changes.columns 132 | assert "change_type" in changes.columns 133 | assert "old_line_num" in changes.columns 134 | assert "new_line_num" in changes.columns 135 | assert "content" in changes.columns 136 | assert "repository" in changes.columns 137 | 138 | # Check that we have the expected changes 139 | assert len(changes) > 0 140 | file_changes = changes[changes["file"] == "src/main.py"] 141 | assert len(file_changes) > 0 142 | 143 | # Check for removed line 144 | removed = file_changes[file_changes["old_line_num"].notna()] 145 | assert len(removed) == 1 146 | assert "Hello, World!" in removed.iloc[0]["content"] 147 | 148 | # Check for added line 149 | added = file_changes[file_changes["new_line_num"].notna()] 150 | assert len(added) == 1 151 | assert "Hello, Universe!" in added.iloc[0]["content"] 152 | 153 | # Test with glob filters 154 | # Should find no changes when excluding .py files 155 | filtered = repo.get_commit_content(commit_sha, ignore_globs=["*.py"]) 156 | assert len(filtered) == 0 157 | 158 | # Should find changes when including only .py files 159 | filtered = repo.get_commit_content(commit_sha, include_globs=["*.py"]) 160 | assert len(filtered) > 0 161 | 162 | # Test with invalid commit 163 | invalid = repo.get_commit_content("invalid_sha") 164 | assert len(invalid) == 0 165 | 166 | def test_file_change_rates_index_ambiguity(self, local_repo, default_branch): 167 | """Test that file_change_rates handles file index/column ambiguity correctly.""" 168 | repo = local_repo["repo"] 169 | 170 | # Get file change rates 171 | change_rates = repo.file_change_rates() 172 | 173 | # Verify file is not both an index and column 174 | assert "file" not in change_rates.index.names, "file should not be in index" 175 | assert "file" in change_rates.columns, "file should be a column" 176 | 177 | # Try merging with another DataFrame to ensure no ambiguity 178 | file_details = repo.file_detail() 179 | 180 | # This merge should work without ambiguity 181 | try: 182 | pd.merge(file_details, change_rates, on="file", how="outer") 183 | except ValueError as e: 184 | pytest.fail(f"Merge failed due to ambiguity: {e}") 185 | -------------------------------------------------------------------------------- /gitpandas/utilities/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: plotting 3 | :platform: Unix, Windows 4 | :synopsis: helper functions for plotting tables from gitpandas 5 | 6 | .. 
moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | try: 12 | import matplotlib.pyplot as plt 13 | import matplotlib.style 14 | import pandas as pd 15 | 16 | matplotlib.style.use("ggplot") 17 | HAS_MPL = True 18 | except ImportError: 19 | HAS_MPL = False 20 | 21 | __author__ = "willmcginnis" 22 | 23 | 24 | def plot_punchcard(df, metric="lines", title="punchcard", by=None): 25 | """ 26 | Uses modified plotting code from https://bitbucket.org/birkenfeld/hgpunchcard 27 | 28 | :param df: punchcard DataFrame with 'hour_of_day' and 'day_of_week' columns, as produced by Repository.punchcard 29 | :param metric: name of the numeric column used to size the plotted points 30 | :param title: title to display above the plot 31 | :param by: optional column name to facet by, one figure per unique value 32 | """ 33 | 34 | if not HAS_MPL: 35 | raise ImportError("Must have matplotlib installed to use the plotting functions") 36 | 37 | # Validate input DataFrame 38 | required_columns = ["hour_of_day", "day_of_week", metric] 39 | if df.empty or not all(col in df.columns for col in required_columns): 40 | raise KeyError(f"DataFrame must contain columns: {required_columns}") 41 | 42 | # Validate data types and ranges 43 | if not pd.api.types.is_numeric_dtype(df[metric]): 44 | raise ValueError(f"Metric column '{metric}' must be numeric") 45 | 46 | if not all(0 <= x <= 6 for x in df["day_of_week"]): 47 | raise ValueError("day_of_week values must be between 0 and 6") 48 | 49 | if not all(0 <= x <= 23 for x in df["hour_of_day"]): 50 | raise ValueError("hour_of_day values must be between 0 and 23") 51 | 52 | # find how many plots we are making 53 | unique_vals = set(df[by].values.tolist()) if by is not None else ["foo"] 54 | for idx, val in enumerate(unique_vals): 55 | sub_df = df[df[by] == val] if by is not None else df 56 | fig = plt.figure(figsize=(8, 3 if title else 2.5), facecolor="#ffffff") 57 | ax = fig.add_subplot(111, facecolor="#ffffff") 58 | fig.subplots_adjust(left=0.06, bottom=0.04, right=0.98, top=0.95) 59 | if by is not None: 60 | ax.set_title(title + f" ({str(val)})", y=0.96).set_color("#333333") 61 | else: 62 | ax.set_title(title, y=0.96).set_color("#333333") 63 | ax.set_frame_on(False) 64 | ax.scatter( 65 | sub_df["hour_of_day"], 66 | sub_df["day_of_week"], 67 | s=sub_df[metric], 68 | c="#333333", 69 | edgecolor="#333333", 70 | ) 71 | for line in ax.get_xticklines() + ax.get_yticklines(): 72 | line.set_alpha(0.0) 73 | dist = -0.8 74 | ax.plot([dist, 23.5], [dist, dist], c="#555555") 75 | ax.plot([dist, dist], [dist, 6.4], c="#555555") 76 | ax.set_xlim(-1, 24) 77 | ax.set_ylim(-0.9, 6.9) 78 | ax.set_yticks(range(7)) 79 | for tx in ax.set_yticklabels(["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"]): 80 | tx.set_color("#555555") 81 | tx.set_size("x-small") 82 | ax.set_xticks(range(24)) 83 | for tx in ax.set_xticklabels([f"{x:02d}" for x in range(24)]): 84 | tx.set_color("#555555") 85 | tx.set_size("x-small") 86 | ax.set_aspect("equal") 87 | if idx + 1 == len(unique_vals): 88 | plt.show(block=True) 89 | else: 90 | plt.show(block=False) 91 | 92 | 93 | def plot_cumulative_blame(df): 94 | """ 95 | Plot cumulative blame information as a stacked area chart.
96 | 97 | Args: 98 | df (pandas.DataFrame): DataFrame with dates as index and committers as columns 99 | 100 | Returns: 101 | matplotlib.figure.Figure: The generated figure 102 | """ 103 | 104 | if not HAS_MPL: 105 | raise ImportError("Must have matplotlib installed to use the plotting functions") 106 | 107 | # Validate input DataFrame 108 | if df.empty: 109 | raise ValueError("DataFrame cannot be empty") 110 | 111 | # Handle NaN values by filling with 0 112 | df = df.fillna(0) 113 | 114 | ax = df.plot(kind="area", stacked=True) 115 | plt.title("Cumulative Blame") 116 | plt.xlabel("date") 117 | plt.ylabel("LOC") 118 | box = ax.get_position() 119 | ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) 120 | ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) 121 | 122 | # Only try to show if not using Agg backend 123 | if plt.get_backend() != "Agg": 124 | plt.show() 125 | 126 | return plt.gcf() 127 | 128 | 129 | def plot_lifeline(changes, ownership_changes, refactoring): 130 | """ 131 | Plot file lifelines with ownership changes and refactoring events. 132 | 133 | Args: 134 | changes (pd.DataFrame): DataFrame containing file change history 135 | ownership_changes (pd.DataFrame): DataFrame containing ownership change events 136 | refactoring (pd.DataFrame): DataFrame containing refactoring events 137 | 138 | Returns: 139 | matplotlib.figure.Figure: The generated plot figure 140 | """ 141 | if not HAS_MPL: 142 | raise ImportError("matplotlib is required for plotting. Please install it first.") 143 | 144 | # Validate input DataFrames 145 | if changes.empty: 146 | raise ValueError("changes DataFrame cannot be empty") 147 | 148 | # Validate that all files in ownership_changes and refactoring exist in changes 149 | if not ownership_changes.empty: 150 | invalid_files = set(ownership_changes.filename) - set(changes.filename) 151 | if invalid_files: 152 | raise ValueError(f"Files in ownership_changes not found in changes: {invalid_files}") 153 | 154 | if not refactoring.empty: 155 | invalid_files = set(refactoring.filename) - set(changes.filename) 156 | if invalid_files: 157 | raise ValueError(f"Files in refactoring not found in changes: {invalid_files}") 158 | 159 | # Create a new figure 160 | fig, ax = plt.subplots(figsize=(12, 6)) 161 | 162 | # Get unique files and sort them 163 | files = changes.filename.unique() 164 | files.sort() 165 | 166 | # Create y-axis positions for each file 167 | file_positions = {file: i for i, file in enumerate(files)} 168 | 169 | # Plot lifelines 170 | for file in files: 171 | file_changes = changes[changes.filename == file] 172 | ax.plot( 173 | file_changes.index, 174 | [file_positions[file]] * len(file_changes), 175 | "-", 176 | label="_nolegend_", 177 | alpha=0.5, 178 | ) 179 | 180 | # Plot ownership changes 181 | if len(ownership_changes) > 0: 182 | ax.scatter( 183 | ownership_changes.index, 184 | [file_positions[f] for f in ownership_changes.filename], 185 | marker="o", 186 | c="red", 187 | label="Ownership Change", 188 | alpha=0.7, 189 | ) 190 | 191 | # Plot refactoring events 192 | if len(refactoring) > 0: 193 | ax.scatter( 194 | refactoring.index, 195 | [file_positions[f] for f in refactoring.filename], 196 | marker="s", 197 | c="blue", 198 | label="Refactoring", 199 | alpha=0.7, 200 | ) 201 | 202 | # Customize the plot 203 | ax.set_yticks(range(len(files))) 204 | ax.set_yticklabels(files) 205 | ax.set_xlabel("Time") 206 | ax.set_ylabel("Files") 207 | ax.set_title("File Lifelines with Ownership Changes and Refactoring Events") 208 | ax.legend() 209 
| ax.grid(True, alpha=0.3) 210 | 211 | # Rotate dates for better readability 212 | plt.xticks(rotation=45) 213 | 214 | # Adjust layout to prevent label cutoff 215 | plt.tight_layout() 216 | 217 | return fig 218 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make <target>' where <target> is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
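# Note: every target below follows the same pattern, one sphinx-build
# invocation per builder. With the variables defined above (BUILDDIR=build,
# empty SPHINXOPTS/PAPER), the html target, for instance, expands to roughly:
#
#   sphinx-build -b html -d build/doctrees source build/html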
63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/git-pandas.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/git-pandas.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/git-pandas" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/git-pandas" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 
151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /tests/test_repository_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from unittest import mock 5 | 6 | import pytest 7 | from git import Repo 8 | 9 | from gitpandas import Repository 10 | from gitpandas.cache import DiskCache 11 | 12 | 13 | @pytest.fixture 14 | def temp_git_repo(default_branch): 15 | """Create a temporary git repository for testing.""" 16 | temp_dir = tempfile.mkdtemp() 17 | 18 | # Initialize git repo 19 | repo = Repo.init(temp_dir) 20 | # Explicitly create and checkout default branch 21 | repo.git.checkout(b=default_branch) 22 | 23 | # Create a test file 24 | test_file_path = os.path.join(temp_dir, "test_file.txt") 25 | with open(test_file_path, "w") as f: 26 | f.write("Initial content") 27 | 28 | # Add and commit 29 | repo.git.add(test_file_path) 30 | repo.git.commit("-m", "Initial commit") 31 | 32 | # Create a second file and commit 33 | test_file2_path = os.path.join(temp_dir, "test_file2.txt") 34 | with open(test_file2_path, "w") as f: 35 | f.write("Second file content") 36 | 37 | # Add and commit 38 | repo.git.add(test_file2_path) 39 | repo.git.commit("-m", "Second commit") 40 | 41 | yield temp_dir 42 | 43 | # Cleanup 44 | shutil.rmtree(temp_dir) 45 | 46 | 47 | @pytest.fixture 48 | def temp_cache_file(): 49 | """Create a temporary cache file.""" 50 | fd, path = tempfile.mkstemp(suffix=".gz") 51 | os.close(fd) 52 | yield path 53 | if os.path.exists(path): 54 | os.unlink(path) 55 | 56 | 57 | class TestRepositoryCache: 58 | """Test caching behavior with the actual Repository class.""" 59 | 60 | def test_repository_list_files_cache(self, temp_git_repo, temp_cache_file, default_branch): 61 | """Test that list_files method properly uses cache.""" 62 | # Create cache and repository 63 | cache = 
DiskCache(filepath=temp_cache_file) 64 | 65 | # Mock the cache _get_entry and set methods to track calls 66 | with ( 67 | mock.patch.object(cache, "_get_entry", wraps=cache._get_entry) as mock_get_entry, 68 | mock.patch.object(cache, "set", wraps=cache.set) as mock_set, 69 | ): 70 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 71 | 72 | # First call - should set cache but not get from it 73 | result1 = repo.list_files() 74 | assert mock_set.call_count > 0, "Cache set should be called" 75 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called (but returns miss)" 76 | mock_set.reset_mock() 77 | mock_get_entry.reset_mock() 78 | 79 | # Second call - should get from cache 80 | result2 = repo.list_files() 81 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called" 82 | assert mock_set.call_count == 0, "Cache set should not be called" 83 | 84 | # Results should be identical 85 | assert result1.equals(result2), "Results should be identical when using cache" 86 | 87 | # Force refresh - should set cache again 88 | mock_set.reset_mock() 89 | mock_get_entry.reset_mock() 90 | result3 = repo.list_files(force_refresh=True) 91 | assert mock_set.call_count > 0, "Cache set should be called with force_refresh" 92 | 93 | # Results should match (unchanged repo) 94 | assert result1.equals(result3), "Results should match even with force_refresh" 95 | 96 | def test_repository_cache_with_different_params(self, temp_git_repo, temp_cache_file, default_branch): 97 | """Test caching with different parameters.""" 98 | cache = DiskCache(filepath=temp_cache_file) 99 | 100 | # Spy on the cache's set method 101 | with mock.patch.object(cache, "set", wraps=cache.set) as mock_set: 102 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 103 | 104 | # Call with default revision (HEAD) 105 | repo.list_files() 106 | first_call_count = mock_set.call_count 107 | assert first_call_count > 0, "Cache set should be called" 108 | mock_set.reset_mock() 109 | 110 | # Call with same parameters - should use cache 111 | repo.list_files() 112 | assert mock_set.call_count == 0, "Cache set should not be called for same parameters" 113 | 114 | # Call with different revision - should set new cache entry 115 | repo.list_files(rev="HEAD~1") 116 | assert mock_set.call_count > 0, "Cache set should be called for different parameters" 117 | 118 | def test_repository_cache_persistence(self, temp_git_repo, temp_cache_file, default_branch): 119 | """Test that cache persists between Repository instances.""" 120 | # First repository 121 | cache1 = DiskCache(filepath=temp_cache_file) 122 | repo1 = Repository(working_dir=temp_git_repo, cache_backend=cache1, default_branch=default_branch) 123 | 124 | # Call method and get result 125 | result1 = repo1.list_files() 126 | 127 | # Create second repository with same cache file 128 | cache2 = DiskCache(filepath=temp_cache_file) 129 | 130 | # Mock the _get_entry method to verify it's called 131 | with ( 132 | mock.patch.object(cache2, "_get_entry", wraps=cache2._get_entry) as mock_get_entry, 133 | mock.patch.object(cache2, "set", wraps=cache2.set) as mock_set, 134 | ): 135 | repo2 = Repository(working_dir=temp_git_repo, cache_backend=cache2, default_branch=default_branch) 136 | 137 | # Call same method - should use cache 138 | result2 = repo2.list_files() 139 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called" 140 | assert mock_set.call_count == 0, "Cache 
set should not be called" 141 | 142 | # Results should match 143 | assert result1.equals(result2), "Results should match between repository instances" 144 | 145 | def test_multiple_repository_methods_cache(self, temp_git_repo, temp_cache_file, default_branch): 146 | """Test caching behavior across different repository methods.""" 147 | cache = DiskCache(filepath=temp_cache_file) 148 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 149 | 150 | # Create a dictionary to store original results 151 | results = {} 152 | 153 | # First calls - should create cache entries 154 | results["commits"] = repo.commit_history() 155 | results["files"] = repo.list_files() 156 | 157 | # Mock the cache _get_entry method to track calls 158 | with mock.patch.object(cache, "_get_entry", wraps=cache._get_entry) as mock_get_entry: 159 | # Second calls - should use cache 160 | commits2 = repo.commit_history() 161 | files2 = repo.list_files() 162 | 163 | # Both should match original results 164 | assert commits2.equals(results["commits"]), "Commit results should match" 165 | assert files2.equals(results["files"]), "File results should match" 166 | 167 | # Verify _get_entry was called for both 168 | assert mock_get_entry.call_count >= 2, "Cache _get_entry should be called for both methods" 169 | 170 | # Force refresh one method 171 | mock_get_entry.reset_mock() 172 | with mock.patch.object(cache, "set", wraps=cache.set) as mock_set: 173 | commits3 = repo.commit_history(force_refresh=True) 174 | assert commits3.equals(results["commits"]), "Results should still match after force refresh" 175 | assert mock_set.call_count > 0, "Cache set should be called for force refresh" 176 | 177 | # Other method should still use cache 178 | mock_get_entry.reset_mock() 179 | files3 = repo.list_files() 180 | assert files3.equals(results["files"]), "Files should still match" 181 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called for unchanged method" 182 | -------------------------------------------------------------------------------- /tests/test_Repository/test_file_detail.py: -------------------------------------------------------------------------------- 1 | import git 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | from gitpandas import Repository 7 | 8 | 9 | @pytest.fixture 10 | def local_repo(tmp_path, default_branch): 11 | """Fixture for a local repository with different file types.""" 12 | # Create a temporary directory 13 | repo_dir = tmp_path / "repository1" 14 | repo_dir.mkdir() 15 | 16 | # Initialize a git repo 17 | grepo = git.Repo.init(str(repo_dir)) 18 | 19 | # Configure git user 20 | grepo.git.config("user.name", "Test User") 21 | grepo.git.config("user.email", "test@example.com") 22 | 23 | # Create and checkout default branch 24 | grepo.git.checkout("-b", default_branch) 25 | 26 | # Add a README file 27 | readme_path = repo_dir / "README.md" 28 | readme_path.write_text("Sample README for a sample project\n") 29 | 30 | # Commit it 31 | grepo.git.add("README.md") 32 | grepo.git.commit(m="first commit") 33 | 34 | # Add some Python files with different content 35 | for idx in range(3): 36 | py_file = repo_dir / f"file_{idx}.py" 37 | py_file.write_text(f"import sys\nimport os\n\ndef function_{idx}():\n return {idx}\n") 38 | 39 | grepo.git.add(all=True) 40 | grepo.git.commit(m=f"adding file_{idx}.py") 41 | 42 | # Add a JavaScript file 43 | js_file = repo_dir / "script.js" 44 | js_file.write_text('function hello() {\n 
console.log("Hello, world!");\n}\n') 45 | 46 | grepo.git.add("script.js") 47 | grepo.git.commit(m="adding script.js") 48 | 49 | # Add a CSS file 50 | css_file = repo_dir / "style.css" 51 | css_file.write_text("body {\n margin: 0;\n padding: 0;\n}\n") 52 | 53 | grepo.git.add("style.css") 54 | grepo.git.commit(m="adding style.css") 55 | 56 | # Create a subdirectory 57 | subdir = repo_dir / "subdir" 58 | subdir.mkdir() 59 | 60 | # Add a file in the subdirectory 61 | subdir_file = subdir / "subfile.py" 62 | subdir_file.write_text('import sys\n\ndef subfunction():\n return "sub"\n') 63 | 64 | grepo.git.add(all=True) 65 | grepo.git.commit(m="adding subdir/subfile.py") 66 | 67 | # Create the Repository object 68 | git_pandas_repo = Repository(working_dir=str(repo_dir), verbose=True, default_branch=default_branch) 69 | 70 | yield git_pandas_repo 71 | 72 | # Cleanup 73 | git_pandas_repo.__del__() 74 | 75 | 76 | class TestFileDetail: 77 | def test_file_detail_basic(self, local_repo): 78 | """Test basic functionality of the file_detail method.""" 79 | file_detail = local_repo.file_detail() 80 | 81 | # Check the shape and columns 82 | assert isinstance(file_detail, pd.DataFrame) 83 | assert file_detail.shape[0] > 0 84 | 85 | # Check that we have the expected columns 86 | expected_columns = ["loc", "file_owner", "ext", "last_edit_date", "repository"] 87 | for col in expected_columns: 88 | assert col in file_detail.columns 89 | 90 | # Check that we have entries for each file type 91 | exts = file_detail["ext"].unique() 92 | assert "md" in exts 93 | assert "py" in exts 94 | assert "js" in exts 95 | assert "css" in exts 96 | 97 | # Check that the LOC counts are correct for different file types 98 | md_loc = file_detail.loc[file_detail["ext"] == "md", "loc"].sum() 99 | py_loc = file_detail.loc[file_detail["ext"] == "py", "loc"].sum() 100 | js_loc = file_detail.loc[file_detail["ext"] == "js", "loc"].sum() 101 | css_loc = file_detail.loc[file_detail["ext"] == "css", "loc"].sum() 102 | 103 | assert md_loc == 1 # README.md has 1 line 104 | assert py_loc == 19 # 3 Python files with 5 lines each + 1 with 4 lines 105 | assert js_loc == 3 # script.js has 3 lines 106 | assert css_loc == 4 # style.css has 4 lines 107 | 108 | def test_file_detail_with_globs(self, local_repo): 109 | """Test the ignore_globs and include_globs parameters.""" 110 | # Get file detail for all files 111 | file_detail_all = local_repo.file_detail() 112 | 113 | # Get file detail ignoring Python files 114 | file_detail_no_py = local_repo.file_detail(ignore_globs=["*.py"]) 115 | 116 | # Check that we have fewer files in the filtered file detail 117 | assert file_detail_no_py.shape[0] < file_detail_all.shape[0] 118 | 119 | # Check that no Python files are included 120 | assert "py" not in file_detail_no_py["ext"].values 121 | 122 | # Get file detail including only Python files 123 | file_detail_only_py = local_repo.file_detail(include_globs=["*.py"]) 124 | 125 | # Check that we have fewer files than the full file detail 126 | assert file_detail_only_py.shape[0] < file_detail_all.shape[0] 127 | 128 | # Check that only Python files are included 129 | for ext in file_detail_only_py["ext"].values: 130 | assert ext == "py" 131 | 132 | # Check that the sum of the filtered file details equals the total 133 | assert file_detail_no_py.shape[0] + file_detail_only_py.shape[0] == file_detail_all.shape[0] 134 | 135 | def test_file_detail_with_rev(self, local_repo, default_branch): 136 | """Test the rev parameter of the file_detail method.""" 137 | # Get file 
detail for the current revision 138 | file_detail_head = local_repo.file_detail(rev="HEAD") 139 | 140 | # Get file detail for the first commit 141 | # This should only include the README.md file 142 | first_commit = local_repo.revs(branch=default_branch).iloc[-1]["rev"] 143 | file_detail_first = local_repo.file_detail(rev=first_commit) 144 | 145 | # Check that we have fewer files in the first commit 146 | assert file_detail_first.shape[0] < file_detail_head.shape[0] 147 | 148 | # Check that only md extension is included in the first commit 149 | assert file_detail_first.shape[0] == 1 150 | assert "md" in file_detail_first["ext"].values 151 | 152 | def test_file_detail_committer(self, local_repo): 153 | """Test the committer parameter of the file_detail method.""" 154 | # Get file detail with committer=True 155 | file_detail_committer = local_repo.file_detail(committer=True) 156 | 157 | # Check that we have the file_owner column 158 | assert "file_owner" in file_detail_committer.columns 159 | 160 | # Check that all file owners are 'Test User' 161 | for owner in file_detail_committer["file_owner"]: 162 | if owner is not None: # Check if owner is not None before asserting 163 | assert isinstance(owner, str) 164 | assert owner == "Test User" 165 | 166 | # Get file detail with committer=False 167 | file_detail_no_committer = local_repo.file_detail(committer=False) 168 | 169 | # Check that we have the file_owner column 170 | assert "file_owner" in file_detail_no_committer.columns 171 | 172 | # Check that all file owners are 'Test User' 173 | for owner in file_detail_no_committer["file_owner"]: 174 | if owner is not None: # Check if owner is not None before asserting 175 | assert isinstance(owner, str) 176 | assert owner == "Test User" 177 | 178 | def test_file_detail_last_edit(self, local_repo): 179 | """Test that the last_edit_date column contains valid timestamps.""" 180 | file_detail = local_repo.file_detail() 181 | 182 | # Check that all last_edit_date values are datetime64 objects 183 | for last_edit in file_detail["last_edit_date"].values: 184 | assert isinstance(last_edit, np.datetime64 | pd.Timestamp) 185 | 186 | # Check that the last file added has the most recent timestamp 187 | # Find all Python files; the most recently added one is subdir/subfile.py 188 | py_file_indices = [idx for idx, ext in enumerate(file_detail["ext"].values) if ext == "py"] 189 | last_edit_dates = [file_detail["last_edit_date"].values[idx] for idx in py_file_indices] 190 | last_edit = max(last_edit_dates) 191 | 192 | # All other files should have earlier or equal timestamps 193 | for edit_time in file_detail["last_edit_date"].values: 194 | assert edit_time <= last_edit 195 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo.
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\git-pandas.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\git-pandas.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 
244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /.cursor/rules/project_overview.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Overview of the gitpandas project 3 | globs: 4 | alwaysApply: false 5 | --- 6 | I'll do a thorough analysis of the gitpandas library to help new developers understand it. Let me start by exploring the codebase to get a comprehensive understanding. 7 | 8 | Ran tool 9 | Let's first look at the README to get an overview of the project: 10 | 11 | Read file: README.md 12 | Now let's look at the main module structure: 13 | 14 | Ran tool 15 | Let's check the __init__.py file to understand the main components: 16 | 17 | Read file: gitpandas/__init__.py 18 | Now let's look at the repository.py file, which seems to be one of the main components: 19 | 20 | Read file: gitpandas/repository.py 21 | Let's also look at the project.py file to understand the ProjectDirectory class: 22 | 23 | Read file: gitpandas/project.py 24 | Let's also check the cache.py file to understand the caching mechanism: 25 | 26 | Read file: gitpandas/cache.py 27 | Let's check the utilities directory to understand what helper functions are available: 28 | 29 | Ran tool 30 | Let's look at the utilities/__init__.py file: 31 | 32 | Read file: gitpandas/utilities/__init__.py 33 | Let's look at the plotting utilities: 34 | 35 | Read file: gitpandas/utilities/plotting.py 36 | Let's check the examples directory to see how the library is used: 37 | 38 | Ran tool 39 | Let's look at a couple of example files to understand how the library is used: 40 | 41 | Read file: examples/commit_history.py 42 | 43 | Read file: examples/bus_analysis.py 44 | 45 | Read file: examples/cumulative_blame.py 46 | Let's check the pyproject.toml file to understand the project dependencies and metadata: 47 | 48 | Read file: pyproject.toml 49 | Based on my thorough analysis of the gitpandas library, here's a comprehensive overview for new developers: 50 | 51 | # GitPandas: A Comprehensive Overview 52 | 53 | ## What is GitPandas? 54 | 55 | GitPandas is a Python library that provides a set of wrappers around GitPython for creating pandas DataFrames from git repository data. It enables data-driven analysis of git repositories, allowing developers and data scientists to extract meaningful insights from their codebase's history and structure. 56 | 57 | ## Core Components 58 | 59 | GitPandas is built around three primary objects: 60 | 61 | 1. **Repository**: Represents a single git repository and provides methods to analyze it. 62 | 2. **ProjectDirectory**: Represents a directory containing multiple git repositories, allowing for aggregated analysis across all repositories. 63 | 3. **GitHubProfile**: A specialized ProjectDirectory that analyzes all repositories owned by a GitHub user.
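For quick orientation, here is a minimal sketch of how each of the three entry points is constructed (the paths and username are placeholders); each object is covered in more depth under Usage Patterns below:

```python
from gitpandas import Repository, ProjectDirectory, GitHubProfile

# Analyze a single git repository on disk
repo = Repository(working_dir='/path/to/repo')

# Aggregate analysis across every repository found under a directory
project = ProjectDirectory(working_dir='/path/to/repos')

# Aggregate analysis across all repositories owned by a GitHub user
profile = GitHubProfile(username='username')
```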
64 | 65 | ## Key Features 66 | 67 | ### Repository Analysis 68 | - **Commit History**: Extract detailed commit history with filtering by file extensions and directories 69 | - **File Change History**: Track how files have changed over time 70 | - **Blame Analysis**: Determine who wrote each line of code in the repository 71 | - **Branch and Tag Information**: Extract metadata about branches and tags 72 | - **Bus Factor Analysis**: Estimate the project's "bus factor" (risk if key contributors leave) 73 | - **Cumulative Blame**: Track how code ownership changes over time 74 | - **Punchcard Analysis**: Visualize when code is written (day/hour patterns) 75 | - **Hours Estimation**: Estimate time spent on the project by contributors 76 | 77 | ### Project Directory Analysis 78 | - Aggregate all the above analyses across multiple repositories 79 | - Compare metrics across repositories 80 | - Generate project-wide statistics and visualizations 81 | 82 | ### Visualization Utilities 83 | - Plot cumulative blame over time 84 | - Create punchcard visualizations of commit patterns 85 | 86 | ### Caching System 87 | - In-memory caching via `EphemeralCache` 88 | - Redis-based caching via `RedisDFCache` 89 | - Decorator-based caching system for expensive operations 90 | 91 | ## Core Concepts 92 | 93 | ### Data Representation 94 | All data in GitPandas is represented as pandas DataFrames, making it easy to: 95 | - Filter and transform the data 96 | - Visualize results 97 | - Export to various formats 98 | - Integrate with other data analysis workflows 99 | 100 | ### Filtering Mechanisms 101 | GitPandas provides consistent filtering options across most methods: 102 | - `ignore_globs`: Glob patterns for files to exclude (e.g., `['tests/*', 'docs/*']`) 103 | - `include_globs`: Glob patterns for files to include (e.g., `['*.py', '*.js']`) 104 | - `limit`: Limit the number of commits to analyze 105 | - `days`: Limit analysis to commits within a certain number of days 106 | - `branch`: Specify which branch to analyze 107 | 108 | ### Caching 109 | GitPandas implements a flexible caching system to improve performance: 110 | - The `multicache` decorator allows caching results of expensive operations 111 | - Cache backends can be swapped (memory or Redis) 112 | - Cache keys are generated based on repository name and method parameters 113 | 114 | ## Usage Patterns 115 | 116 | ### Single Repository Analysis 117 | ```python 118 | from gitpandas import Repository 119 | 120 | # Create a Repository object 121 | repo = Repository(working_dir='/path/to/repo') 122 | 123 | # Get commit history 124 | commit_history = repo.commit_history(branch='master', include_globs=['*.py']) 125 | 126 | # Analyze blame 127 | blame = repo.blame(committer=True) 128 | 129 | # Estimate bus factor 130 | bus_factor = repo.bus_factor() 131 | ``` 132 | 133 | ### Multi-Repository Analysis 134 | ```python 135 | from gitpandas import ProjectDirectory 136 | 137 | # Create a ProjectDirectory object 138 | project = ProjectDirectory(working_dir='/path/to/project') 139 | 140 | # Get aggregated commit history 141 | commit_history = project.commit_history(branch='master') 142 | 143 | # Get aggregated blame 144 | blame = project.blame() 145 | ``` 146 | 147 | ### GitHub Profile Analysis 148 | ```python 149 | from gitpandas import GitHubProfile 150 | 151 | # Analyze a GitHub user's repositories 152 | profile = GitHubProfile(username='username') 153 | 154 | # Get repository information 155 | repo_info = profile.repo_information() 156 | ``` 157 | 158 | ### 
Visualization 159 | ```python 160 | from gitpandas import Repository 161 | from gitpandas.utilities.plotting import plot_cumulative_blame 162 | 163 | repo = Repository(working_dir='/path/to/repo') 164 | blame = repo.cumulative_blame(branch='master') 165 | plot_cumulative_blame(blame) 166 | ``` 167 | 168 | ## Technical Implementation Details 169 | 170 | ### Git Integration 171 | - Uses GitPython under the hood to interact with git repositories 172 | - Can work with local repositories or clone remote ones 173 | - Handles bare repositories appropriately 174 | 175 | ### Performance Considerations 176 | - Caching system for expensive operations 177 | - Optional parallel processing for certain operations (requires joblib) 178 | - Filtering mechanisms to limit analysis scope 179 | 180 | ### Data Processing 181 | - Heavy use of pandas for data manipulation and aggregation 182 | - Consistent DataFrame structures across different analyses 183 | - Time series analysis capabilities 184 | 185 | ## Development Conventions 186 | 187 | ### Code Organization 188 | - Core classes in separate modules (`repository.py`, `project.py`) 189 | - Utility functions in the `utilities` package 190 | - Caching mechanisms in `cache.py` 191 | 192 | ### Error Handling 193 | - Graceful handling of missing repositories 194 | - Proper cleanup of temporary repositories 195 | - Appropriate warnings for deprecated functionality 196 | 197 | ### Dependencies 198 | - Core dependencies: GitPython, pandas, numpy 199 | - Optional dependencies: matplotlib (for plotting), redis (for Redis caching), joblib (for parallel processing) 200 | 201 | ## Practical Applications 202 | 203 | GitPandas can be used for: 204 | 1. **Project Management**: Track contributor activity and estimate effort 205 | 2. **Code Quality Analysis**: Identify areas with high churn or single-owner code 206 | 3. **Team Dynamics**: Understand how team members collaborate and contribute 207 | 4. **Historical Analysis**: Track how a project evolves over time 208 | 5. **Risk Assessment**: Identify bus factor and knowledge silos 209 | 6. **Performance Metrics**: Generate developer and team performance metrics 210 | 211 | ## Extending GitPandas 212 | 213 | Developers can extend GitPandas by: 214 | 1. Adding new analysis methods to the Repository or ProjectDirectory classes 215 | 2. Creating new visualization utilities 216 | 3. Implementing additional caching backends 217 | 4. Building higher-level abstractions for specific use cases 218 | 219 | ## Best Practices 220 | 221 | When working with GitPandas: 222 | 1. Use appropriate filtering to limit analysis scope for large repositories 223 | 2. Implement caching for repeated analyses 224 | 3. Consider using parallel processing for intensive operations 225 | 4. Combine multiple analyses for comprehensive insights 226 | 5. Visualize results for easier interpretation 227 | 228 | This overview should provide new developers with a solid understanding of GitPandas, its capabilities, and how to effectively use it for git repository analysis. 
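As a closing illustration, a short sketch tying several of these best practices together, combining scope filtering with an in-memory cache (the path and globs are placeholders, and `master` is assumed to be the branch under analysis):

```python
from gitpandas import Repository
from gitpandas.cache import EphemeralCache

# Reuse one cache instance so repeated analyses hit cached results
cache = EphemeralCache(max_keys=500)
repo = Repository(working_dir='/path/to/repo', cache_backend=cache)

# Limit scope up front: Python sources only, last 90 days of history
history = repo.commit_history(branch='master', days=90, include_globs=['*.py'])

# Blame restricted to the same file set
blame = repo.blame(include_globs=['*.py'])
```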
229 | -------------------------------------------------------------------------------- /tests/test_Project/test_properties.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pytest 3 | 4 | from gitpandas import ProjectDirectory 5 | 6 | __author__ = "willmcginnis" 7 | 8 | 9 | @pytest.fixture 10 | def remote_project(): 11 | """Fixture for a remote project directory.""" 12 | project = ProjectDirectory(working_dir=["https://github.com/wdm0006/git-pandas.git"], verbose=True) 13 | yield project 14 | project.__del__() 15 | 16 | 17 | @pytest.fixture 18 | def local_project(tmp_path): 19 | """Fixture for a local project directory with multiple repositories.""" 20 | # Create a temporary directory for the project 21 | project_dir = tmp_path / "repos" 22 | project_dir.mkdir() 23 | 24 | # Create repository directories 25 | repo1_dir = project_dir / "repository1" 26 | repo2_dir = project_dir / "repository2" 27 | repo1_dir.mkdir() 28 | repo2_dir.mkdir() 29 | 30 | # Initialize git repos 31 | grepo1 = git.Repo.init(str(repo1_dir)) 32 | grepo2 = git.Repo.init(str(repo2_dir)) 33 | 34 | # Configure git user 35 | grepo1.git.config("user.name", "Test User") 36 | grepo1.git.config("user.email", "test@example.com") 37 | grepo2.git.config("user.name", "Test User") 38 | grepo2.git.config("user.email", "test@example.com") 39 | 40 | # Rename master to main 41 | grepo1.git.branch("-M", "main") 42 | grepo2.git.branch("-M", "main") 43 | 44 | # Add README files 45 | with open(f"{repo1_dir}/README.md", "w") as f: 46 | f.write("Sample README for a sample python project\n") 47 | 48 | with open(f"{repo2_dir}/README.md", "w") as f: 49 | f.write("Sample README for a sample js project\n") 50 | 51 | # Commit them 52 | grepo1.git.add("README.md") 53 | grepo1.git.commit(m="first commit") 54 | 55 | grepo2.git.add("README.md") 56 | grepo2.git.commit(m="first commit") 57 | 58 | # Add Python files to repo1 59 | for idx in range(5): # Increased from 3 to 5 files 60 | with open(f"{repo1_dir}/file_{idx}.py", "w") as f: 61 | f.write("import sys\nimport os\n") 62 | 63 | grepo1.git.add(all=True) 64 | grepo1.git.commit(m=f"adding file_{idx}.py") 65 | 66 | # Add JS files to repo2 67 | for idx in range(5): # Increased from 3 to 5 files 68 | with open(f"{repo2_dir}/file_{idx}.js", "w") as f: 69 | f.write('document.write("hello world!");\n') 70 | 71 | grepo2.git.add(all=True) 72 | grepo2.git.commit(m=f"adding file_{idx}.js") 73 | 74 | # Create ProjectDirectory objects 75 | projectd_1 = ProjectDirectory(working_dir=[str(repo1_dir), str(repo2_dir)], verbose=True) 76 | projectd_2 = ProjectDirectory(working_dir=str(project_dir), verbose=True) 77 | 78 | yield {"projectd_1": projectd_1, "projectd_2": projectd_2} 79 | 80 | # Cleanup 81 | projectd_1.__del__() 82 | projectd_2.__del__() 83 | 84 | 85 | # Remote project tests 86 | class TestRemoteProperties: 87 | @pytest.mark.remote 88 | def test_repo_name(self, remote_project): 89 | assert "git-pandas" in list(remote_project.repo_name()["repository"].values) 90 | 91 | @pytest.mark.remote 92 | def test_branches(self, remote_project): 93 | branches = list(remote_project.branches()["branch"].values) 94 | assert "master" in branches 95 | assert "gh-pages" in branches 96 | 97 | @pytest.mark.remote 98 | def test_tags(self, remote_project): 99 | tags = list(remote_project.tags()["tag"].values) 100 | assert "0.0.1" in tags 101 | assert "0.0.2" in tags 102 | 103 | @pytest.mark.remote 104 | def test_is_bare(self, remote_project): 105 | for x in 
remote_project.is_bare()["is_bare"].values: 106 | assert not x 107 | 108 | 109 | # Local project tests 110 | class TestLocalProperties: 111 | def test_repo_name(self, local_project): 112 | projectd_1 = local_project["projectd_1"] 113 | projectd_2 = local_project["projectd_2"] 114 | 115 | assert "repository1" in list(projectd_1.repo_name()["repository"].values) 116 | assert "repository2" in list(projectd_1.repo_name()["repository"].values) 117 | assert "repository1" in list(projectd_2.repo_name()["repository"].values) 118 | assert "repository2" in list(projectd_2.repo_name()["repository"].values) 119 | 120 | def test_branches(self, local_project): 121 | projectd_1 = local_project["projectd_1"] 122 | projectd_2 = local_project["projectd_2"] 123 | 124 | branches = list(projectd_1.branches()["branch"].values) 125 | assert "main" in branches 126 | 127 | branches = list(projectd_2.branches()["branch"].values) 128 | assert "main" in branches 129 | 130 | def test_tags(self, local_project): 131 | projectd_1 = local_project["projectd_1"] 132 | projectd_2 = local_project["projectd_2"] 133 | 134 | tags = projectd_1.tags() 135 | assert len(tags) == 0 136 | 137 | tags = projectd_2.tags() 138 | assert len(tags) == 0 139 | 140 | def test_is_bare(self, local_project): 141 | projectd_1 = local_project["projectd_1"] 142 | projectd_2 = local_project["projectd_2"] 143 | 144 | bares = projectd_1.is_bare()["is_bare"].values 145 | for bare in bares: 146 | assert not bare 147 | 148 | bares = projectd_2.is_bare()["is_bare"].values 149 | for bare in bares: 150 | assert not bare 151 | 152 | def test_commit_history(self, local_project): 153 | projectd_1 = local_project["projectd_1"] 154 | 155 | ch = projectd_1.commit_history(branch="main") 156 | assert ch.shape[0] == 12 157 | 158 | ch2 = projectd_1.commit_history(branch="main", ignore_globs=["*.[!p][!y]"]) 159 | assert ch2.shape[0] == 5 160 | 161 | ch3 = projectd_1.commit_history(branch="main", limit=4) 162 | assert ch3.shape[0] == 4 163 | 164 | ch4 = projectd_1.commit_history(branch="main", days=5) 165 | assert ch4.shape[0] == 12 166 | 167 | def test_file_change_history(self, local_project): 168 | projectd_1 = local_project["projectd_1"] 169 | 170 | fch = projectd_1.file_change_history(branch="main") 171 | assert fch.shape[0] == 12 # 2 READMEs + 5 py files + 5 js files 172 | 173 | fch2 = projectd_1.file_change_history(branch="main", ignore_globs=["*.[!p][!y]"]) 174 | assert fch2.shape[0] == 5 # 5 py files 175 | 176 | fch4 = projectd_1.file_change_history(branch="main", ignore_globs=["*.[!j][!s]"]) 177 | assert fch4.shape[0] == 5 # 5 js files 178 | 179 | fch3 = projectd_1.file_change_history(branch="main", limit=4) 180 | assert fch3.shape[0] == 4 181 | 182 | def test_file_change_rates(self, local_project): 183 | projectd_1 = local_project["projectd_1"] 184 | 185 | fcr = projectd_1.file_change_rates(branch="main") 186 | assert fcr.shape[0] == 12 187 | assert "unique_committers" in fcr.columns 188 | assert "abs_rate_of_change" in fcr.columns 189 | assert "net_rate_of_change" in fcr.columns 190 | assert "net_change" in fcr.columns 191 | assert "abs_change" in fcr.columns 192 | assert "edit_rate" in fcr.columns 193 | assert "file" in fcr.columns 194 | assert "lines" in fcr.columns 195 | assert "repository" in fcr.columns 196 | # Check that data is populated 197 | assert fcr["unique_committers"].sum() > 0 198 | 199 | def test_has_coverage(self, local_project): 200 | projectd_1 = local_project["projectd_1"] 201 | 202 | # We know this repo doesn't have coverage 203 | 
coverages = projectd_1.has_coverage()["has_coverage"].values 204 | for coverage in coverages: 205 | assert not coverage 206 | 207 | def test_bus_factor(self, local_project): 208 | projectd_1 = local_project["projectd_1"] 209 | 210 | # We know this repo only has one committer 211 | bf = projectd_1.bus_factor(by="projectd") 212 | assert bf["bus factor"].values[0] == 1 213 | 214 | def test_blame(self, local_project): 215 | projectd_1 = local_project["projectd_1"] 216 | 217 | blame = projectd_1.blame(ignore_globs=["*.[!p][!y]"]) 218 | assert blame["loc"].sum() == 10 # 5 files * 2 lines each 219 | assert blame.shape[0] == 1 220 | 221 | def test_cumulative_blame(self, local_project): 222 | projectd_1 = local_project["projectd_1"] 223 | 224 | cblame = projectd_1.cumulative_blame(by="committer", branch="main") 225 | assert cblame.shape[0] > 0 # Just check that we have some rows 226 | assert cblame[cblame.columns.values[0]].sum() > 0 # Check that we have some blame data 227 | 228 | def test_revs(self, local_project): 229 | projectd_1 = local_project["projectd_1"] 230 | 231 | revs = projectd_1.revs(branch="main", num_datapoints=2) 232 | assert revs.shape[0] == 2 233 | 234 | revs = projectd_1.revs(branch="main", limit=2) 235 | assert revs.shape[0] == 2 236 | 237 | revs = projectd_1.revs(branch="main") 238 | assert revs.shape[0] == 12 # 2 READMEs + 5 py files + 5 js files 239 | -------------------------------------------------------------------------------- /docs/source/cache.rst: -------------------------------------------------------------------------------- 1 | Caching 2 | ======= 3 | 4 | Git-Pandas supports pluggable cache backends to optimize performance for expensive, repetitive operations. This is particularly useful for large repositories or when running multiple analyses. 5 | 6 | Overview 7 | -------- 8 | 9 | The caching system provides: 10 | * In-memory caching for temporary results 11 | * Disk-based caching for persistent storage across sessions 12 | * Redis-based caching for distributed storage 13 | * Cache management and invalidation methods 14 | * Decorator-based caching for expensive operations 15 | * **Cache timestamp tracking** - know when cache entries were populated 16 | * **Cache statistics and monitoring** - track cache performance and usage 17 | 18 | Available Cache Backends 19 | ------------------------ 20 | 21 | In-Memory Cache (EphemeralCache) 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | The default in-memory cache is ephemeral and will be cleared when the process ends: 25 | 26 | .. code-block:: python 27 | 28 | from gitpandas import Repository 29 | from gitpandas.cache import EphemeralCache 30 | 31 | # Create an in-memory cache with default settings 32 | cache = EphemeralCache() 33 | 34 | # Or customize the cache size 35 | cache = EphemeralCache(max_keys=500) 36 | 37 | # Use the cache with a repository 38 | repo = Repository('/path/to/repo', cache_backend=cache) 39 | 40 | Disk Cache (DiskCache) 41 | ~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | For persistent caching that survives between sessions: 44 | 45 | .. code-block:: python 46 | 47 | from gitpandas import Repository 48 | from gitpandas.cache import DiskCache 49 | 50 | # Create a disk cache 51 | cache = DiskCache(filepath='/path/to/cache.gz', max_keys=1000) 52 | 53 | # Use the cache with a repository 54 | repo = Repository('/path/to/repo', cache_backend=cache) 55 | 56 | Redis Cache (RedisDFCache) 57 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 58 | 59 | For persistent caching across sessions, use Redis: 60 | 61 | .. 
code-block:: python 62 | 63 | from gitpandas import Repository 64 | from gitpandas.cache import RedisDFCache 65 | 66 | # Create a Redis cache with default settings 67 | cache = RedisDFCache() 68 | 69 | # Or customize Redis connection and cache settings 70 | cache = RedisDFCache( 71 | host='localhost', 72 | port=6379, 73 | db=12, 74 | max_keys=1000, 75 | ttl=3600 # Cache entries expire after 1 hour 76 | ) 77 | 78 | # Use the cache with a repository 79 | repo = Repository('/path/to/repo', cache_backend=cache) 80 | 81 | Cache Timestamp Information 82 | --------------------------- 83 | 84 | All cache backends now track when cache entries were populated. You can access this information 85 | without any changes to the Repository or ProjectDirectory API: 86 | 87 | .. code-block:: python 88 | 89 | from gitpandas import Repository 90 | from gitpandas.cache import EphemeralCache 91 | 92 | # Create repository with cache 93 | cache = EphemeralCache() 94 | repo = Repository('/path/to/repo', cache_backend=cache) 95 | 96 | # Populate cache with some operations 97 | commit_history = repo.commit_history(limit=10) 98 | file_list = repo.list_files() 99 | 100 | # Check what's in the cache and when it was cached 101 | cached_keys = cache.list_cached_keys() 102 | for entry in cached_keys: 103 | print(f"Key: {entry['key']}") 104 | print(f"Cached at: {entry['cached_at']}") 105 | print(f"Age: {entry['age_seconds']:.1f} seconds") 106 | 107 | # Get specific cache information 108 | key = "commit_history_main_10_None_None_None_None" 109 | info = cache.get_cache_info(key) 110 | if info: 111 | print(f"Cache entry age: {info['age_minutes']:.2f} minutes") 112 | 113 | Cache Information Methods 114 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 115 | 116 | All cache backends support these methods for accessing timestamp information: 117 | 118 | * ``list_cached_keys()`` - Returns list of all cached keys with metadata 119 | * ``get_cache_info(key)`` - Returns detailed information about a specific cache entry 120 | 121 | The returned information includes: 122 | 123 | * ``cached_at`` - UTC timestamp when the entry was cached 124 | * ``age_seconds`` - Age of the cache entry in seconds 125 | * ``age_minutes`` - Age of the cache entry in minutes 126 | * ``age_hours`` - Age of the cache entry in hours 127 | * ``cache_key`` - The original cache key 128 | 129 | Using the Cache Decorator 130 | ------------------------- 131 | 132 | The `@multicache` decorator can be used to cache method results: 133 | 134 | .. 
code-block:: python 135 | 136 | from gitpandas.cache import multicache 137 | 138 | @multicache( 139 | key_prefix="method_name", 140 | key_list=["param1", "param2"], 141 | skip_if=lambda x: x.get("param1") is None 142 | ) 143 | def expensive_method(self, param1, param2): 144 | # Method implementation 145 | pass 146 | 147 | Configuration 148 | ------------- 149 | 150 | Cache backends can be configured with various parameters: 151 | 152 | EphemeralCache: 153 | * ``max_keys``: Maximum number of keys to store in memory (default: 1000) 154 | 155 | DiskCache: 156 | * ``filepath``: Path to the cache file (required) 157 | * ``max_keys``: Maximum number of keys to store (default: 1000) 158 | 159 | RedisDFCache: 160 | * ``host``: Redis host (default: 'localhost') 161 | * ``port``: Redis port (default: 6379) 162 | * ``db``: Redis database number (default: 12) 163 | * ``max_keys``: Maximum number of keys to store (default: 1000) 164 | * ``ttl``: Time-to-live in seconds for cache entries (default: None, no expiration) 165 | * Additional keyword arguments are passed to redis.StrictRedis 166 | 167 | Backward Compatibility 168 | ---------------------- 169 | 170 | The cache timestamp functionality is fully backward compatible: 171 | 172 | * Existing cache files will continue to work 173 | * Old cache entries without timestamps will be automatically converted 174 | * No changes to Repository or ProjectDirectory APIs 175 | * All existing code continues to work unchanged 176 | 177 | Best Practices 178 | -------------- 179 | 180 | Shared Cache Usage 181 | ~~~~~~~~~~~~~~~~~~ 182 | 183 | .. warning:: 184 | **Recommendation: Use Separate Cache Instances** 185 | 186 | While it's technically possible to share the same cache object across multiple Repository instances, 187 | we **strongly recommend using separate cache instances** for each repository for the following reasons: 188 | 189 | **Recommended Approach - Separate Caches:** 190 | 191 | .. code-block:: python 192 | 193 | from gitpandas import Repository 194 | from gitpandas.cache import DiskCache 195 | 196 | # Create separate cache instances for each repository 197 | cache1 = DiskCache(filepath='repo1_cache.gz') 198 | cache2 = DiskCache(filepath='repo2_cache.gz') 199 | 200 | repo1 = Repository('/path/to/repo1', cache_backend=cache1) 201 | repo2 = Repository('/path/to/repo2', cache_backend=cache2) 202 | 203 | **Benefits of Separate Caches:** 204 | 205 | * **Complete Isolation**: No risk of cache eviction conflicts between repositories 206 | * **Predictable Memory Usage**: Each repository has its own memory budget 207 | * **Easier Debugging**: Cache issues are isolated to specific repositories 208 | * **Better Performance**: No lock contention in multi-threaded scenarios 209 | * **Clear Cache Management**: You can clear or manage each repository's cache independently 210 | 211 | **If You Must Share Caches:** 212 | 213 | If you need to share a cache object across multiple repositories (e.g., for memory constraints), 214 | the system is designed to handle this safely: 215 | 216 | .. 
code-block:: python 217 | 218 | from gitpandas import Repository 219 | from gitpandas.cache import EphemeralCache 220 | 221 | # Shared cache (not recommended but supported) 222 | shared_cache = EphemeralCache(max_keys=1000) 223 | 224 | repo1 = Repository('/path/to/repo1', cache_backend=shared_cache) 225 | repo2 = Repository('/path/to/repo2', cache_backend=shared_cache) 226 | 227 | # Each repository gets separate cache entries 228 | files1 = repo1.list_files() # Creates cache key: list_files||repo1||None 229 | files2 = repo2.list_files() # Creates cache key: list_files||repo2||None 230 | 231 | **Shared Cache Considerations:** 232 | 233 | * Repository names are included in cache keys to prevent collisions 234 | * Cache eviction affects all repositories sharing the cache 235 | * Memory usage is shared across all repositories 236 | * Very active repositories may evict cache entries from less active ones 237 | 238 | Cache Size Planning 239 | ~~~~~~~~~~~~~~~~~~~ 240 | 241 | When planning cache sizes, consider: 242 | 243 | * **Repository Size**: Larger repositories generate more cache entries 244 | * **Operation Types**: Some operations (like ``cumulative_blame``) create many cache entries 245 | * **Memory Constraints**: Balance cache size with available system memory 246 | * **Analysis Patterns**: Frequently repeated analyses benefit from larger caches 247 | 248 | **Recommended Cache Sizes:** 249 | 250 | .. code-block:: python 251 | 252 | # Small repositories (< 1000 commits) 253 | cache = EphemeralCache(max_keys=100) 254 | 255 | # Medium repositories (1000-10000 commits) 256 | cache = EphemeralCache(max_keys=500) 257 | 258 | # Large repositories (> 10000 commits) 259 | cache = EphemeralCache(max_keys=1000) 260 | 261 | # For disk/Redis caches, you can use larger sizes 262 | cache = DiskCache(filepath='cache.gz', max_keys=5000) 263 | 264 | API Reference 265 | ------------- 266 | 267 | .. currentmodule:: gitpandas.cache 268 | 269 | .. autoclass:: EphemeralCache 270 | :members: 271 | :undoc-members: 272 | :show-inheritance: 273 | :inherited-members: 274 | :special-members: __init__ 275 | 276 | .. autoclass:: DiskCache 277 | :members: 278 | :undoc-members: 279 | :show-inheritance: 280 | :inherited-members: 281 | :special-members: __init__ 282 | 283 | .. autoclass:: RedisDFCache 284 | :members: 285 | :undoc-members: 286 | :show-inheritance: 287 | :inherited-members: 288 | :special-members: __init__ 289 | 290 | .. autofunction:: multicache 291 | 292 | .. 
autoclass:: CacheEntry 293 | :members: 294 | :undoc-members: 295 | :show-inheritance: 296 | :special-members: __init__ 297 | -------------------------------------------------------------------------------- /tests/test_Repository/test_bus_factor.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def multi_committer_repo(tmp_path, default_branch): 10 | """Fixture for a local repository with multiple committers.""" 11 | # Create a temporary directory 12 | repo_dir = tmp_path / "repository1" 13 | repo_dir.mkdir() 14 | 15 | # Initialize a git repo 16 | grepo = git.Repo.init(str(repo_dir)) 17 | 18 | # Add a README file with first committer 19 | readme_path = repo_dir / "README.md" 20 | readme_path.write_text("Sample README for a sample project\n") 21 | 22 | # Commit it with first committer 23 | grepo.git.config("user.name", "User One") 24 | grepo.git.config("user.email", "user1@example.com") 25 | grepo.git.add("README.md") 26 | grepo.git.commit(m="first commit") 27 | 28 | # Add Python files with different committers 29 | committers = [ 30 | ("User One", "user1@example.com"), 31 | ("User Two", "user2@example.com"), 32 | ("User Three", "user3@example.com"), 33 | ] 34 | 35 | # Each committer adds 3 files 36 | for committer_idx, (name, email) in enumerate(committers): 37 | grepo.git.config("user.name", name) 38 | grepo.git.config("user.email", email) 39 | 40 | for file_idx in range(3): 41 | py_file = repo_dir / f"file_{committer_idx}_{file_idx}.py" 42 | py_file.write_text( 43 | f"import sys\nimport os\n\ndef function_{committer_idx}_{file_idx}():\n return {committer_idx * 10 + file_idx}\n" # noqa: E501 44 | ) 45 | 46 | grepo.git.add(all=True) 47 | grepo.git.commit(m=f"adding file_{committer_idx}_{file_idx}.py") 48 | 49 | # Create a shared file that all committers contribute to 50 | shared_file = repo_dir / "shared.py" 51 | 52 | # First committer creates the file 53 | grepo.git.config("user.name", committers[0][0]) 54 | grepo.git.config("user.email", committers[0][1]) 55 | shared_file.write_text("import sys\nimport os\n\n# Shared file\n") 56 | grepo.git.add("shared.py") 57 | grepo.git.commit(m="creating shared file") 58 | 59 | # Second committer adds to the file 60 | grepo.git.config("user.name", committers[1][0]) 61 | grepo.git.config("user.email", committers[1][1]) 62 | with open(shared_file, "a") as f: 63 | f.write('\ndef shared_function_1():\n return "shared1"\n') 64 | grepo.git.add("shared.py") 65 | grepo.git.commit(m="adding to shared file") 66 | 67 | # Third committer adds to the file 68 | grepo.git.config("user.name", committers[2][0]) 69 | grepo.git.config("user.email", committers[2][1]) 70 | with open(shared_file, "a") as f: 71 | f.write('\ndef shared_function_2():\n return "shared2"\n') 72 | grepo.git.add("shared.py") 73 | grepo.git.commit(m="adding more to shared file") 74 | 75 | # Create the Repository object 76 | git_pandas_repo = Repository(working_dir=str(repo_dir), default_branch=default_branch, verbose=True) 77 | 78 | yield git_pandas_repo 79 | 80 | # Cleanup 81 | git_pandas_repo.__del__() 82 | 83 | 84 | class TestBusFactor: 85 | def test_bus_factor_by_repository(self, multi_committer_repo, default_branch): 86 | """Test the bus_factor method with by='repository'.""" 87 | bus_factor = multi_committer_repo.bus_factor(by="repository") 88 | 89 | # Check the shape and columns 90 | assert isinstance(bus_factor, pd.DataFrame) 91 
| assert bus_factor.shape[0] == 1 92 | 93 | # Check that we have the expected columns 94 | expected_columns = ["repository", "bus factor"] 95 | for col in expected_columns: 96 | assert col in bus_factor.columns 97 | 98 | # With 3 committers, the bus factor should be at least 1 and at most 3 99 | assert 1 <= bus_factor["bus factor"].values[0] <= 3 100 | 101 | # Since each committer has contributed roughly equally, the bus factor should be close to 3 102 | # But we'll just check it's at least 1 to be safe 103 | assert bus_factor["bus factor"].values[0] >= 1 104 | 105 | def test_bus_factor_with_globs(self, multi_committer_repo, default_branch): 106 | """Test the ignore_globs and include_globs parameters.""" 107 | # Get bus factor for all files 108 | bus_factor_all = multi_committer_repo.bus_factor(by="repository") 109 | 110 | # Get bus factor ignoring files from the first committer 111 | bus_factor_no_user1 = multi_committer_repo.bus_factor(by="repository", ignore_globs=["file_0_*.py"]) 112 | 113 | # Get bus factor including only files from the first committer 114 | bus_factor_only_user1 = multi_committer_repo.bus_factor(by="repository", include_globs=["file_0_*.py"]) 115 | 116 | # The bus factor should be different when we filter files 117 | assert bus_factor_no_user1["bus factor"].values[0] <= bus_factor_all["bus factor"].values[0] 118 | assert bus_factor_only_user1["bus factor"].values[0] <= bus_factor_all["bus factor"].values[0] 119 | 120 | def test_bus_factor_calculation(self, multi_committer_repo, default_branch): 121 | """Test the bus factor calculation logic.""" 122 | # Get the blame data to understand the distribution of contributions 123 | blame = multi_committer_repo.blame(by="repository") 124 | 125 | # Calculate the bus factor manually for the repository 126 | committer_loc = blame.groupby("committer")["loc"].sum() 127 | total_loc = committer_loc.sum() 128 | 129 | # Sort committers by LOC in descending order 130 | sorted_committers = committer_loc.sort_values(ascending=False) 131 | 132 | # Calculate cumulative percentage 133 | cumulative_pct = sorted_committers.cumsum() / total_loc 134 | 135 | # Find the number of committers needed to reach 50% 136 | # The implementation counts the number of committers needed to reach >= 50% 137 | manual_bus_factor = (cumulative_pct < 0.5).sum() + 1 138 | 139 | # Get the bus factor from the method 140 | bus_factor = multi_committer_repo.bus_factor(by="repository")["bus factor"].values[0] 141 | 142 | # The calculated bus factor should match our manual calculation 143 | assert bus_factor == manual_bus_factor 144 | 145 | def test_bus_factor_by_file(self, multi_committer_repo, default_branch): 146 | """Test the bus_factor method with by='file'.""" 147 | bus_factor_df = multi_committer_repo.bus_factor(by="file") 148 | 149 | # Check the shape and columns 150 | assert isinstance(bus_factor_df, pd.DataFrame) 151 | assert bus_factor_df.shape[0] > 0 # Should have at least one file 152 | 153 | # Check that we have the expected columns 154 | expected_columns = ["file", "bus factor", "repository"] 155 | for col in expected_columns: 156 | assert col in bus_factor_df.columns 157 | 158 | # All bus factors should be at least 1 (minimum one contributor per file) 159 | assert (bus_factor_df["bus factor"] >= 1).all() 160 | 161 | # Bus factors should be reasonable (not exceed total number of committers) 162 | max_committers = 3 # We created 3 committers in the fixture 163 | assert (bus_factor_df["bus factor"] <= max_committers).all() 164 | 165 | # Repository column 
should be consistent 166 | assert len(bus_factor_df["repository"].unique()) == 1 167 | 168 | # Check that we have results for the expected files 169 | file_list = bus_factor_df["file"].tolist() 170 | 171 | # We should have Python files from our test fixture 172 | python_files = [f for f in file_list if f.endswith(".py")] 173 | assert len(python_files) > 0, "Should have Python files in results" 174 | 175 | def test_bus_factor_by_file_with_globs(self, multi_committer_repo, default_branch): 176 | """Test the file-wise bus factor with glob filtering.""" 177 | # Get bus factor for all files 178 | bus_factor_all = multi_committer_repo.bus_factor(by="file") 179 | 180 | # Get bus factor for only Python files 181 | bus_factor_py = multi_committer_repo.bus_factor(by="file", include_globs=["*.py"]) 182 | 183 | # Get bus factor excluding Python files 184 | bus_factor_no_py = multi_committer_repo.bus_factor(by="file", ignore_globs=["*.py"]) 185 | 186 | # Python-only results should be a subset of all results 187 | assert len(bus_factor_py) <= len(bus_factor_all) 188 | 189 | # All files in Python-only results should end with .py 190 | if not bus_factor_py.empty: 191 | assert bus_factor_py["file"].str.endswith(".py").all() 192 | 193 | # No files in no-Python results should end with .py 194 | if not bus_factor_no_py.empty: 195 | assert not bus_factor_no_py["file"].str.endswith(".py").any() 196 | 197 | def test_bus_factor_by_file_single_committer_files(self, multi_committer_repo, default_branch): 198 | """Test file-wise bus factor for files with single committers.""" 199 | # Get all file-wise bus factors 200 | bus_factor_df = multi_committer_repo.bus_factor(by="file") 201 | 202 | # Filter for files that have only one committer (bus factor of 1) 203 | single_committer_files = bus_factor_df[bus_factor_df["bus factor"] == 1] 204 | 205 | # Should have some single-committer files from our fixture 206 | # (each committer created their own files) 207 | assert len(single_committer_files) > 0 208 | 209 | # Verify the bus factor calculation for a single-committer file 210 | if len(single_committer_files) > 0: 211 | sample_file = single_committer_files.iloc[0]["file"] 212 | 213 | # Get blame data for this specific file 214 | blame = multi_committer_repo.blame(by="file") 215 | if isinstance(blame.index, pd.MultiIndex): 216 | blame = blame.reset_index() 217 | 218 | file_blame = blame[blame["file"] == sample_file] 219 | unique_committers = ( 220 | file_blame["committer"].nunique() 221 | if "committer" in file_blame.columns 222 | else file_blame["author"].nunique() 223 | ) 224 | 225 | # A file with bus factor 1 should have contributions that are >=50% from one person 226 | # But due to rounding and the nature of our test data, we'll just verify it's reasonable 227 | assert unique_committers >= 1 228 | --------------------------------------------------------------------------------
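The repository fixtures above request a `default_branch` fixture that is defined elsewhere in the test suite (a shared conftest not shown in this excerpt). A minimal sketch of what such a fixture could look like, offered as a hypothetical that may differ from the real definition:

```python
# tests/conftest.py (hypothetical sketch; the real fixture may differ)
import pytest


@pytest.fixture
def default_branch():
    """Branch name the repository fixtures use when creating test repos."""
    return "main"
```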