├── docs ├── source │ ├── CNAME │ ├── contributors.rst │ ├── index.rst │ ├── project.rst │ ├── repository.rst │ ├── usecases.rst │ └── cache.rst ├── requirements.txt ├── AUDIT_REPORT.md ├── Makefile └── make.bat ├── examples ├── __init__.py ├── img │ ├── lifeline.png │ ├── githubblame.png │ ├── cumulative_blame.png │ ├── cumulative_project_blame.png │ └── stravalib_cumulative_blame.png ├── definitions.py ├── file_change_rates.py ├── cloud_repo.py ├── project_blame.py ├── repo_file_detail.py ├── punchcard.py ├── bus_analysis.py ├── cumulative_blame.py ├── attributes.py ├── hours_estimate.py ├── parallel_blame.py ├── lifeline.py ├── commit_history.py ├── release_analytics.py └── cache_timestamps.py ├── img ├── lifeline.png ├── punchcard.png └── cumulative_blame.png ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── ISSUE_TEMPLATE.md └── workflows │ ├── test-docs-build.yml │ ├── docs.yml │ ├── test-suite.yml │ └── pypi-publish.yml ├── tests ├── __init__.py ├── test_Repository │ ├── __init__.py │ ├── test_properties.py │ ├── test_punchcard.py │ ├── test_file_operations.py │ ├── test_file_detail.py │ └── test_bus_factor.py ├── test_Project │ ├── __init__.py │ └── test_properties.py ├── test_utilities │ ├── test_init.py │ └── test_check_api.py ├── test_examples.py ├── conftest.py ├── test_cache_key_consistency.py └── test_repository_cache.py ├── gitpandas ├── __init__.py ├── utilities │ ├── __init__.py │ ├── check_api.py │ └── plotting.py └── logging.py ├── .gitignore ├── mcp_server └── README.md ├── LICENSE.md ├── .cursor └── rules │ ├── testing.mdc │ ├── pytest_standards.mdc │ ├── python_standards.mdc │ ├── sphinx_docs_standards.mdc │ └── project_overview.mdc ├── CONTRIBUTING.md ├── pyproject.toml ├── Makefile └── CHANGELOG.md /docs/source/CNAME: -------------------------------------------------------------------------------- 1 | gitpandas.mcginniscommawill.com -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "willmcginnis" 2 | -------------------------------------------------------------------------------- /img/lifeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/lifeline.png -------------------------------------------------------------------------------- /img/punchcard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/punchcard.png -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Fixes # 2 | 3 | ## Proposed Changes 4 | 5 | - 6 | - 7 | - -------------------------------------------------------------------------------- /img/cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/img/cumulative_blame.png -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for git-pandas. 
3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /examples/img/lifeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/lifeline.png -------------------------------------------------------------------------------- /examples/img/githubblame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/githubblame.png -------------------------------------------------------------------------------- /examples/definitions.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | GIT_PANDAS_DIR = Path(__file__).resolve().parent.parent 4 | -------------------------------------------------------------------------------- /examples/img/cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/cumulative_blame.png -------------------------------------------------------------------------------- /tests/test_Repository/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the Repository class. 3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /tests/test_Project/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the ProjectDirectory class. 3 | """ 4 | 5 | __author__ = "willmcginnis" 6 | -------------------------------------------------------------------------------- /examples/img/cumulative_project_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/cumulative_project_blame.png -------------------------------------------------------------------------------- /examples/img/stravalib_cumulative_blame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wdm0006/git-pandas/HEAD/examples/img/stravalib_cumulative_blame.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0.0 2 | sphinx-rtd-theme>=1.0.0 3 | sphinx-rtd-dark-mode>=1.0.0 4 | sphinxcontrib-googleanalytics>=0.3 5 | gitpython>=1.0.0 6 | numpy>=1.9.0 7 | pandas>=2.0.0 8 | requests 9 | redis -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Expected Behavior 2 | 3 | 4 | ## Actual Behavior 5 | 6 | 7 | ## Steps to Reproduce the Problem 8 | 9 | 1. 10 | 1. 11 | 1. 
12 | 13 | ## Specifications 14 | 15 | - Version: 16 | - Platform: 17 | - Subsystem: -------------------------------------------------------------------------------- /tests/test_utilities/test_init.py: -------------------------------------------------------------------------------- 1 | import gitpandas 2 | 3 | 4 | def test_version(): 5 | """Test that the version is defined.""" 6 | assert isinstance(gitpandas.__version__, str) 7 | assert len(gitpandas.__version__.split(".")) >= 2 # Should have at least major.minor version 8 | -------------------------------------------------------------------------------- /examples/file_change_rates.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import Repository 4 | 5 | __author__ = "willmcginnis" 6 | 7 | 8 | if __name__ == "__main__": 9 | repo = Repository(working_dir=GIT_PANDAS_DIR) 10 | fc = repo.file_change_rates(include_globs=["*.py"], coverage=True) 11 | print(fc) 12 | -------------------------------------------------------------------------------- /examples/cloud_repo.py: -------------------------------------------------------------------------------- 1 | from gitpandas import Repository 2 | 3 | __author__ = "willmcginnis" 4 | 5 | 6 | if __name__ == "__main__": 7 | repo = Repository(working_dir="https://github.com/CamDavidsonPilon/lifelines.git", verbose=True) 8 | shared_blame = repo.blame(include_globs=["*.py"], committer=False, by="file") 9 | 10 | print(shared_blame) 11 | -------------------------------------------------------------------------------- /examples/project_blame.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import ProjectDirectory 4 | 5 | __author__ = "willmcginnis" 6 | 7 | if __name__ == "__main__": 8 | g = ProjectDirectory(working_dir=GIT_PANDAS_DIR) 9 | 10 | b = g.blame(include_globs=["*.py"], ignore_globs=["lib/*", "docs/*"], by="file") 11 | print(b.head(5)) 12 | -------------------------------------------------------------------------------- /examples/repo_file_detail.py: -------------------------------------------------------------------------------- 1 | from definitions import GIT_PANDAS_DIR 2 | 3 | from gitpandas import ProjectDirectory 4 | 5 | __author__ = "willmcginnis" 6 | 7 | if __name__ == "__main__": 8 | g = ProjectDirectory(working_dir=GIT_PANDAS_DIR) 9 | 10 | b = g.file_detail(include_globs=["*.py"], ignore_globs=["lib/*", "docs/*"]) 11 | print(b.head(25)) 12 | -------------------------------------------------------------------------------- /gitpandas/__init__.py: -------------------------------------------------------------------------------- 1 | from gitpandas.project import GitHubProfile, ProjectDirectory 2 | from gitpandas.repository import Repository 3 | 4 | try: 5 | from importlib.metadata import version 6 | 7 | __version__ = version("git-pandas") 8 | except ImportError: 9 | # Fallback for Python < 3.8 10 | from importlib_metadata import version 11 | 12 | __version__ = version("git-pandas") 13 | 14 | __author__ = "willmcginnis" 15 | 16 | __all__ = ["Repository", "ProjectDirectory", "GitHubProfile"] 17 | -------------------------------------------------------------------------------- /gitpandas/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
module:: utilities 3 | :platform: Unix, Windows 4 | :synopsis: Helper methods for plotting or otherwise manipulating output from gitpandas objects 5 | 6 | .. moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | __author__ = "willmcginnis" 12 | # Version is now managed centrally in gitpandas.__init__ 13 | 14 | try: 15 | import joblib # noqa: F401 16 | 17 | _has_joblib = True 18 | except ImportError: 19 | _has_joblib = False 20 | -------------------------------------------------------------------------------- /examples/punchcard.py: -------------------------------------------------------------------------------- 1 | """ """ 2 | 3 | import sys 4 | 5 | import matplotlib 6 | 7 | matplotlib.use("Agg") # Set the backend to Agg before importing pyplot 8 | 9 | from definitions import GIT_PANDAS_DIR 10 | 11 | from gitpandas import ProjectDirectory 12 | from gitpandas.utilities.plotting import plot_punchcard 13 | 14 | g = ProjectDirectory(working_dir=[str(GIT_PANDAS_DIR)], verbose=True) 15 | 16 | by = None 17 | punchcard = g.punchcard(include_globs=["*.py"], by=by, normalize=2500) 18 | 19 | if punchcard.empty: 20 | print("No commit data available for punchcard analysis.") 21 | sys.exit(0) 22 | 23 | plot_punchcard(punchcard, metric="lines", title="punchcard", by=by) 24 | -------------------------------------------------------------------------------- /.github/workflows/test-docs-build.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Docs Check" 2 | on: 3 | - pull_request 4 | 5 | jobs: 6 | docs: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v1 11 | 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | 17 | - name: Install uv 18 | run: | 19 | curl -LsSf https://astral.sh/uv/install.sh | sh 20 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 21 | 22 | - name: Install Dependencies 23 | run: | 24 | uv pip install --system -e ".[dev]" 25 | 26 | - name: Build Docs 27 | uses: ammaraskar/sphinx-action@master 28 | with: 29 | docs-folder: "docs/" -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: "Master Docs Publication" 2 | on: 3 | push: 4 | branches: [ master ] 5 | 6 | jobs: 7 | docs: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Clone 11 | uses: actions/checkout@v1 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | 18 | - name: Install uv 19 | run: | 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | echo "$HOME/.cargo/bin" >> $GITHUB_PATH 22 | 23 | - name: Install Dependencies 24 | run: | 25 | uv pip install --system -e ".[dev]" 26 | 27 | - name: Build Docs 28 | uses: ammaraskar/sphinx-action@master 29 | with: 30 | docs-folder: "./docs/" 31 | 32 | - name: Deploy Docs 33 | uses: peaceiris/actions-gh-pages@v3 34 | with: 35 | github_token: ${{ secrets.GITHUB_TOKEN }} 36 | publish_dir: "./docs/build/html/" -------------------------------------------------------------------------------- /gitpandas/utilities/check_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | A little utility to figure out all of the parameter names in git-pandas, and make sure we aren't mixing up language 3 | in different functions. 
4 | 
5 | """
6 | 
7 | import inspect
8 | 
9 | __author__ = "willmcginnis"
10 | 
11 | 
12 | def extract_objects(m, classes=True, functions=False):
13 |     # add in the classes at this level
14 |     out = {}
15 |     if classes:
16 |         m_dict = {k: v for k, v in m.__dict__.items() if inspect.isclass(v)}
17 |         out.update(m_dict)
18 |     if functions:
19 |         m_dict = {k: v for k, v in m.__dict__.items() if inspect.isfunction(v)}
20 |         out.update(m_dict)
21 | 
22 |     return out
23 | 
24 | 
25 | def parse_docstring(ds):
26 |     ds = [x.strip() for x in ds.split("\n")]
27 |     ds = [x.split(":") for x in ds if x.startswith(":param")]
28 |     ds = [{x[1].replace("param", "").strip(): x[2].strip()} for x in ds]
29 |     return ds
30 | 
31 | 
32 | def get_distinct_params(m):
33 |     out = set()
34 |     for k in m:
35 |         out.update(m[k]["args"])
36 |     return out
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     print("Development utilities for analyzing the git-pandas API")
41 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | 
3 | # tests
4 | # these should be torn down after every test, but if a test is interrupted,
5 | # they won't be until the next test
6 | tests/**/repos
7 | uv.lock
8 | # scratch stuff
9 | scratch
10 | 
11 | ### Python template
12 | # Byte-compiled / optimized / DLL files
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 | .idea
17 | .idea/
18 | .DS_Store
19 | # C extensions
20 | *.so
21 | gitnoc_desktop/.venv/
22 | # Distribution / packaging
23 | .Python
24 | env/
25 | build/
26 | develop-eggs/
27 | dist/
28 | downloads/
29 | eggs/
30 | .eggs/
31 | lib/
32 | lib64/
33 | parts/
34 | sdist/
35 | var/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | 
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 | 
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 | 
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *,cover
59 | 
60 | # Translations
61 | *.mo
62 | *.pot
63 | 
64 | # Django stuff:
65 | *.log
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | 
74 | 
-------------------------------------------------------------------------------- /mcp_server/README.md: --------------------------------------------------------------------------------
1 | # GitPandas MCP Server
2 | 
3 | This directory contains an experimental Model Context Protocol (MCP) server that acts as a wrapper around the `git-pandas` Python library. It allows MCP-compatible clients (like AI assistants or IDE extensions) to interact with git repository data using structured tools.
4 | 
5 | ## Prerequisites
6 | 
7 | * Python 3.8+
8 | * `uv` (Python package installer and virtual environment manager). See [uv installation guide](https://github.com/astral-sh/uv#installation). You can often install it via pip: `pip install uv`.
9 | 
10 | ## Setup and Installation (using uv)
11 | 
12 | 1. **Clone the Repository:** If you haven't already, clone the main `git-pandas` repository.
13 | 2. **Install dependencies:** from the repository root, install `git-pandas` and its development extras into a `uv`-managed environment, for example: `uv pip install -e ".[dev]"`.
14 | 3. **Construct the run command:** the server process is started with `uv run` pointed at this directory, e.g. `uv run --cwd /absolute/path/to/git-pandas/mcp_server python server.py`.
15 | 
16 | ## Connecting with Claude Desktop (or similar clients)
17 | 
18 | 1. Open Claude Desktop's settings or preferences.
19 | 2. Look for a section related to "Tools", "MCP Servers", "External Tools", or similar.
20 | 3. 
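Add a new server configuration. For clients that read a JSON configuration file (Claude Desktop uses `claude_desktop_config.json`), a minimal entry might look like the following sketch (the `git-pandas` server name is arbitrary and the path is a placeholder for your local checkout):

   ```json
   {
     "mcpServers": {
       "git-pandas": {
         "command": "uv",
         "args": ["run", "--cwd", "/absolute/path/to/git-pandas/mcp_server", "python", "server.py"]
       }
     }
   }
   ```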
21 | 4. When prompted for the server command or path, paste the **full `uv run` command** you constructed in the setup section above (e.g., `uv run --cwd /absolute/path/to/git-pandas/mcp_server python server.py`).
22 | 5. Save the configuration.
23 | 
24 | Claude Desktop should now be able to launch this server process using `uv` when needed and communicate with it to use the defined tools.
-------------------------------------------------------------------------------- /LICENSE.md: --------------------------------------------------------------------------------
1 | Copyright (c) 2025, Will McGinnis
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 | 
6 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | 
8 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9 | 
10 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-------------------------------------------------------------------------------- /.github/workflows/test-suite.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: "Test Suite and Linting" 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | test: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | with: 20 | fetch-depth: 0 21 | 22 | - name: Set up Python 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install uv 28 | run: | 29 | curl -LsSf https://astral.sh/uv/install.sh | sh 30 | 31 | - name: Add uv to PATH 32 | run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH 33 | 34 | - name: Check uv version 35 | run: uv --version 36 | 37 | - name: Install dependencies 38 | run: | 39 | uv pip install --system -e ".[dev,datasets]" 40 | 41 | - name: Configure Git 42 | run: | 43 | git config --global user.email "github-actions@github.com" 44 | git config --global user.name "GitHub Actions" 45 | 46 | - name: Run tests 47 | run: | 48 | MPLBACKEND=Agg uv run pytest -v -m "not slow" 49 | 50 | - name: Run linting 51 | run: | 52 | uv run ruff check . 53 | -------------------------------------------------------------------------------- /examples/bus_analysis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing the "bus factor" of a repository. 3 | 4 | The bus factor is a measure of risk based on how concentrated the codebase knowledge is 5 | among contributors. A low bus factor (e.g. 1-2) indicates high risk as knowledge is 6 | concentrated among few contributors. 7 | 8 | This example demonstrates: 9 | 1. Creating a repository instance 10 | 2. Analyzing commit history with limits 11 | 3. Calculating bus factor 12 | 4. 
Viewing contributor statistics 13 | """ 14 | 15 | import time 16 | 17 | from gitpandas import Repository 18 | 19 | __author__ = "willmcginnis" 20 | 21 | 22 | if __name__ == "__main__": 23 | # Use a smaller repository for faster analysis 24 | # This is a small Python package that's good for examples 25 | repo = Repository(working_dir="https://github.com/wdm0006/cookiecutter-pipproject.git") 26 | 27 | print("Analyzing repository...") 28 | start_time = time.time() 29 | 30 | # Get commit history with a reasonable limit 31 | print("\nGetting commit history (limited to last 100 commits)...") 32 | ch = repo.commit_history("master", limit=100, include_globs=["*.py"]) 33 | 34 | # Calculate unique committers 35 | committers = set(ch["committer"].values) 36 | print(f"\nFound {len(committers)} unique committers:") 37 | for committer in sorted(committers): 38 | print(f" - {committer}") 39 | 40 | # Calculate bus factor 41 | print("\nCalculating bus factor...") 42 | bus_factor = repo.bus_factor(include_globs=["*.py"]) 43 | print("\nBus factor analysis:") 44 | print(bus_factor) 45 | 46 | end_time = time.time() 47 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 48 | -------------------------------------------------------------------------------- /.cursor/rules/testing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: how to use the makefile to run tests 3 | globs: 4 | --- 5 | # Gitpandas Testing Standards 6 | 7 | ## Running Tests 8 | 9 | ### Using Make Commands 10 | - Always use `make test` to run the test suite 11 | - Use `make test-cov` to run tests with coverage reports 12 | - Use `make test-all` to run tests on all supported Python versions using tox 13 | 14 | ### Test Commands 15 | ```bash 16 | # Run the standard test suite 17 | make test 18 | 19 | # Run tests with coverage 20 | make test-cov 21 | 22 | # Run tests on all supported Python versions 23 | make test-all 24 | 25 | # Run a specific test file 26 | make test PYTEST_ARGS="tests/test_unified_interface.py" 27 | 28 | # Run a specific test class 29 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface" 30 | 31 | # Run a specific test method 32 | make test PYTEST_ARGS="tests/test_unified_interface.py::TestUnifiedInterface::test_base_methods_elo" 33 | ``` 34 | 35 | ### Test Environment 36 | - Tests are run using `uv run pytest` through the Makefile 37 | - The test environment is automatically set up with the correct dependencies 38 | - Always run tests in a clean environment to avoid dependency conflicts 39 | 40 | ### Continuous Integration 41 | - All tests must pass in CI before merging 42 | - Coverage should not decrease with new code 43 | - New features should include corresponding tests 44 | 45 | ### Benchmarks 46 | - Use `make benchmark` to run performance benchmarks 47 | - Benchmark results are stored in the `benchmark_results` directory 48 | - Performance regressions should be addressed before merging 49 | 50 | ### Linting and Formatting 51 | - Use `make lint` to check code quality 52 | - Use `make lint-fix` to automatically fix linting issues 53 | - Use `make format` to format code according to project standards -------------------------------------------------------------------------------- /examples/cumulative_blame.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of visualizing cumulative blame information. 3 | 4 | This example demonstrates: 5 | 1. 
Creating a project directory instance 6 | 2. Analyzing blame information across multiple repositories 7 | 3. Visualizing the results using matplotlib 8 | """ 9 | 10 | import os 11 | import time 12 | 13 | from gitpandas import ProjectDirectory 14 | from gitpandas.utilities.plotting import plot_cumulative_blame 15 | 16 | __author__ = "willmcginnis" 17 | 18 | 19 | if __name__ == "__main__": 20 | print("Initializing project directory...") 21 | start_time = time.time() 22 | 23 | # Use pygeohash repository - a good size for examples 24 | g = ProjectDirectory(working_dir=["https://github.com/wdm0006/pygeohash.git"]) 25 | 26 | print("\nAnalyzing blame information...") 27 | print("Using a limit of 20 commits and skipping every 2nd commit for faster analysis") 28 | 29 | try: 30 | blame = g.cumulative_blame( 31 | branch="master", # Use master instead of main 32 | include_globs=["*.py"], # Focus on Python files only 33 | by="committer", 34 | limit=20, # Limit to 20 commits 35 | skip=2, # Skip every other commit 36 | ) 37 | 38 | print("\nGenerating visualization...") 39 | # Create the plot and save it 40 | fig = plot_cumulative_blame(blame) 41 | output_path = os.path.join("img", "cumulative_blame.png") 42 | fig.savefig(output_path) 43 | print(f"Plot saved to {output_path}") 44 | 45 | except Exception as e: 46 | print(f"\nError during analysis: {str(e)}") 47 | print("This might happen if the repository is not accessible or if there are no commits to analyze.") 48 | 49 | end_time = time.time() 50 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 51 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: "PyPI Packaging" 2 | on: 3 | release: 4 | types: created 5 | 6 | jobs: 7 | build_and_publish: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | # IMPORTANT: this permission is mandatory for Trusted Publishing 11 | id-token: write 12 | steps: 13 | - name: Clone 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.11' 20 | 21 | - name: Install uv 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install uv 25 | 26 | - name: Setup virtual environment 27 | run: | 28 | uv venv 29 | echo "VIRTUAL_ENV=$(pwd)/.venv" >> $GITHUB_ENV 30 | echo "$(pwd)/.venv/bin" >> $GITHUB_PATH 31 | 32 | - name: Install dependencies 33 | run: | 34 | uv pip install -e ".[dev]" 35 | uv pip install build 36 | 37 | - name: Build source distribution 38 | run: | 39 | # Clean the build directories first 40 | rm -rf build/ 41 | rm -rf dist/ 42 | rm -rf .eggs/ 43 | 44 | # Find and remove egg-info directories, excluding the virtual environment 45 | find . -path ./.venv -prune -o -name '*.egg-info' -type d -exec rm -rf {} \; 2>/dev/null || true 46 | find . -path ./.venv -prune -o -name '*.egg' -type f -exec rm -f {} \; 2>/dev/null || true 47 | find . -path ./.venv -prune -o -name '__pycache__' -type d -exec rm -rf {} \; 2>/dev/null || true 48 | 49 | # Build only the source distribution 50 | python -m build --sdist 51 | 52 | - name: Publish package to PyPI 53 | uses: pypa/gh-action-pypi-publish@release/v1 54 | with: 55 | packages-dir: dist/ -------------------------------------------------------------------------------- /examples/attributes.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. 
module:: attributes.py 3 | :platform: Unix, Windows 4 | :synopsis: An example showing basic attribute access to gitpandas objects. 5 | 6 | .. moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | from pandas import set_option 12 | 13 | from gitpandas.project import ProjectDirectory 14 | from gitpandas.repository import Repository 15 | 16 | __author__ = "willmcginnis" 17 | 18 | 19 | def repository(): 20 | # build an example repository object then check the attributes 21 | r = Repository("https://github.com/wdm0006/git-pandas.git") 22 | print("\nRepository Name") 23 | print(r.repo_name) 24 | print("\nRepository Branches:") 25 | print(r.branches()) 26 | print("\nRepository Tags:") 27 | print(r.tags()) 28 | print("\nRepository Revisions:") 29 | print(r.revs()) 30 | print("\nRepository Blame:") 31 | print(r.blame(include_globs=["*.py"])) 32 | print("\nRepository Is Bare:") 33 | print(r.is_bare()) 34 | 35 | 36 | def project(): 37 | # build an example project directory object then check the attributes 38 | p = ProjectDirectory( 39 | [ 40 | "https://github.com/wdm0006/git-pandas.git", 41 | "https://github.com/CamDavidsonPilon/lifelines.git", 42 | ] 43 | ) 44 | print("\nProject Directory Name") 45 | print(p.repo_name()) 46 | print("\nProject Directory Branches:") 47 | print(p.branches()) 48 | print("\nProject Directory Tags:") 49 | print(p.tags()) 50 | print("\nProject Directory Revisions:") 51 | print(p.revs()) 52 | print("\nProject Directory Blame:") 53 | print(p.blame()) 54 | print("\nProject Directory Is Bare:") 55 | print(p.is_bare()) 56 | 57 | 58 | if __name__ == "__main__": 59 | set_option("display.max_rows", 500) 60 | set_option("display.max_columns", 500) 61 | set_option("display.width", 1000) 62 | repository() 63 | project() 64 | -------------------------------------------------------------------------------- /examples/hours_estimate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of estimating development hours from commit history. 3 | 4 | This example demonstrates: 5 | 1. Creating a repository instance 6 | 2. Analyzing commit history 7 | 3. Estimating development hours based on commit patterns 8 | 4. 
Visualizing the results 9 | """ 10 | 11 | import time 12 | 13 | from gitpandas import Repository 14 | 15 | __author__ = "willmcginnis" 16 | 17 | 18 | if __name__ == "__main__": 19 | print("Initializing repository...") 20 | start_time = time.time() 21 | 22 | # Use pygeohash repository - a good size for examples 23 | repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git") 24 | 25 | print("\nAnalyzing commit history...") 26 | print("Using a limit of 20 commits for faster analysis") 27 | 28 | # Get commit history with limits 29 | commits = repo.commit_history( 30 | branch="master", # Use master instead of main 31 | limit=20, # Limit to 20 commits 32 | include_globs=["*.py"], # Focus on Python files only 33 | ) 34 | 35 | print("\nEstimating development hours...") 36 | # Group commits by day and estimate hours 37 | daily_hours = commits.groupby(commits.index.date).agg({"lines": "sum", "insertions": "sum", "deletions": "sum"}) 38 | 39 | # Estimate hours based on commit patterns 40 | # Assuming average of 10 lines per hour of development 41 | daily_hours["estimated_hours"] = daily_hours["lines"] / 10 42 | 43 | print("\nResults:") 44 | print("\nDaily Development Hours:") 45 | print(daily_hours["estimated_hours"].round(2)) 46 | 47 | print("\nSummary Statistics:") 48 | print(f"Total commits analyzed: {len(commits)}") 49 | print(f"Total days with commits: {len(daily_hours)}") 50 | print(f"Total estimated hours: {daily_hours['estimated_hours'].sum():.2f}") 51 | print(f"Average hours per day: {daily_hours['estimated_hours'].mean():.2f}") 52 | 53 | end_time = time.time() 54 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 55 | -------------------------------------------------------------------------------- /.cursor/rules/pytest_standards.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: tests/* 4 | --- 5 | # Pytest Standards 6 | 7 | ## Test Organization 8 | - All test files should be named with the prefix `test_` (e.g., `test_module.py`) 9 | - Test classes should be named with the prefix `Test` (e.g., `TestClassName`) 10 | - Test methods should be named with the prefix `test_` (e.g., `test_function_name`) 11 | - Tests should be organized in the same directory structure as the source code they test 12 | 13 | ## Test Structure 14 | - Each test should focus on testing a single functionality 15 | - Use descriptive test names that explain what is being tested 16 | - Group related tests in test classes 17 | - Use appropriate fixtures for test setup and teardown 18 | - Avoid test interdependence - tests should be able to run in any order 19 | 20 | ## Assertions 21 | - Use pytest's built-in assertions instead of Python's `assert` statement when possible 22 | - Use appropriate assertion methods for the type being tested (e.g., `assert_almost_equal` for floats) 23 | - Include descriptive error messages in assertions to aid debugging 24 | 25 | ## Fixtures 26 | - Use fixtures for common setup and teardown operations 27 | - Define fixtures at the appropriate scope (function, class, module, or session) 28 | - Use parameterized fixtures for testing multiple inputs 29 | - Use conftest.py for sharing fixtures across multiple test files 30 | 31 | ## Coverage 32 | - Aim for at least 80% code coverage 33 | - Write tests for both success and failure cases 34 | - Test edge cases and boundary conditions 35 | - Use `pytest-cov` to generate coverage reports 36 | 37 | ## Best Practices 38 | - Keep tests fast and 
independent 39 | - Avoid mocking unless necessary 40 | - Use markers to categorize tests (e.g., `@pytest.mark.slow`) 41 | - Use parametrize for testing multiple inputs 42 | - Write tests before or alongside code (TDD approach) 43 | - Run the full test suite before committing changes 44 | 45 | ## Commands 46 | - Run tests with `pytest` 47 | - Generate coverage reports with `pytest --cov=gitpandas` 48 | - Run specific tests with `pytest path/to/test_file.py::TestClass::test_method` -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests to verify that all example scripts run without errors. 3 | """ 4 | 5 | import subprocess 6 | import sys 7 | from pathlib import Path 8 | 9 | import pytest 10 | 11 | # Get the examples directory 12 | EXAMPLES_DIR = Path(__file__).parent.parent / "examples" 13 | 14 | # List of example scripts to test 15 | EXAMPLE_SCRIPTS = [ 16 | "attributes.py", 17 | "bus_analysis.py", 18 | "cloud_repo.py", 19 | "commit_history.py", 20 | "cumulative_blame.py", 21 | "definitions.py", 22 | "file_change_rates.py", 23 | "hours_estimate.py", 24 | "lifeline.py", 25 | "parallel_blame.py", 26 | "project_blame.py", 27 | "punchcard.py", 28 | "remote_fetch_and_cache_warming.py", # Added new example 29 | "repo_file_detail.py", 30 | "release_analytics.py", # Added new example 31 | ] 32 | 33 | 34 | @pytest.mark.slow 35 | def test_example_scripts(): 36 | """Test that all example scripts run without errors.""" 37 | for script in EXAMPLE_SCRIPTS: 38 | script_path = EXAMPLES_DIR / script 39 | assert script_path.exists(), f"Example script {script} not found" 40 | 41 | # Run the script with Python 42 | try: 43 | # Use the same Python interpreter that's running the tests 44 | python_executable = sys.executable 45 | result = subprocess.run( 46 | [python_executable, str(script_path)], 47 | cwd=EXAMPLES_DIR, 48 | capture_output=True, 49 | text=True, 50 | timeout=300, # 5 minute timeout per script 51 | ) 52 | 53 | # Check if the script ran successfully 54 | assert result.returncode == 0, ( 55 | f"Script {script} failed with return code {result.returncode}\n" 56 | f"stdout: {result.stdout}\n" 57 | f"stderr: {result.stderr}" 58 | ) 59 | 60 | except subprocess.TimeoutExpired: 61 | pytest.fail(f"Script {script} timed out after 5 minutes") 62 | except Exception as e: 63 | pytest.fail(f"Error running script {script}: {str(e)}") 64 | -------------------------------------------------------------------------------- /tests/test_utilities/test_check_api.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | import pytest 4 | 5 | from gitpandas.utilities.check_api import ( 6 | extract_objects, 7 | get_distinct_params, 8 | parse_docstring, 9 | ) 10 | 11 | 12 | class TestCheckAPI: 13 | def test_extract_objects(self): 14 | """Test extracting objects from a module.""" 15 | # Test with classes only 16 | objects = extract_objects(pytest, classes=True, functions=False) 17 | assert isinstance(objects, dict) 18 | assert all(inspect.isclass(v) for v in objects.values()) 19 | 20 | # Test with functions only 21 | objects = extract_objects(pytest, classes=False, functions=True) 22 | assert isinstance(objects, dict) 23 | assert all(inspect.isfunction(v) for v in objects.values()) 24 | 25 | # Test with both 26 | objects = extract_objects(pytest, classes=True, functions=True) 27 | assert isinstance(objects, dict) 28 | assert 
any(inspect.isclass(v) for v in objects.values())
29 |         assert any(inspect.isfunction(v) for v in objects.values())
30 | 
31 |     def test_parse_docstring(self):
32 |         """Test parsing docstrings to extract parameter information."""
33 |         # Test with a simple docstring
34 |         doc = """
35 |         Test function.
36 | 
37 |         :param arg1: First argument
38 |         :param arg2: Second argument
39 |         """
40 |         params = parse_docstring(doc)
41 |         assert len(params) == 2
42 |         assert params[0]["arg1"] == "First argument"
43 |         assert params[1]["arg2"] == "Second argument"
44 | 
45 |         # Test with no parameters
46 |         doc = "Test function with no parameters."
47 |         params = parse_docstring(doc)
48 |         assert len(params) == 0
49 | 
50 |     def test_get_distinct_params(self):
51 |         """Test getting distinct parameters from signatures."""
52 |         # Create test signatures
53 |         sigs = {
54 |             "func1": {"args": ["arg1", "arg2"]},
55 |             "func2": {"args": ["arg2", "arg3"]},
56 |             "func3": {"args": ["arg1", "arg3"]},
57 |         }
58 | 
59 |         # Get distinct parameters
60 |         params = get_distinct_params(sigs)
61 | 
62 |         # Should have all unique parameters
63 |         assert params == {"arg1", "arg2", "arg3"}
64 | 
-------------------------------------------------------------------------------- /.cursor/rules/python_standards.mdc: --------------------------------------------------------------------------------
1 | ---
2 | description: 
3 | globs: *.py
4 | ---
5 | # Python Coding Standards
6 | 
7 | ## Important Notes
8 | - Do not ever ever add test-case specific handling into the library code. Do not add handling for "special cases", "test cases" or any other logic branches that attempt to skip the normal business logic to get a test to pass.
9 | 
10 | ## Code Style
11 | - Follow PEP 8 style guide for Python code
12 | - Use 4 spaces for indentation (no tabs)
13 | - Maximum line length of 120 characters (as configured in pyproject.toml)
14 | - Use snake_case for variables, functions, and methods
15 | - Use CamelCase for classes
16 | - Use UPPER_CASE for constants
17 | - Add a blank line at the end of each file
18 | 
19 | ## Imports
20 | - Group imports in the following order:
21 |   1. Standard library imports
22 |   2. Related third-party imports
23 |   3. 
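Local application/library specific imports

  For example, a module's imports grouped per this rule (the module names are illustrative):

  ```python
  # 1. Standard library imports
  import os
  from pathlib import Path

  # 2. Related third-party imports
  import pandas as pd

  # 3. Local application/library specific imports
  from gitpandas import Repository
  ```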
24 | - Use absolute imports when possible
25 | - Avoid wildcard imports (`from module import *`)
26 | - Use import aliases for long module names
27 | 
28 | ## Documentation
29 | - Document all public modules, classes, methods, and functions
30 | - Use docstrings that follow the Google style guide
31 | - Include type hints for function parameters and return values
32 | - Document parameters, return values, and exceptions raised
33 | 
34 | ## Error Handling
35 | - Use specific exception types instead of generic ones
36 | - Handle exceptions at the appropriate level
37 | - Use context managers (`with` statements) for resource management
38 | - Avoid catching exceptions without proper handling
39 | 
40 | ## Code Organization
41 | - Keep functions and methods short and focused
42 | - Follow the Single Responsibility Principle
43 | - Use classes to encapsulate related functionality
44 | - Separate concerns into different modules
45 | 
46 | ## Testing
47 | - Write unit tests for all code
48 | - Use meaningful test names
49 | - Test both normal and edge cases
50 | - Mock external dependencies in tests
51 | 
52 | ## Performance
53 | - Prefer list/dict/set comprehensions over loops when appropriate
54 | - Use generators for large data sets
55 | - Profile code before optimizing
56 | - Consider using NumPy/Pandas for numerical operations
57 | 
58 | ## Tools
59 | - Use Ruff for code formatting (`make format`)
60 | - Use Ruff for linting and static analysis
61 | - Use mypy for type checking
62 | - Use isort for import sorting
63 | 
64 | ## Version Control
65 | - Write meaningful commit messages
66 | - Keep commits focused on a single change
67 | - Use feature branches for development
68 | - Review code before merging
-------------------------------------------------------------------------------- /examples/parallel_blame.py: --------------------------------------------------------------------------------
1 | """
2 | Example of analyzing blame information in parallel.
3 | 
4 | This example demonstrates:
5 | 1. Creating a repository instance
6 | 2. Analyzing blame information sequentially
7 | 3. Analyzing blame information in parallel (if joblib is available)
8 | 4. 
Comparing performance between sequential and parallel analysis
9 | """
10 | 
11 | import sys
12 | import time
13 | 
14 | from gitpandas import Repository
15 | 
16 | __author__ = "willmcginnis"
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     print("Initializing repository...")
21 | 
22 |     # Use pygeohash repository - a good size for examples
23 |     repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git")
24 | 
25 |     # Define analysis parameters
26 |     branch = "master"  # Use master instead of main
27 |     include_globs = ["*.py"]  # Focus on Python files only
28 |     limit = 20  # Limit to 20 commits for faster analysis
29 | 
30 |     print(f"\nAnalyzing blame information for {branch} branch")
31 |     print(f"Including files: {', '.join(include_globs)}")
32 |     print(f"Analyzing {limit} commits")
33 | 
34 |     # Sequential analysis
35 |     print("\nRunning sequential analysis...")
36 |     start_time = time.time()
37 |     blame = repo.cumulative_blame(branch=branch, include_globs=include_globs, limit=limit)
38 |     seq_time = time.time() - start_time
39 |     print(f"Sequential analysis completed in {seq_time:.2f} seconds")
40 |     print("\nSample of results:")
41 |     print(blame.head())
42 | 
43 |     # Try parallel analysis if joblib is available
44 |     try:
45 |         import joblib  # noqa: F401
46 | 
47 |         print("\nRunning parallel analysis with 4 workers...")
48 |         start_time = time.time()
49 |         blame = repo.parallel_cumulative_blame(branch=branch, include_globs=include_globs, limit=limit, workers=4)
50 |         par_time = time.time() - start_time
51 |         print(f"Parallel analysis completed in {par_time:.2f} seconds")
52 |         print("\nSample of results:")
53 |         print(blame.head())
54 | 
55 |         # Compare performance
56 |         print("\nPerformance comparison:")
57 |         print(f"  Sequential time: {seq_time:.2f} seconds")
58 |         print(f"  Parallel time: {par_time:.2f} seconds")
59 |         print(f"  Speedup: {seq_time / par_time:.2f}x")
60 |     except ImportError:
61 |         print("\nParallel analysis skipped: joblib package not installed.")
62 |         print("To enable parallel analysis, install joblib:")
63 |         print("  pip install joblib")
64 |         sys.exit(0)  # Exit with success since this is an expected case
-------------------------------------------------------------------------------- /CONTRIBUTING.md: --------------------------------------------------------------------------------
1 | Contributing Code
2 | =================
3 | 
4 | Git-pandas is a Python library for analyzing git repositories with pandas. Our goal is twofold:
5 | 
6 | * To make the analysis of git repositories or collections of repositories simple, fast, and pleasant.
7 | * To give developers interested in data analysis a simple interface to datasets that they already understand and have on hand (git data)
8 | 
9 | With that in mind, we welcome and in fact would love some help.
10 | 
11 | How to Contribute
12 | =================
13 | 
14 | The preferred workflow to contribute to git-pandas is:
15 | 
16 | 1. Fork this repository into your own GitHub account.
17 | 2. Clone the fork on your account onto your local disk:
18 | 
19 |     $ git clone git@github.com:YourLogin/git-pandas.git
20 |     $ cd git-pandas
21 | 
22 | 3. Create a branch for your new awesome feature, do not work in the master branch:
23 | 
24 |     $ git checkout -b new-awesome-feature
25 | 
26 | 4. Write some code, or docs, or tests.
27 | 5. When you are done, submit a pull request.
28 | 
29 | Guidelines
30 | ==========
31 | 
32 | Git-pandas is still a very young project, but we do have a few guiding principles:
33 | 
34 | 1. 
Maintain feature and API parity between Repository and ProjectDirectory
35 | 2. Write detailed docstrings in sphinx format
36 | 3. Slow or potentially memory intensive functions should have a limit option
37 | 
38 | Running Tests
39 | =============
40 | 
41 | Test coverage is admittedly pretty bad right now, so help out by writing tests for new code. To run the tests, use:
42 | 
43 |     $ make test
44 |     $ make test-all
45 | 
46 | Easy Issues / Getting Started
47 | =============================
48 | 
49 | There are a number of issues on the near-term horizon that would be great to have help with.
50 | 
51 | 1. Diff: it would be really nice to be able to call a function with 2 revs and return the diff as a dataframe. So columns for line number, filename, path, change, author, timestamp, etc.
52 | 2. File-level tracking: it would be really cool to be able to get a dataframe of one file's history in great detail. It likely would be a subset of the existing file change history function.
53 | 3. Cross-Branch Analytics: finding differences between different branches of a single repository. Or aggregating the results of other functions across multiple branches.
54 | 4. Verbose Mode: add logging in more functions when verbose is set to True.
55 | 5. Hierarchical bus factor: what's the bus factor of a file, directory, repo, project directory, etc.
56 | 6. Language analytics: what languages do we use most, what are bus factors of those languages? Who should we hire next?
-------------------------------------------------------------------------------- /.cursor/rules/sphinx_docs_standards.mdc: --------------------------------------------------------------------------------
1 | ---
2 | description: 
3 | globs: docs/*
4 | ---
5 | # Sphinx Documentation Standards
6 | 
7 | ## Project Setup
8 | - Use `sphinx-quickstart` to initialize the documentation structure
9 | - Configure `conf.py` with appropriate project information
10 | - Use the ReadTheDocs theme for consistent styling
11 | - Enable necessary extensions (e.g., `autodoc`, `napoleon`, `viewcode`)
12 | 
13 | ## Directory Structure
14 | - Keep documentation source files in the `docs/` directory
15 | - Organize documentation into logical sections (e.g., user guide, API reference)
16 | - Use a clear and consistent file naming convention
17 | - Include an `index.rst` file as the documentation entry point
18 | 
19 | ## Documentation Style
20 | - Write in clear, concise language
21 | - Use present tense and active voice
22 | - Be consistent with terminology
23 | - Include examples where appropriate
24 | - Target the appropriate audience (users, developers, etc.)
25 | 26 | ## reStructuredText Formatting 27 | - Use proper heading hierarchy (=, -, ~, ^, ") 28 | - Use bullet lists for unordered items and numbered lists for sequences 29 | - Use code blocks with appropriate language for syntax highlighting 30 | - Use cross-references to link between documentation sections 31 | - Include images and diagrams where they add value 32 | 33 | ## API Documentation 34 | - Use autodoc to generate API documentation from docstrings 35 | - Document all public modules, classes, methods, and functions 36 | - Follow Google or NumPy docstring style consistently 37 | - Include type information for parameters and return values 38 | - Document exceptions that may be raised 39 | 40 | ## Examples and Tutorials 41 | - Include practical examples for common use cases 42 | - Provide step-by-step tutorials for complex operations 43 | - Ensure all examples are tested and working 44 | - Use `literalinclude` to include code examples from actual source files 45 | 46 | ## Building and Testing 47 | - Build documentation locally before committing changes 48 | - Check for and fix all warnings during the build process 49 | - Verify that cross-references work correctly 50 | - Test documentation on different screen sizes 51 | 52 | ## Deployment 53 | - Configure automatic documentation builds on ReadTheDocs or GitHub Pages 54 | - Include a link to the documentation in the project README 55 | - Version documentation to match software releases 56 | - Provide a changelog or release notes section 57 | 58 | ## Maintenance 59 | - Keep documentation up-to-date with code changes 60 | - Review and update documentation during each release cycle 61 | - Address user feedback and questions in the documentation 62 | - Remove outdated or deprecated information -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "git-pandas" 7 | version = "2.5.0" 8 | description = "A utility for interacting with data from git repositories as Pandas dataframes" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | license = {text = "BSD"} 12 | authors = [ 13 | {name = "Will McGinnis", email = "will@mcginniscommawill.com"}, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Developers", 18 | "Programming Language :: Python :: 3", 19 | ] 20 | keywords = ["git", "pandas", "data", "analysis"] 21 | 22 | dependencies = [ 23 | "gitpython>=1.0.0", 24 | "numpy>=1.9.0", 25 | "pandas>=2.0.0", 26 | "requests", 27 | "redis", 28 | "coverage>=5.0.0", 29 | "importlib-metadata>=1.0; python_version<'3.8'", 30 | ] 31 | 32 | [project.urls] 33 | Documentation = "https://gitpandas.mcginniscommawill.com" 34 | Source = "https://github.com/wdm0006/git-pandas" 35 | Issues = "https://github.com/wdm0006/git-pandas/issues" 36 | 37 | [project.optional-dependencies] 38 | examples = [ 39 | "matplotlib", 40 | "lifelines", 41 | ] 42 | dev = [ 43 | "pytest>=7.0.0", 44 | "pytest-cov>=4.0.0", 45 | "sphinx>=7.0.0", 46 | "sphinx-rtd-theme>=1.0.0", 47 | "ruff>=0.1.0", 48 | "matplotlib", 49 | "joblib", 50 | "pytest-mock" 51 | ] 52 | all = [ 53 | "matplotlib", 54 | "lifelines", 55 | "pytest>=7.0.0", 56 | "pytest-cov>=4.0.0", 57 | "sphinx>=7.0.0", 58 | "sphinx-rtd-theme>=1.0.0", 59 | "ruff>=0.1.0", 60 | ] 61 | 62 | [tool.hatch.build.targets.wheel] 63 | packages = ["gitpandas"] 
64 | 65 | [tool.pytest.ini_options] 66 | testpaths = ["tests"] 67 | python_files = ["test_*.py"] 68 | python_functions = ["test_*"] 69 | python_classes = ["Test*"] 70 | addopts = "--verbose --strict-markers" 71 | markers = [ 72 | "slow: marks tests as slow (run with pytest -m slow)", 73 | "remote: marks tests that require remote access", 74 | "redis: marks tests that require Redis", 75 | ] 76 | 77 | [tool.coverage.run] 78 | source = ["gitpandas"] 79 | 80 | [tool.ruff] 81 | target-version = "py311" 82 | line-length = 120 83 | exclude=["docs/**"] 84 | 85 | [tool.ruff.lint] 86 | select = ["E", "F", "B", "I", "N", "UP", "C4", "SIM", "ERA"] 87 | ignore = ["UP017"] 88 | 89 | [tool.ruff.lint.isort] 90 | known-first-party = ["gitpandas"] 91 | 92 | [tool.ruff.lint.flake8-annotations] 93 | allow-star-arg-any = true 94 | suppress-none-returning = true 95 | 96 | [tool.ruff.format] 97 | quote-style = "double" 98 | indent-style = "space" 99 | skip-magic-trailing-comma = false 100 | line-ending = "auto" -------------------------------------------------------------------------------- /examples/lifeline.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing file lifelines and ownership changes. 3 | 4 | This example demonstrates: 5 | 1. Creating a repository instance 6 | 2. Analyzing file change history 7 | 3. Identifying ownership changes and refactoring events 8 | 4. Visualizing survival curves for file owners 9 | """ 10 | 11 | import os 12 | import time 13 | 14 | import matplotlib 15 | 16 | matplotlib.use("Agg") # Set the backend to Agg before importing pyplot 17 | 18 | from gitpandas import Repository 19 | from gitpandas.utilities.plotting import plot_lifeline 20 | 21 | __author__ = "willmcginnis" 22 | 23 | 24 | if __name__ == "__main__": 25 | print("Initializing repository...") 26 | start_time = time.time() 27 | 28 | # Use pygeohash repository - a good size for examples 29 | repo = Repository(working_dir="https://github.com/wdm0006/pygeohash.git") 30 | 31 | print("\nAnalyzing file change history...") 32 | print("Using a limit of 20 commits for faster analysis") 33 | 34 | # Get file change history with limits 35 | changes = repo.file_change_history( 36 | branch="master", # Use master instead of main 37 | limit=20, # Limit to 20 commits 38 | include_globs=["*.py"], # Focus on Python files only 39 | ) 40 | 41 | print("\nDataFrame structure:") 42 | print("\nColumns:", changes.columns.tolist()) 43 | print("\nSample data:") 44 | print(changes.head()) 45 | 46 | print("\nIdentifying ownership changes...") 47 | # Identify ownership changes 48 | ownership_changes = changes.groupby("filename").filter(lambda x: len(x["committer"].unique()) > 1) 49 | 50 | print("\nIdentifying refactoring events...") 51 | # Identify refactoring events (significant changes to files) 52 | # Consider changes with >50 total lines changed (insertions + deletions) as refactoring 53 | changes["total_changes"] = changes["insertions"] + changes["deletions"] 54 | refactoring = changes[changes["total_changes"] > 50] 55 | 56 | print("\nGenerating visualization...") 57 | # Create the plot and save it 58 | fig = plot_lifeline(changes, ownership_changes, refactoring) 59 | output_path = os.path.join("img", "lifeline.png") 60 | fig.savefig(output_path) 61 | print(f"Plot saved to {output_path}") 62 | 63 | end_time = time.time() 64 | print(f"\nAnalysis completed in {end_time - start_time:.2f} seconds") 65 | 66 | # Print summary statistics 67 | print("\nSummary:") 68 | print(f"Total files 
analyzed: {len(changes.filename.unique())}") 69 | print(f"Total ownership changes: {len(ownership_changes)}") 70 | print(f"Total refactoring events: {len(refactoring)}") 71 | print("\nRefactoring events details:") 72 | print(refactoring[["filename", "total_changes", "message"]].to_string()) 73 | -------------------------------------------------------------------------------- /docs/source/contributors.rst: -------------------------------------------------------------------------------- 1 | Contributing to Git-Pandas 2 | ========================== 3 | 4 | We welcome contributions to Git-Pandas! Our goal is to make Git repository analysis simple, fast, and accessible to developers interested in data analysis. 5 | 6 | Getting Started 7 | --------------- 8 | 9 | 1. Fork the repository on GitHub 10 | 2. Clone your fork locally: 11 | 12 | .. code-block:: shell 13 | 14 | git clone git@github.com:YourLogin/git-pandas.git 15 | cd git-pandas 16 | 17 | 3. Create a feature branch: 18 | 19 | .. code-block:: shell 20 | 21 | git checkout -b feature/your-awesome-feature 22 | 23 | 4. Make your changes 24 | 5. Submit a pull request 25 | 26 | Development Guidelines 27 | ---------------------- 28 | 29 | Code Style 30 | ~~~~~~~~~~ 31 | 32 | * Follow PEP 8 style guide 33 | * Use 4 spaces for indentation 34 | * Maximum line length of 88 characters (Black default) 35 | * Use snake_case for variables and functions 36 | * Use CamelCase for classes 37 | * Add docstrings following Google style guide 38 | 39 | Documentation 40 | ~~~~~~~~~~~~~ 41 | 42 | * Write detailed docstrings for all public APIs 43 | * Include type hints for function parameters and return values 44 | * Document exceptions that may be raised 45 | * Add examples where appropriate 46 | * Update the documentation for any new features 47 | 48 | Testing 49 | ~~~~~~~ 50 | 51 | * Write unit tests for new code 52 | * Maintain or improve test coverage 53 | * Run tests before submitting PR: 54 | 55 | .. code-block:: shell 56 | 57 | make test 58 | make coverage 59 | 60 | API Design Principles 61 | ~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | * Maintain feature parity between Repository and ProjectDirectory 64 | * Include limit options for memory-intensive functions 65 | * Keep the API simple and intuitive 66 | * Consider performance implications 67 | 68 | Current Development Focus 69 | ------------------------- 70 | 71 | High Priority 72 | ~~~~~~~~~~~~~ 73 | 74 | * Improve test coverage with proper unit tests 75 | * Add diff functionality between revisions 76 | * Enhance documentation with more examples and visualizations 77 | * Streamline documentation deployment 78 | 79 | Recently Completed (v2.5.0) 80 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 81 | 82 | * ✅ File-wise bus factor analysis (completed) 83 | * ✅ Cache management methods (invalidate_cache, get_cache_stats) 84 | * ✅ Performance documentation and optimization guide 85 | * ✅ Remote operations (safe_fetch_remote, warm_cache, bulk operations) 86 | * ✅ Enhanced caching system with timestamps and metadata 87 | 88 | Feature Ideas 89 | ~~~~~~~~~~~~~ 90 | 91 | * Cross-branch analytics 92 | * Enhanced verbose logging 93 | * Language analytics and insights 94 | * Code complexity metrics 95 | * Contributor network analysis 96 | 97 | Development Setup 98 | ----------------- 99 | 100 | 1. Install development dependencies: 101 | 102 | .. code-block:: shell 103 | 104 | make install-dev 105 | 106 | 2. Set up pre-commit hooks: 107 | 108 | .. code-block:: shell 109 | 110 | make pre-commit 111 | 112 | 3. Run tests: 113 | 114 | .. 
code-block:: shell 115 | 116 | make test 117 | 118 | 4. Build documentation: 119 | 120 | .. code-block:: shell 121 | 122 | make docs 123 | 124 | Questions? 125 | ---------- 126 | 127 | * Open an issue for bug reports or feature requests 128 | * Join our discussions on GitHub 129 | * Check existing issues for similar problems 130 | 131 | Thank you for contributing to Git-Pandas! 132 | 133 | -------------------------------------------------------------------------------- /gitpandas/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | # Setup library logging 5 | logger = logging.getLogger("gitpandas") # Use a consistent name 6 | logger.addHandler(logging.NullHandler()) 7 | 8 | 9 | def get_logger(name: str | None = None) -> logging.Logger: 10 | """Get a logger instance for the specified name. 11 | 12 | Args: 13 | name: The name of the logger to get. If None, returns the main gitpandas logger. 14 | If specified, returns a child logger of the main gitpandas logger. 15 | 16 | Returns: 17 | logging.Logger: The requested logger instance. 18 | """ 19 | if name is None: 20 | return logger 21 | return logger.getChild(name) 22 | 23 | 24 | def set_log_level(level: int | str) -> None: 25 | """Set the logging level for the gitpandas library. 26 | 27 | Args: 28 | level: The logging level to set. Can be either a string (e.g., 'INFO') 29 | or an integer (e.g., logging.INFO). 30 | """ 31 | logger.setLevel(level) 32 | 33 | 34 | def add_stream_handler( 35 | level: int | str = logging.INFO, 36 | format_string: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 37 | **handler_kwargs: Any, 38 | ) -> None: 39 | """Add a stream handler to the gitpandas logger. 40 | 41 | Args: 42 | level: The logging level for the handler. Defaults to INFO. 43 | format_string: The format string for log messages. 44 | **handler_kwargs: Additional keyword arguments to pass to StreamHandler. 45 | """ 46 | # Avoid adding duplicate handlers 47 | if any(isinstance(h, logging.StreamHandler) for h in logger.handlers): 48 | logger.warning("StreamHandler already exists for gitpandas logger.") 49 | return 50 | 51 | handler = logging.StreamHandler(**handler_kwargs) 52 | handler.setLevel(level) 53 | formatter = logging.Formatter(format_string) 54 | handler.setFormatter(formatter) 55 | logger.addHandler(handler) 56 | 57 | 58 | def add_file_handler( 59 | filename: str, 60 | level: int | str = logging.INFO, 61 | format_string: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 62 | **handler_kwargs: Any, 63 | ) -> None: 64 | """Add a file handler to the gitpandas logger. 65 | 66 | Args: 67 | filename: The name of the file to log to. 68 | level: The logging level for the handler. Defaults to INFO. 69 | format_string: The format string for log messages. 70 | **handler_kwargs: Additional keyword arguments to pass to FileHandler. 
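    Example:
        A minimal sketch (``app.log`` is a placeholder path)::

            from gitpandas.logging import set_log_level, add_file_handler

            set_log_level("DEBUG")
            add_file_handler("app.log", level="DEBUG")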
71 | """ 72 | # Avoid adding duplicate file handlers for the same file 73 | if any(isinstance(h, logging.FileHandler) and h.baseFilename == filename for h in logger.handlers): 74 | logger.warning(f"FileHandler for {filename} already exists for gitpandas logger.") 75 | return 76 | 77 | handler = logging.FileHandler(filename, **handler_kwargs) 78 | handler.setLevel(level) 79 | formatter = logging.Formatter(format_string) 80 | handler.setFormatter(formatter) 81 | logger.addHandler(handler) 82 | 83 | 84 | def remove_all_handlers() -> None: 85 | """Remove all handlers from the gitpandas logger (except the default NullHandler).""" 86 | for handler in logger.handlers[:]: 87 | if not isinstance(handler, logging.NullHandler): 88 | logger.removeHandler(handler) 89 | 90 | 91 | __all__ = [ 92 | "logger", 93 | "get_logger", 94 | "set_log_level", 95 | "add_stream_handler", 96 | "add_file_handler", 97 | "remove_all_handlers", 98 | ] 99 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup test test-all lint format clean docs build run-example test-single mcp gitnoc 2 | 3 | # Use uv for all Python operations 4 | PYTHON = python 5 | UV = uv 6 | 7 | # Project settings 8 | PACKAGE_NAME = gitpandas 9 | TESTS_DIR = tests 10 | DOCS_DIR = docs 11 | BUILD_DIR = dist 12 | EXAMPLES_DIR = examples 13 | 14 | setup: 15 | $(UV) pip install -e ".[dev]" 16 | 17 | setup-examples: 18 | $(UV) pip install -e ".[examples]" 19 | 20 | setup-all: 21 | $(UV) pip install -e ".[all]" 22 | 23 | test: setup-all 24 | MPLBACKEND=Agg $(UV) run pytest $(TESTS_DIR) --cov=$(PACKAGE_NAME) --cov-report=term-missing -m "not slow" 25 | 26 | test-single: setup-all 27 | @if [ "$(test)" = "" ]; then \ 28 | echo "Error: Please specify a test using test="; \ 29 | echo "Example: make test-single test=tests/test_Repository/test_advanced.py::TestRepositoryAdvanced::test_parallel_cumulative_blame"; \ 30 | exit 1; \ 31 | fi 32 | MPLBACKEND=Agg $(UV) run pytest $(test) -v 33 | 34 | test-all: 35 | MPLBACKEmND=Agg $(UV) run pytest $(TESTS_DIR) --cov=$(PACKAGE_NAME) --cov-report=term-missing 36 | 37 | lint: 38 | $(UV) run ruff check --fix --unsafe-fixes . 39 | 40 | format: 41 | $(UV) run ruff format . 42 | 43 | docs: 44 | $(MAKE) -C $(DOCS_DIR) html 45 | 46 | docs-serve: 47 | cd $(DOCS_DIR)/build/html && $(PYTHON) -m http.server 48 | 49 | clean: 50 | rm -rf $(BUILD_DIR) 51 | rm -rf $(DOCS_DIR)/build 52 | rm -rf .pytest_cache 53 | rm -rf .ruff_cache 54 | rm -rf .coverage 55 | find . -type d -name __pycache__ -exec rm -rf {} + 56 | find . -type f -name "*.pyc" -delete 57 | 58 | build: 59 | $(UV) pip build 60 | 61 | publish: 62 | $(UV) pip publish 63 | 64 | env-export: 65 | $(UV) pip freeze > requirements.txt 66 | 67 | run-example: 68 | @if [ "$(example)" = "" ]; then \ 69 | echo "Error: Please specify an example to run using example="; \ 70 | echo "Available examples:"; \ 71 | ls $(EXAMPLES_DIR)/*.py | sed 's/$(EXAMPLES_DIR)\///' | sed 's/\.py$$//'; \ 72 | exit 1; \ 73 | fi 74 | @if [ ! 
-f "$(EXAMPLES_DIR)/$(example).py" ]; then \ 75 | echo "Error: Example '$(example)' not found in $(EXAMPLES_DIR)"; \ 76 | exit 1; \ 77 | fi 78 | MPLBACKEND=Agg $(UV) run python $(EXAMPLES_DIR)/$(example).py 79 | 80 | mcp: 81 | $(UV) run python mcp_server/server.py 82 | 83 | gitnoc: 84 | $(UV) run python gitnoc/app.py 85 | 86 | help: 87 | @echo "Available commands:" 88 | @echo " setup Install the package in development mode" 89 | @echo " setup-examples Install the package with examples dependencies" 90 | @echo " setup-all Install the package with all dependencies" 91 | @echo " setup-gitnoc Install GitNOC dependencies" 92 | @echo " test Run tests with pytest (excluding slow tests)" 93 | @echo " test-single Run a single test (usage: make test-single test=)" 94 | @echo " test-all Run all tests including slow tests" 95 | @echo " lint Run ruff linter" 96 | @echo " format Format code with ruff" 97 | @echo " docs Build documentation with Sphinx" 98 | @echo " docs-serve Serve documentation locally" 99 | @echo " clean Remove build artifacts" 100 | @echo " build Build distribution packages" 101 | @echo " publish Publish package to PyPI" 102 | @echo " env-export Export dependencies to requirements.txt" 103 | @echo " run-example Run a specific example (usage: make run-example example=)" 104 | @echo " mcp Run the MCP server" 105 | @echo " gitnoc Run the GitNOC Streamlit app" -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared pytest fixtures for git-pandas tests. 3 | """ 4 | 5 | import subprocess 6 | import pytest 7 | 8 | __author__ = "willmcginnis" 9 | 10 | 11 | def get_default_branch(): 12 | """Get the system's default branch name for new repositories.""" 13 | try: 14 | # Try to get the configured default branch 15 | result = subprocess.run( 16 | ["git", "config", "--global", "init.defaultBranch"], 17 | capture_output=True, 18 | text=True, 19 | check=False 20 | ) 21 | if result.returncode == 0 and result.stdout.strip(): 22 | return result.stdout.strip() 23 | except Exception: 24 | pass 25 | 26 | # If no default branch is configured, create a temporary repo to see what git actually creates 27 | import tempfile 28 | import os 29 | 30 | try: 31 | with tempfile.TemporaryDirectory() as temp_dir: 32 | test_repo_path = os.path.join(temp_dir, "test_repo") 33 | 34 | # Initialize a repo and make an initial commit to see what branch git creates 35 | init_result = subprocess.run( 36 | ["git", "init", test_repo_path], 37 | capture_output=True, 38 | text=True, 39 | check=False 40 | ) 41 | 42 | if init_result.returncode == 0: 43 | # Configure user for the test repo 44 | subprocess.run( 45 | ["git", "-C", test_repo_path, "config", "user.name", "Test"], 46 | capture_output=True, 47 | check=False 48 | ) 49 | subprocess.run( 50 | ["git", "-C", test_repo_path, "config", "user.email", "test@example.com"], 51 | capture_output=True, 52 | check=False 53 | ) 54 | 55 | # Create a file and commit to establish a branch 56 | test_file = os.path.join(test_repo_path, "test.txt") 57 | with open(test_file, "w") as f: 58 | f.write("test") 59 | 60 | subprocess.run( 61 | ["git", "-C", test_repo_path, "add", "test.txt"], 62 | capture_output=True, 63 | check=False 64 | ) 65 | subprocess.run( 66 | ["git", "-C", test_repo_path, "commit", "-m", "initial"], 67 | capture_output=True, 68 | check=False 69 | ) 70 | 71 | # Check what branch was created 72 | branch_result = subprocess.run( 73 | ["git", 
"-C", test_repo_path, "branch", "--show-current"], 74 | capture_output=True, 75 | text=True, 76 | check=False 77 | ) 78 | 79 | if branch_result.returncode == 0 and branch_result.stdout.strip(): 80 | return branch_result.stdout.strip() 81 | except Exception: 82 | pass 83 | 84 | # Final fallback to 'master' for maximum compatibility 85 | return "master" 86 | 87 | 88 | @pytest.fixture(scope="session") 89 | def default_branch(): 90 | """Pytest fixture to get the default branch name.""" 91 | return get_default_branch() 92 | 93 | 94 | def pytest_configure(config): 95 | """Register custom markers.""" 96 | config.addinivalue_line("markers", "slow: marks tests as slow (deselect with '-m \"not slow\"')") 97 | -------------------------------------------------------------------------------- /examples/commit_history.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of analyzing commit history in a repository. 3 | 4 | This example demonstrates: 5 | 1. Creating repository and project directory instances 6 | 2. Analyzing commit history with reasonable limits 7 | 3. Viewing committer statistics 8 | 4. Analyzing file changes by extension 9 | """ 10 | 11 | import os 12 | import time 13 | 14 | import numpy as np 15 | from definitions import GIT_PANDAS_DIR 16 | from pandas import set_option 17 | 18 | from gitpandas import ProjectDirectory, Repository 19 | 20 | __author__ = "willmcginnis" 21 | 22 | 23 | def project(path): 24 | """Analyze commit history for a project directory.""" 25 | print("\nAnalyzing project directory...") 26 | start_time = time.time() 27 | 28 | p = ProjectDirectory(working_dir=path) 29 | 30 | # Get commit history with reasonable limits 31 | print("\nGetting commit history (last 7 days, limited to 100 commits)...") 32 | ch = p.commit_history( 33 | "master", 34 | limit=100, 35 | include_globs=["*.py"], 36 | ignore_globs=["lib/*", "docs/*", "test/*", "tests/*", "tests_t/*"], 37 | days=7, 38 | ) 39 | print("\nRecent commits:") 40 | print(ch.head()) 41 | 42 | # Get committer statistics 43 | committers = set(ch["committer"].values) 44 | print(f"\nFound {len(committers)} unique committers:") 45 | for committer in sorted(committers): 46 | print(f" - {committer}") 47 | 48 | # Calculate contributions 49 | print("\nContributions by committer:") 50 | attr = ch.reindex(columns=["committer", "lines", "insertions", "deletions", "net"]).groupby(["committer"]) 51 | attr = attr.agg({"lines": np.sum, "insertions": np.sum, "deletions": np.sum, "net": np.sum}) 52 | print(attr) 53 | 54 | print(f"\nProject analysis completed in {time.time() - start_time:.2f} seconds") 55 | 56 | 57 | def repository(path): 58 | """Analyze commit history for a single repository.""" 59 | print("\nAnalyzing repository...") 60 | start_time = time.time() 61 | 62 | # Build repository object 63 | ignore_dirs = ["docs/*", "tests/*", "Data/*"] 64 | r = Repository(path, default_branch="main") 65 | 66 | # Check if bare 67 | print("\nRepository type:") 68 | print(f" Bare repository: {r.is_bare()}") 69 | 70 | # Get commit history with limits 71 | print("\nGetting commit history (limited to 50 commits)...") 72 | ch = r.commit_history("HEAD", limit=50, include_globs=["*.py"], ignore_globs=ignore_dirs) 73 | print("\nRecent commits:") 74 | print(ch.head(5)) 75 | 76 | # Get committer statistics 77 | committers = set(ch["committer"].values) 78 | print(f"\nFound {len(committers)} unique committers:") 79 | for committer in sorted(committers): 80 | print(f" - {committer}") 81 | 82 | # Calculate 
contributions 83 | print("\nContributions by committer:") 84 | attr = ch.reindex(columns=["committer", "lines", "insertions", "deletions"]).groupby(["committer"]) 85 | attr = attr.agg({"lines": np.sum, "insertions": np.sum, "deletions": np.sum}) 86 | print(attr) 87 | 88 | # Get file change history with limits 89 | print("\nAnalyzing file changes (limited to 50 commits)...") 90 | fh = r.file_change_history("HEAD", limit=50, ignore_globs=ignore_dirs) 91 | fh["ext"] = fh["filename"].map(lambda x: x.split(".")[-1]) 92 | print("\nRecent file changes:") 93 | print(fh.head(10)) 94 | 95 | # Analyze by extension 96 | print("\nChanges by file extension:") 97 | etns = fh.reindex(columns=["ext", "insertions", "deletions"]).groupby(["ext"]) 98 | etns = etns.agg({"insertions": np.sum, "deletions": np.sum}) 99 | print(etns) 100 | 101 | print(f"\nRepository analysis completed in {time.time() - start_time:.2f} seconds") 102 | 103 | 104 | if __name__ == "__main__": 105 | # Configure pandas display options 106 | set_option("display.max_rows", 500) 107 | set_option("display.max_columns", 500) 108 | set_option("display.width", 1000) 109 | 110 | path = os.path.abspath(GIT_PANDAS_DIR) 111 | project(path) 112 | repository(path) 113 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. git-pandas documentation master file, created by 2 | sphinx-quickstart on Sun Nov 8 21:21:04 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Git-Pandas Documentation 7 | =================================== 8 | 9 | Git-Pandas is a Python library that transforms Git repository data into pandas DataFrames, making it easy to analyze your codebase's history and development patterns. Built on top of GitPython, it provides a simple interface for extracting insights from your Git repositories. 10 | 11 | .. image:: https://raw.githubusercontent.com/wdm0006/git-pandas/master/examples/img/githubblame.png 12 | :alt: Cumulative Blame Visualization 13 | :align: center 14 | 15 | Quick Start 16 | ----------- 17 | 18 | Install Git-Pandas using pip: 19 | 20 | .. code-block:: bash 21 | 22 | pip install git-pandas 23 | 24 | Basic Usage 25 | ~~~~~~~~~~~ 26 | 27 | Analyze a single repository: 28 | 29 | .. code-block:: python 30 | 31 | from gitpandas import Repository 32 | 33 | # Create a repository instance 34 | repo = Repository('/path/to/repo') 35 | 36 | # Get commit history with filtering 37 | commits_df = repo.commit_history( 38 | branch='main', 39 | ignore_globs=['*.pyc'], 40 | include_globs=['*.py'] 41 | ) 42 | 43 | # Analyze blame information 44 | blame_df = repo.blame(by='repository') 45 | 46 | # Calculate bus factor 47 | bus_factor_df = repo.bus_factor() 48 | 49 | Analyze multiple repositories: 50 | 51 | .. 
code-block:: python 52 | 53 | from gitpandas import ProjectDirectory 54 | project = ProjectDirectory('/path/to/project') 55 | 56 | Key Features 57 | ------------ 58 | 59 | * **Repository Analysis**: Extract commit history, file changes, and blame information 60 | * **Project Insights**: Calculate bus factor and analyze repository metrics 61 | * **Multi-Repository Support**: Analyze multiple repositories together 62 | * **Remote Operations**: Safely fetch changes from remote repositories 63 | * **Cache Warming**: Pre-populate caches for improved performance 64 | * **Bulk Operations**: Efficiently process multiple repositories in parallel 65 | * **Performance Optimization**: Advanced caching support and glob-based filtering 66 | 67 | Core Components 68 | --------------- 69 | 70 | The library is built around two main components: 71 | 72 | Repository 73 | ~~~~~~~~~~ 74 | A wrapper around a single Git repository that provides: 75 | 76 | * Commit history analysis with filtering options 77 | * File change tracking and blame information 78 | * Branch existence checking and repository status 79 | * Bus factor calculation and repository metrics 80 | * Punchcard statistics generation 81 | 82 | ProjectDirectory 83 | ~~~~~~~~~~~~~~~~ 84 | A collection of Git repositories that enables: 85 | 86 | * Analysis across multiple repositories 87 | * Aggregated metrics and statistics 88 | * Project-level insights 89 | 90 | Common Parameters 91 | ----------------- 92 | 93 | Most analysis methods support these filtering parameters: 94 | 95 | * **branch**: Branch to analyze (defaults to repository's default branch) 96 | * **limit**: Maximum number of commits to analyze 97 | * **days**: Limit analysis to last N days 98 | * **ignore_globs**: List of glob patterns for files to ignore 99 | * **include_globs**: List of glob patterns for files to include 100 | * **by**: How to group results (usually 'repository' or 'file') 101 | 102 | Documentation 103 | ------------- 104 | 105 | For detailed information about the components and their usage, see: 106 | 107 | .. toctree:: 108 | :maxdepth: 1 109 | :caption: Contents: 110 | 111 | repository 112 | project 113 | cache 114 | remote_operations 115 | performance 116 | usecases 117 | contributors 118 | 119 | Additional Resources 120 | -------------------- 121 | 122 | * :ref:`genindex` - Complete API reference 123 | * :ref:`modindex` - Module index 124 | * :ref:`search` - Search the documentation 125 | 126 | License 127 | ------- 128 | 129 | This project is BSD licensed (see LICENSE.md) 130 | 131 | -------------------------------------------------------------------------------- /docs/source/project.rst: -------------------------------------------------------------------------------- 1 | Project Directory 2 | ================= 3 | 4 | The ProjectDirectory class enables analysis across multiple Git repositories. It can aggregate metrics and insights from multiple repositories into a single output.
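For example, a minimal sketch of aggregated analysis (the repository paths below are placeholders):

.. code-block:: python

    from gitpandas import ProjectDirectory

    # Two repositories analyzed as a single project
    project = ProjectDirectory(working_dir=['/path/to/repo1/', '/path/to/repo2/'])

    # Commit history from every repository, in one DataFrame
    commits = project.commit_history(branch='main', limit=100)

    # Bus factor aggregated over the whole project
    print(project.bus_factor(by='projectd'))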
5 | 6 | Overview 7 | -------- 8 | 9 | The ProjectDirectory class provides: 10 | 11 | * Analysis across multiple repositories 12 | * Aggregated metrics and statistics 13 | * Project-level insights 14 | * Multi-repository bus factor analysis 15 | * Consolidated commit history and blame information 16 | 17 | Creating a ProjectDirectory 18 | --------------------------- 19 | 20 | You can create a ProjectDirectory object in three ways: 21 | 22 | Directory of Repositories 23 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 24 | 25 | Create a ProjectDirectory from a directory containing multiple repositories: 26 | 27 | .. code-block:: python 28 | 29 | from gitpandas import ProjectDirectory 30 | project = ProjectDirectory( 31 | working_dir='/path/to/dir/', 32 | ignore_repos=['repo_to_ignore'], 33 | verbose=True, 34 | default_branch='main' # Optional, will auto-detect if not specified 35 | ) 36 | 37 | The `ignore_repos` parameter can be a list of repository names to exclude. This method uses `os.walk` to search for `.git` directories recursively. 38 | 39 | Explicit Local Repositories 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | Create a ProjectDirectory from a list of local repositories: 43 | 44 | .. code-block:: python 45 | 46 | from gitpandas import ProjectDirectory 47 | project = ProjectDirectory( 48 | working_dir=['/path/to/repo1/', '/path/to/repo2/'], 49 | ignore_repos=['repo_to_ignore'], 50 | verbose=True, 51 | default_branch='main' # Optional, will auto-detect if not specified 52 | ) 53 | 54 | Explicit Remote Repositories 55 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | Create a ProjectDirectory from a list of remote repositories: 58 | 59 | .. code-block:: python 60 | 61 | from gitpandas import ProjectDirectory 62 | project = ProjectDirectory( 63 | working_dir=['git://github.com/user/repo1.git', 'git://github.com/user/repo2.git'], 64 | ignore_repos=['repo_to_ignore'], 65 | verbose=True, 66 | default_branch='main' # Optional, will auto-detect if not specified 67 | ) 68 | 69 | Available Methods 70 | ----------------- 71 | 72 | Core Analysis 73 | ~~~~~~~~~~~~~ 74 | 75 | ..
code-block:: python 76 | 77 | # Commit history across repositories 78 | project.commit_history( 79 | branch=None, # Branch to analyze 80 | limit=None, # Maximum number of commits 81 | days=None, # Limit to last N days 82 | ignore_globs=None, # Files to ignore 83 | include_globs=None # Files to include 84 | ) 85 | 86 | # File change history across repositories 87 | project.file_change_history( 88 | branch=None, 89 | limit=None, 90 | days=None, 91 | ignore_globs=None, 92 | include_globs=None 93 | ) 94 | 95 | # Blame analysis across repositories 96 | project.blame( 97 | rev="HEAD", # Revision to analyze 98 | committer=True, # Group by committer (False for author) 99 | by="repository", # Group by 'repository' or 'file' 100 | ignore_globs=None, 101 | include_globs=None 102 | ) 103 | 104 | # Bus factor analysis across repositories 105 | project.bus_factor( 106 | by="repository", # How to group results ('projectd', 'repository', or 'file') 107 | ignore_globs=None, 108 | include_globs=None 109 | ) 110 | 111 | Common Parameters 112 | ----------------- 113 | 114 | Most analysis methods support these filtering parameters: 115 | 116 | * **branch**: Branch to analyze (defaults to repository's default branch) 117 | * **limit**: Maximum number of commits to analyze 118 | * **days**: Limit analysis to last N days 119 | * **ignore_globs**: List of glob patterns for files to ignore 120 | * **include_globs**: List of glob patterns for files to include 121 | * **by**: How to group results (usually 'repository' or 'file') 122 | 123 | API Reference 124 | ------------- 125 | 126 | .. currentmodule:: gitpandas.project 127 | 128 | .. autoclass:: ProjectDirectory 129 | :members: 130 | :undoc-members: 131 | :show-inheritance: 132 | :inherited-members: 133 | :special-members: __init__, __str__, __repr__ 134 | 135 | .. rubric:: Methods -------------------------------------------------------------------------------- /docs/source/repository.rst: -------------------------------------------------------------------------------- 1 | Repository 2 | ========== 3 | 4 | The Repository class provides an interface for analyzing a single Git repository. It can be created from either a local or remote repository. 5 | 6 | Overview 7 | -------- 8 | 9 | The Repository class offers methods for: 10 | 11 | * Commit history analysis with filtering options 12 | * File change tracking and blame information 13 | * Branch existence checking and repository status 14 | * Bus factor calculation and repository metrics 15 | * Punchcard statistics generation 16 | 17 | Creating a Repository 18 | --------------------- 19 | 20 | You can create a Repository object in two ways: 21 | 22 | Local Repository 23 | ~~~~~~~~~~~~~~~~ 24 | 25 | Create a Repository from a local Git repository: 26 | 27 | .. code-block:: python 28 | 29 | from gitpandas import Repository 30 | repo = Repository( 31 | working_dir='/path/to/repo/', 32 | verbose=True, 33 | default_branch='main' # Optional, will auto-detect if not specified 34 | ) 35 | 36 | The directory must contain a `.git` directory. Subdirectories are not searched. 37 | 38 | Remote Repository 39 | ~~~~~~~~~~~~~~~~~ 40 | 41 | Create a Repository from a remote Git repository: 42 | 43 | .. code-block:: python 44 | 45 | from gitpandas import Repository 46 | repo = Repository( 47 | working_dir='git://github.com/user/repo.git', 48 | verbose=True, 49 | default_branch='main' # Optional, will auto-detect if not specified 50 | ) 51 | 52 | The repository will be cloned locally into a temporary directory.
This can be slow for large repositories. 53 | 54 | Available Methods 55 | ----------------- 56 | 57 | Core Analysis 58 | ~~~~~~~~~~~~~ 59 | 60 | .. code-block:: python 61 | 62 | # Commit history analysis 63 | repo.commit_history( 64 | branch=None, # Branch to analyze 65 | limit=None, # Maximum number of commits 66 | days=None, # Limit to last N days 67 | ignore_globs=None, # Files to ignore 68 | include_globs=None # Files to include 69 | ) 70 | 71 | # File change history 72 | repo.file_change_history( 73 | branch=None, 74 | limit=None, 75 | days=None, 76 | ignore_globs=None, 77 | include_globs=None 78 | ) 79 | 80 | # Blame analysis 81 | repo.blame( 82 | rev="HEAD", # Revision to analyze 83 | committer=True, # Group by committer (False for author) 84 | by="repository", # Group by 'repository' or 'file' 85 | ignore_globs=None, 86 | include_globs=None 87 | ) 88 | 89 | # Bus factor analysis 90 | repo.bus_factor( 91 | by="repository", # How to group results ('repository' or 'file') 92 | ignore_globs=None, 93 | include_globs=None 94 | ) 95 | 96 | # Commit pattern analysis 97 | repo.punchcard( 98 | branch=None, 99 | limit=None, 100 | days=None, 101 | by=None, # Additional grouping 102 | normalize=None, # Normalize values 103 | ignore_globs=None, 104 | include_globs=None 105 | ) 106 | 107 | Repository Information 108 | ~~~~~~~~~~~~~~~~~~~~~~ 109 | 110 | .. code-block:: python 111 | 112 | # List files in repository 113 | repo.list_files(rev="HEAD") 114 | 115 | # Check branch existence 116 | repo.has_branch(branch) 117 | 118 | # Check if repository is bare 119 | repo.is_bare() 120 | 121 | # Check for coverage information 122 | repo.has_coverage() 123 | repo.coverage() 124 | 125 | # Get specific commit content 126 | repo.get_commit_content( 127 | rev, # Revision to analyze 128 | ignore_globs=None, 129 | include_globs=None 130 | ) 131 | 132 | Common Parameters 133 | ----------------- 134 | 135 | Most analysis methods support these filtering parameters: 136 | 137 | * **branch**: Branch to analyze (defaults to repository's default branch) 138 | * **limit**: Maximum number of commits to analyze 139 | * **days**: Limit analysis to last N days 140 | * **ignore_globs**: List of glob patterns for files to ignore 141 | * **include_globs**: List of glob patterns for files to include 142 | * **by**: How to group results (usually 'repository' or 'file') 143 | 144 | API Reference 145 | ------------- 146 | 147 | .. currentmodule:: gitpandas.repository 148 | 149 | .. autoclass:: Repository 150 | :members: 151 | :undoc-members: 152 | :show-inheritance: 153 | :inherited-members: 154 | :special-members: __init__, __str__, __repr__ 155 | 156 | ..
autoclass:: GitFlowRepository 157 | :members: 158 | :undoc-members: 159 | :show-inheritance: 160 | :inherited-members: -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v2.5.0 2 | ====== 3 | 4 | ## New Features 5 | 6 | ### Remote Operations & Cache Warming 7 | * **NEW**: `Repository.safe_fetch_remote()` - Safely fetch changes from remote repositories without modifying working directory 8 | - Read-only operation with comprehensive error handling 9 | - Support for dry-run preview and remote validation 10 | - Configurable remote names and pruning options 11 | * **NEW**: `Repository.warm_cache()` - Pre-populate repository cache for improved performance 12 | - Configurable method selection with intelligent parameter handling 13 | - Performance metrics and cache entry tracking 14 | - Significant performance improvements (1.5-10x speedup demonstrated) 15 | * **NEW**: `ProjectDirectory.bulk_fetch_and_warm()` - Efficiently process multiple repositories 16 | - Parallel processing support when joblib is available 17 | - Error isolation (failures in one repo don't affect others) 18 | - Comprehensive summary statistics and progress tracking 19 | 20 | ### Enhanced Caching System 21 | * **NEW**: `CacheEntry` class with metadata tracking (timestamps, age calculation) 22 | * **ENHANCED**: Thread-safe cache operations with proper locking mechanisms 23 | * **ENHANCED**: Cache key consistency improvements using `||` delimiter format 24 | * **ENHANCED**: Cache timestamp and metadata access methods (`get_cache_info()`, `list_cached_keys()`) 25 | 26 | ### Documentation & Examples 27 | * **NEW**: Comprehensive remote operations documentation (`docs/source/remote_operations.rst`) 28 | * **NEW**: Cache warming and remote fetch example (`examples/remote_fetch_and_cache_warming.py`) 29 | * **NEW**: Cache timestamp usage example (`examples/cache_timestamps.py`) 30 | * **NEW**: Release analytics example (`examples/release_analytics.py`) 31 | 32 | ## Testing & Quality 33 | * **NEW**: 38 comprehensive tests for remote operations and cache warming 34 | * **NEW**: Thread safety tests for cache operations 35 | * **NEW**: Edge case and error handling test coverage 36 | * **IMPROVED**: Overall test coverage and reliability 37 | * **FIXED**: Various minor bugs and future warnings 38 | 39 | ## Backward Compatibility 40 | * All new features are fully backward compatible 41 | * No breaking changes to existing APIs 42 | * Existing cache backends work seamlessly with new features 43 | 44 | v2.4.0 45 | ====== 46 | 47 | * Significant caching bugfixes and updates 48 | * Added a DiskCache that persists across runs 49 | * Added release analytics 50 | 51 | v2.3.0 52 | ====== 53 | 54 | * Updated coverage file parsing to use coverage.py API instead of direct file parsing 55 | * Added coverage>=5.0.0 as a core dependency 56 | * Added a basic MCP server 57 | * Added methods to `Repository` for getting files in repo, getting content of a file, and getting diffs of a revision 58 | 59 | 60 | v2.2.1 61 | ====== 62 | 63 | * Docs CI bugfix 64 | 65 | v2.2.0 66 | ====== 67 | 68 | * Support for default branch setting instead of assuming master; inferred if not passed 69 | * Better handling of ignore repos in project directory setup 70 | * Added a branch exists helper in repository 71 | * Docs corrections 72 | 73 | v2.1.0 74 | ====== 75 | 76 | * Improved test suite 77 | * Many bugfixes 78 | * Updates for pandas v2 79 |
80 | v2.0.0 81 | ====== 82 | 83 | * Fully transitioned to ignore_globs and include_globs style syntax 84 | * Parallelized cumulative blame support with joblib threading backend 85 | * Added threading parallelism to many project directory functions. 86 | * Added a caching module for optional redis or memory backed caching of certain result sets 87 | 88 | v1.2.0 89 | ====== 90 | 91 | * Added ignore_globs option alongside all methods with ignore_dir and extensions; this will be the only method for filtering files in v2.0.0 92 | 93 | v1.1.0 94 | ====== 95 | 96 | * _repo_name changed to repo_name in project directories (old method left with deprecation warning) 97 | * repo_name property added to repositories 98 | 99 | v1.0.3 100 | ====== 101 | 102 | * Support for estimating time spent developing on projects. 103 | 104 | v1.0.2 105 | ====== 106 | 107 | * bugfix in ignore_dir option for root level directories 108 | 109 | v1.0.1 110 | ====== 111 | 112 | * file details function 113 | 114 | v1.0.0 115 | ====== 116 | 117 | * Stable API 118 | * Punchcard dataframe added 119 | * Plotting helpers added to library under utilities module 120 | * Added github.com profile object 121 | 122 | v0.0.6 123 | ====== 124 | 125 | * Added file owner utility 126 | * Added lifelines example 127 | * Added rev to file change history table 128 | * Added file-wise blame using by='file' parameter 129 | * Bus Factor returns a dataframe 130 | * Now supporting python 2.7+ and 3.3+ 131 | 132 | v0.0.5 133 | ====== 134 | 135 | * Added file change rates table with risk metrics 136 | * Added basic functionality with coverage files 137 | * Added limited time window based dataset functionality 138 | * Expanded docs 139 | 140 | v0.0.4 141 | ====== 142 | 143 | * Added cumulative blame and revision history 144 | 145 | v0.0.3 146 | ====== 147 | 148 | * Added approximate bus factor analysis 149 | 150 | v0.0.2 151 | ====== 152 | 153 | * Added blame 154 | 155 | v0.0.1 156 | ====== 157 | 158 | * Initial release, basic interface to commit history and descriptors -------------------------------------------------------------------------------- /tests/test_Repository/test_properties.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pytest 3 | from pandas import DataFrame 4 | 5 | from gitpandas import Repository 6 | 7 | __author__ = "willmcginnis" 8 | 9 | 10 | @pytest.fixture 11 | def remote_repo(): 12 | """Fixture for a remote repository.""" 13 | repo = Repository(working_dir="https://github.com/wdm0006/git-pandas.git", verbose=True) 14 | yield repo 15 | repo.__del__() 16 | 17 | 18 | @pytest.fixture 19 | def local_repo(tmp_path, default_branch): 20 | """Create a local git repository for testing.""" 21 | repo_path = tmp_path / "repository1" 22 | repo_path.mkdir() 23 | repo = git.Repo.init(repo_path) 24 | 25 | # Configure git user 26 | repo.config_writer().set_value("user", "name", "Test User").release() 27 | repo.config_writer().set_value("user", "email", "test@example.com").release() 28 | 29 | # Create and checkout default branch 30 | repo.git.checkout("-b", default_branch) 31 | 32 | # Create initial commit 33 | (repo_path / "README.md").write_text("# Test Repository") 34 | repo.index.add(["README.md"]) 35 | repo.index.commit("Initial commit") 36 | 37 | # Create test files 38 | py_content = """import os 39 | import sys 40 | import json 41 | def main(): 42 | print('Hello, World!') 43 | return True 44 | def helper(): 45 | return True 46 | if __name__ == '__main__': 47 | main()""" 48 |
(repo_path / "test.py").write_text(py_content) 49 | (repo_path / "test.js").write_text("console.log('Hello, World!');") 50 | repo.index.add(["test.py", "test.js"]) 51 | repo.index.commit("Add test files") 52 | 53 | return repo_path 54 | 55 | 56 | # Remote repository tests 57 | class TestRemoteProperties: 58 | @pytest.mark.remote 59 | def test_repo_name(self, remote_repo): 60 | assert remote_repo.repo_name == "git-pandas" 61 | 62 | @pytest.mark.remote 63 | def test_branches(self, remote_repo, default_branch): 64 | branches = list(remote_repo.branches()["branch"].values) 65 | # Check for the default branch (could be master or main) 66 | assert default_branch in branches or "master" in branches or "main" in branches 67 | assert "gh-pages" in branches 68 | 69 | @pytest.mark.remote 70 | def test_tags(self, remote_repo): 71 | tags = list(remote_repo.tags()["tag"].values) 72 | assert "0.0.1" in tags 73 | assert "0.0.2" in tags 74 | 75 | @pytest.mark.remote 76 | def test_is_bare(self, remote_repo): 77 | assert not remote_repo.is_bare() 78 | 79 | 80 | # Local repository tests 81 | class TestLocalProperties: 82 | def test_repo_name(self, local_repo, default_branch): 83 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 84 | assert repo.repo_name == "repository1" 85 | 86 | def test_branches(self, local_repo, default_branch): 87 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 88 | branches = list(repo.branches()["branch"].values) 89 | assert default_branch in branches 90 | 91 | def test_tags(self, local_repo, default_branch): 92 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 93 | tags = repo.tags() 94 | assert len(tags) == 0 95 | 96 | def test_is_bare(self, local_repo, default_branch): 97 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 98 | assert not repo.is_bare() 99 | 100 | def test_commit_history(self, local_repo, default_branch): 101 | """Test commit history retrieval.""" 102 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 103 | history = repo.commit_history(branch=default_branch) 104 | assert isinstance(history, DataFrame) 105 | assert "repository" in history.columns 106 | assert len(history) > 0 107 | 108 | def test_file_change_history(self, local_repo, default_branch): 109 | """Test file change history retrieval.""" 110 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 111 | history = repo.file_change_history(branch=default_branch) 112 | assert isinstance(history, DataFrame) 113 | assert "repository" in history.columns 114 | assert len(history) > 0 115 | 116 | def test_file_change_rates(self, local_repo, default_branch): 117 | """Test file change rates calculation.""" 118 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 119 | rates = repo.file_change_rates(branch=default_branch) 120 | assert isinstance(rates, DataFrame) 121 | assert "repository" in rates.columns 122 | assert len(rates) > 0 123 | 124 | def test_has_coverage(self, local_repo, default_branch): 125 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 126 | # We know this repo doesn't have coverage 127 | assert not repo.has_coverage() 128 | 129 | def test_bus_factor(self, local_repo, default_branch): 130 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 131 | # We know this repo only has one committer 132 | assert repo.bus_factor(by="repository")["bus 
factor"].values[0] == 1 133 | 134 | def test_blame(self, local_repo, default_branch): 135 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 136 | blame = repo.blame(ignore_globs=["*.[!p][!y]"]) 137 | assert blame["loc"].sum() == 10 138 | assert blame.shape[0] == 1 139 | 140 | def test_cumulative_blame(self, local_repo, default_branch): 141 | """Test cumulative blame calculation.""" 142 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 143 | blame = repo.cumulative_blame(branch=default_branch) 144 | assert isinstance(blame, DataFrame) 145 | assert len(blame) > 0 146 | 147 | def test_revs(self, local_repo, default_branch): 148 | """Test revision history retrieval.""" 149 | repo = Repository(working_dir=str(local_repo), default_branch=default_branch) 150 | revs = repo.revs(branch=default_branch) 151 | assert isinstance(revs, DataFrame) 152 | assert "repository" in revs.columns 153 | assert len(revs) > 0 154 | -------------------------------------------------------------------------------- /docs/source/usecases.rst: -------------------------------------------------------------------------------- 1 | Use Cases and Examples 2 | ====================== 3 | 4 | Git-Pandas provides a powerful interface for analyzing Git repositories using pandas DataFrames. This guide demonstrates common use cases and provides practical examples. 5 | 6 | Basic Repository Analysis 7 | ------------------------- 8 | 9 | Repository Attributes 10 | ~~~~~~~~~~~~~~~~~~~~~ 11 | 12 | Get basic information about a repository: 13 | 14 | .. code-block:: python 15 | 16 | from gitpandas import Repository 17 | repo = Repository('/path/to/repo') 18 | 19 | # Get repository name 20 | print(repo.repo_name) 21 | 22 | # Check if repository is bare 23 | print(repo.is_bare()) 24 | 25 | # Get all tags 26 | print(repo.tags()) 27 | 28 | # Get all branches 29 | print(repo.branches()) 30 | 31 | # Get all revisions 32 | print(repo.revs()) 33 | 34 | # Get blame information 35 | print(repo.blame(include_globs=['*.py'])) 36 | 37 | Commit History Analysis 38 | ~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | Analyze commit patterns and history: 41 | 42 | .. code-block:: python 43 | 44 | # Get commit history 45 | commits_df = repo.commit_history() 46 | 47 | # Get file change history 48 | changes_df = repo.file_change_history() 49 | 50 | # Filter by file extension 51 | python_changes = repo.file_change_history(include_globs=['*.py']) 52 | 53 | # Filter by directory 54 | src_changes = repo.file_change_history(include_globs=['src/*']) 55 | 56 | # Get commits in tags 57 | tag_commits = repo.commits_in_tags() 58 | 59 | Project-Level Analysis 60 | ---------------------- 61 | 62 | Multiple Repository Analysis 63 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 64 | 65 | Analyze multiple repositories simultaneously: 66 | 67 | .. code-block:: python 68 | 69 | from gitpandas import ProjectDirectory 70 | 71 | # Create project from multiple repositories 72 | project = ProjectDirectory([ 73 | 'git://github.com/user/repo1.git', 74 | 'git://github.com/user/repo2.git' 75 | ]) 76 | 77 | # Get repository information 78 | print(project.repo_information()) 79 | 80 | # Calculate bus factor 81 | print(project.bus_factor()) 82 | 83 | # Get file change history 84 | print(project.file_change_history()) 85 | 86 | # Get blame information 87 | print(project.blame()) 88 | 89 | Advanced Analysis 90 | ----------------- 91 | 92 | Cumulative Blame Analysis 93 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 94 | 95 | Track code ownership over time: 96 | 97 | .. 
code-block:: python 98 | 99 | # Get cumulative blame 100 | blame_df = repo.cumulative_blame() 101 | 102 | # Plot cumulative blame using pandas plotting 103 | import matplotlib.pyplot as plt 104 | blame_df.plot(x='date', y='loc', title='Cumulative Blame Over Time') 105 | plt.show() 106 | 107 | Bus Factor Analysis 108 | ~~~~~~~~~~~~~~~~~~~ 109 | 110 | Analyze project sustainability: 111 | 112 | .. code-block:: python 113 | 114 | # Calculate bus factor for repository 115 | bus_factor = repo.bus_factor() 116 | 117 | # Calculate file-wise bus factor (new in v2.5.0) 118 | file_bus_factor = repo.bus_factor(by='file') 119 | 120 | # Get detailed blame information 121 | blame_df = repo.blame(by='file') # Get file-level blame details 122 | 123 | # Analyze ownership patterns 124 | ownership_patterns = repo.blame(committer=True, by='repository') 125 | 126 | Performance Optimization 127 | ------------------------ 128 | 129 | Using Caching 130 | ~~~~~~~~~~~~~ 131 | 132 | Optimize performance with caching: 133 | 134 | .. code-block:: python 135 | 136 | from gitpandas import Repository 137 | from gitpandas.cache import EphemeralCache, DiskCache, RedisDFCache 138 | 139 | # Use in-memory caching 140 | cache = EphemeralCache(max_keys=1000) 141 | repo = Repository('/path/to/repo', cache_backend=cache) 142 | 143 | # Use persistent disk caching (new in v2.5.0) 144 | disk_cache = DiskCache('/tmp/gitpandas_cache.gz', max_keys=500) 145 | repo = Repository('/path/to/repo', cache_backend=disk_cache) 146 | 147 | # Or use Redis for distributed caching 148 | redis_cache = RedisDFCache( 149 | host='localhost', 150 | port=6379, 151 | db=12, 152 | ttl=3600 # Cache entries expire after 1 hour 153 | ) 154 | repo = Repository('/path/to/repo', cache_backend=redis_cache) 155 | 156 | Cache Management (New in v2.5.0) 157 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 158 | 159 | Manage cache performance and contents: 160 | 161 | .. code-block:: python 162 | 163 | # Get cache statistics 164 | stats = repo.get_cache_stats() 165 | print(f"Cache usage: {stats['global_cache_stats']['cache_usage_percent']:.1f}%") 166 | 167 | # Invalidate specific cache entries 168 | repo.invalidate_cache(keys=['commit_history']) 169 | 170 | # Clear all cache for this repository 171 | repo.invalidate_cache() 172 | 173 | # Warm cache for better performance 174 | result = repo.warm_cache(methods=['commit_history', 'blame'], limit=100) 175 | print(f"Created {result['cache_entries_created']} cache entries") 176 | 177 | Visualization Examples 178 | ---------------------- 179 | 180 | Commit Analysis 181 | ~~~~~~~~~~~~~~~ 182 | 183 | Visualize commit patterns: 184 | 185 | .. code-block:: python 186 | 187 | # Get commit history 188 | commit_df = repo.commit_history() 189 | 190 | # Plot commits over time using pandas 191 | commit_df.resample('D').size().plot( 192 | kind='bar', 193 | title='Commits per Day' 194 | ) 195 | plt.show() 196 | 197 | File Change Analysis 198 | ~~~~~~~~~~~~~~~~~~~~ 199 | 200 | Visualize file changes: 201 | 202 | .. 
code-block:: python 203 | 204 | # Get file change history 205 | changes_df = repo.file_change_history() 206 | 207 | # Plot changes over time using pandas 208 | changes_df.groupby('filename')['insertions'].sum().plot( 209 | kind='bar', 210 | title='Lines Added by File' 211 | ) 212 | plt.show() 213 | 214 | Best Practices 215 | -------------- 216 | 217 | * Use caching for expensive operations like blame analysis 218 | * Filter data early using include_globs/ignore_globs 219 | * Leverage pandas operations for analysis 220 | * Consider memory usage with large repositories 221 | * Use appropriate branch names (main/master) 222 | * Handle repository cleanup properly when using remote repositories 223 | 224 | For more examples and detailed API documentation, see the :doc:`repository` and :doc:`project` pages. 225 | -------------------------------------------------------------------------------- /tests/test_cache_key_consistency.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | import pandas as pd 5 | import pytest 6 | 7 | from gitpandas.cache import DiskCache, multicache 8 | 9 | 10 | class RepositoryMock: 11 | """Mock Repository class to test cache key generation consistency""" 12 | 13 | def __init__(self, working_dir="/mock/repo/path", cache_backend=None): 14 | self.working_dir = working_dir 15 | self.repo_name = working_dir # Simulate how Repository sets repo_name 16 | self.cache_backend = cache_backend 17 | self.execution_count = 0 18 | 19 | @multicache(key_prefix="list_files", key_list=["directory", "filter_regex"]) 20 | def list_files(self, directory=None, filter_regex=None, force_refresh=False): 21 | """Mock list_files method that demonstrates the caching behavior""" 22 | self.execution_count += 1 23 | return pd.DataFrame( 24 | { 25 | "file": [f"file{i}.txt" for i in range(3)], 26 | "directory": [directory] * 3, 27 | "call_number": [self.execution_count] * 3, 28 | } 29 | ) 30 | 31 | @multicache(key_prefix="complex_method", key_list=["param1", "param2", "param3"]) 32 | def complex_method(self, param1=None, param2=None, param3=None, force_refresh=False): 33 | """Method with multiple parameters to test key generation with many parameters""" 34 | self.execution_count += 1 35 | return pd.DataFrame( 36 | { 37 | "result": [f"result{i}" for i in range(2)], 38 | "param_values": [f"{param1}_{param2}_{param3}"] * 2, 39 | "call_number": [self.execution_count] * 2, 40 | } 41 | ) 42 | 43 | 44 | class TestCacheKeyConsistency: 45 | """Tests specifically focused on the cache key consistency issue""" 46 | 47 | @pytest.fixture 48 | def temp_cache_path(self): 49 | """Create temporary file path for cache""" 50 | fd, path = tempfile.mkstemp(suffix=".gz") 51 | os.close(fd) 52 | yield path 53 | if os.path.exists(path): 54 | os.unlink(path) 55 | 56 | def test_consistent_cache_keys(self, temp_cache_path): 57 | """Test that cache keys are consistent between method calls""" 58 | cache = DiskCache(filepath=temp_cache_path) 59 | repo = RepositoryMock(cache_backend=cache) 60 | 61 | # Capture cache keys generated 62 | original_set = cache.set 63 | captured_keys = [] 64 | 65 | def mock_set(k, v): 66 | captured_keys.append(k) 67 | return original_set(k, v) 68 | 69 | cache.set = mock_set 70 | 71 | # First call 72 | repo.list_files(directory="src", filter_regex="*.py") 73 | first_key = captured_keys[0] 74 | 75 | # Clear captured keys 76 | captured_keys.clear() 77 | 78 | # Second call with identical parameters 79 | repo.list_files(directory="src", 
filter_regex="*.py") 80 | 81 | # No key should be captured on second call (cache hit) 82 | assert len(captured_keys) == 0 83 | 84 | # Force refresh should use the same key 85 | repo.list_files(directory="src", filter_regex="*.py", force_refresh=True) 86 | assert len(captured_keys) == 1 87 | assert captured_keys[0] == first_key 88 | 89 | def test_fix_resolves_reported_issue(self, temp_cache_path): 90 | """Test specifically addressing the reported issue""" 91 | cache = DiskCache(filepath=temp_cache_path) 92 | repo = RepositoryMock(working_dir="/absolute/path/to/repo", cache_backend=cache) 93 | 94 | # First call 95 | result1 = repo.list_files() 96 | assert repo.execution_count == 1 97 | 98 | # Second call to the same method (should use cache) 99 | result2 = repo.list_files() 100 | assert repo.execution_count == 1 # Should NOT increment 101 | 102 | # Results should match 103 | pd.testing.assert_frame_equal(result1, result2) 104 | 105 | def test_varied_path_formats(self, temp_cache_path): 106 | """Test with different path formats to ensure key consistency""" 107 | cache = DiskCache(filepath=temp_cache_path) 108 | 109 | # Different repo path formats 110 | repo1 = RepositoryMock(working_dir="/path/to/repo", cache_backend=cache) 111 | repo2 = RepositoryMock(working_dir="/path/to/repo/", cache_backend=cache) # Extra slash 112 | 113 | # Capture all keys set in the cache 114 | original_set = cache.set 115 | captured_keys = [] 116 | 117 | def mock_set(k, v): 118 | captured_keys.append(k) 119 | return original_set(k, v) 120 | 121 | cache.set = mock_set 122 | 123 | # Call method on first repo 124 | repo1.list_files(directory="src") 125 | assert repo1.execution_count == 1 126 | key1 = captured_keys[0] 127 | 128 | # Clear keys 129 | captured_keys.clear() 130 | 131 | # Call on second repo with same parameters - should generate a different key 132 | # due to different repo_name ("/path/to/repo" vs "/path/to/repo/") 133 | repo2.list_files(directory="src") 134 | assert repo2.execution_count == 1 # Should increment for repo2 135 | key2 = captured_keys[0] 136 | 137 | # Keys should be different because repo_name is different 138 | assert key1 != key2 139 | assert "||/path/to/repo||" in key1 140 | assert "||/path/to/repo/||" in key2 141 | 142 | def test_complex_key_generation(self, temp_cache_path): 143 | """Test key generation with complex parameters""" 144 | cache = DiskCache(filepath=temp_cache_path) 145 | repo = RepositoryMock(cache_backend=cache) 146 | 147 | # Capture keys 148 | original_set = cache.set 149 | captured_keys = [] 150 | 151 | def mock_set(k, v): 152 | captured_keys.append(k) 153 | return original_set(k, v) 154 | 155 | cache.set = mock_set 156 | 157 | # Call with complex parameters 158 | repo.complex_method(param1="value1", param2="value2", param3="value3") 159 | 160 | # Check key format 161 | key = captured_keys[0] 162 | assert key.startswith("complex_method||") 163 | assert "value1_" in key 164 | assert "value2_" in key 165 | assert "value3" in key 166 | 167 | # Call again with different order of parameters in the call 168 | # Python should normalize kwargs, so the key should be the same 169 | captured_keys.clear() 170 | repo.complex_method(param3="value3", param1="value1", param2="value2", force_refresh=True) 171 | 172 | # Key should be the same despite different parameter order 173 | assert captured_keys[0] == key 174 | -------------------------------------------------------------------------------- /examples/release_analytics.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Example demonstrating release analytics features. 3 | 4 | This example shows how to use gitpandas to analyze changes between release tags. 5 | """ 6 | 7 | import pandas as pd 8 | 9 | from gitpandas import Repository 10 | 11 | # --- Instantiate Repository --- 12 | # Using the gitpandas repository URI as an example 13 | # Replace with your repository URI or local path 14 | repo_uri = "https://github.com/wdm0006/git-pandas.git" 15 | print(f"Attempting to instantiate Repository for: {repo_uri}") 16 | 17 | try: 18 | # Instantiate the Repository object. 19 | # For remote repositories, gitpandas will clone it to a temporary directory. 20 | # verbose=True can be helpful for debugging, but is optional here. 21 | repo = Repository(working_dir=repo_uri, verbose=False) 22 | print(f"Successfully instantiated Repository for: {repo.repo_name}") 23 | print(f"Cloned to temporary directory: {repo.git_dir}") 24 | except Exception as e: 25 | print(f"Error instantiating repository: {e}") 26 | repo = None # Ensure repo is None if instantiation fails 27 | 28 | # --- Call release_tag_summary --- 29 | if repo: 30 | print("\n--- Release Tag Summary ---") 31 | try: 32 | # This method analyzes the repository's tags to provide a summary of 33 | # changes between each tagged release. 34 | # It looks at the differences from the previous tag to the current one. 35 | # Output includes diff statistics (insertions, deletions), 36 | # committers, authors, and files changed during that period. 37 | # You can use tag_glob to filter for specific tag patterns (e.g., 'v*.*'). 38 | release_summary_df = repo.release_tag_summary() 39 | 40 | if not release_summary_df.empty: 41 | print("Release summary retrieved successfully:") 42 | # Display the DataFrame. Pandas default display might be wide, 43 | # but for an example, direct print is usually fine. 44 | # For better display in production, consider setting pandas display options 45 | print(release_summary_df) 46 | 47 | # Example of how to access specific information: 48 | if "tag" in release_summary_df.columns and len(release_summary_df) > 1: 49 | # Show files changed between the first two listed tags (if available) 50 | # Note: The first tag in the summary won't have "previous tag" data. 51 | second_tag_entry = release_summary_df.iloc[1] # Second tag in the sorted list 52 | print(f"\nExample: Files changed for tag '{second_tag_entry['tag']}' (since previous tag):") 53 | if isinstance(second_tag_entry["files"], list) and second_tag_entry["files"]: 54 | for file_path in second_tag_entry["files"]: 55 | print(f" - {file_path}") 56 | else: 57 | print(" No files listed or files column is not a list.") 58 | 59 | elif release_summary_df is not None: # Empty DataFrame 60 | print("No release summary data returned. The repository might not have tags, or no tags match the glob.") 61 | else: # None was returned, indicating an issue 62 | print("Failed to retrieve release summary (method returned None).") 63 | 64 | except Exception as e: 65 | print(f"Error calling release_tag_summary: {e}") 66 | 67 | # --- Optional: Demonstrate get_commit_content --- 68 | # The release_tag_summary gives you information about *what* changed (files, stats). 69 | # If you need to see the *actual content* of a specific commit that was part of a 70 | # release (perhaps a commit listed by commits_in_tags, or the commit SHA 71 | # directly associated with a tag), you can use get_commit_content. 
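# As a hypothetical illustration of the call shape (the SHA here is a
# placeholder; the code below looks up a real SHA from the release summary):
#
#     content_df = repo.get_commit_content(rev="<commit sha>")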
72 | 73 | print("\n--- Optional: Get Content of a Specific Commit ---") 74 | # Note: For a real scenario, you'd get a relevant commit SHA from your analysis, 75 | # for example, from the 'commit_sha' column in release_summary_df or from repo.commits_in_tags(). 76 | # As this is a brief example, we'll try to pick one from the summary if possible, 77 | # otherwise, we'll use a placeholder. 78 | target_commit_sha = None 79 | if "release_summary_df" in locals() and not release_summary_df.empty and "commit_sha" in release_summary_df.columns: 80 | # Let's try to get the commit SHA of the first tag listed (if any) 81 | # This commit is what the tag points to. 82 | potential_sha = release_summary_df["commit_sha"].iloc[0] 83 | if pd.notna(potential_sha): # Check if the SHA is not NaN or None 84 | target_commit_sha = potential_sha 85 | print(f"Attempting to get content for commit SHA (from first tag's commit_sha): {target_commit_sha}") 86 | else: 87 | print("Could not get a valid commit SHA from the release_summary_df's first entry.") 88 | 89 | if not target_commit_sha: 90 | target_commit_sha = "PLACEHOLDER_COMMIT_SHA" # Replace with an actual commit SHA from the repo 91 | print(f"Using placeholder commit SHA: {target_commit_sha}. Replace with a real one for actual output.") 92 | 93 | if target_commit_sha != "PLACEHOLDER_COMMIT_SHA": 94 | try: 95 | # The 'rev' parameter takes the commit SHA. 96 | commit_content_df = repo.get_commit_content(rev=target_commit_sha) 97 | 98 | if commit_content_df is not None and not commit_content_df.empty: 99 | print(f"Content changes for commit {target_commit_sha} (showing first 5 lines):") 100 | # Displaying only a part of the DataFrame for brevity. 101 | # Columns typically include: 'file_path', 'change_type', 'diff', 'old_blob_sha', 'new_blob_sha' 102 | print(commit_content_df.head()) 103 | elif commit_content_df is not None: # Empty DataFrame 104 | print( 105 | f"No content changes (e.g. diffs) found for commit {target_commit_sha}. " 106 | f"This can be normal for merge commits with no textual changes, " 107 | f"or if the commit only modified tree structure." 108 | ) 109 | else: # None was returned 110 | print( 111 | f"Failed to get content for commit {target_commit_sha} (method returned None). " 112 | f"Could be an invalid SHA or repository issue." 113 | ) 114 | except Exception as e: 115 | print(f"Error calling get_commit_content for {target_commit_sha}: {e}") 116 | else: 117 | print("Skipping get_commit_content due to placeholder SHA or if no valid SHA was found.") 118 | 119 | else: 120 | print("\nSkipping release_tag_summary and get_commit_content because repository instantiation failed.") 121 | -------------------------------------------------------------------------------- /docs/AUDIT_REPORT.md: -------------------------------------------------------------------------------- 1 | # Git-Pandas Documentation Audit Report 2 | 3 | ## Executive Summary 4 | 5 | The documentation audit has been completed for git-pandas v2.5.0. Overall, the documentation is comprehensive and well-structured, but several inconsistencies and outdated information were identified and corrected. 6 | 7 | ## Issues Found and Corrected 8 | 9 | ### ✅ FIXED - Critical Issues 10 | 11 | 1. **Version Information Inconsistency** 12 | - **Issue**: conf.py showed version "2.2.1" while current version is "2.5.0" 13 | - **Fix**: Updated conf.py to reflect correct version "2.5.0" 14 | - **Files**: `docs/source/conf.py` 15 | 16 | 2. 
**Missing Performance Documentation** 17 | - **Issue**: performance.rst was created but not included in main table of contents 18 | - **Fix**: Added performance.rst to index.rst toctree 19 | - **Files**: `docs/source/index.rst` 20 | 21 | 3. **Incomplete Bus Factor Documentation** 22 | - **Issue**: Bus factor docs didn't mention file-wise analysis option 23 | - **Fix**: Updated repository.rst and project.rst to show 'file' option 24 | - **Files**: `docs/source/repository.rst`, `docs/source/project.rst` 25 | 26 | ### ✅ FIXED - Content Updates 27 | 28 | 4. **Cache System Documentation** 29 | - **Issue**: Overview didn't mention cache management features 30 | - **Fix**: Added cache management and statistics to feature list 31 | - **Files**: `docs/source/cache.rst` 32 | 33 | 5. **Use Cases Missing New Features** 34 | - **Issue**: No examples of file-wise bus factor or cache management 35 | - **Fix**: Added comprehensive examples for v2.5.0 features 36 | - **Files**: `docs/source/usecases.rst` 37 | 38 | 6. **Contributors Page Outdated** 39 | - **Issue**: Didn't reflect recently completed features 40 | - **Fix**: Added "Recently Completed (v2.5.0)" section 41 | - **Files**: `docs/source/contributors.rst` 42 | 43 | ## Documentation Quality Assessment 44 | 45 | ### ✅ EXCELLENT - Well Documented Areas 46 | 47 | 1. **Cache System** (`cache.rst`) 48 | - Comprehensive coverage of all cache backends 49 | - Clear examples for each cache type 50 | - Good coverage of cache management features 51 | - Proper API reference with autodoc 52 | 53 | 2. **Remote Operations** (`remote_operations.rst`) 54 | - Detailed documentation of safe_fetch_remote 55 | - Complete coverage of cache warming features 56 | - Good examples and error handling documentation 57 | - Comprehensive return value documentation 58 | 59 | 3. **Performance Guide** (`performance.rst`) 60 | - Thorough performance optimization strategies 61 | - Benchmark data and real-world examples 62 | - Best practices and anti-patterns 63 | - Memory management guidance 64 | 65 | ### ✅ GOOD - Generally Well Documented 66 | 67 | 4. **Repository API** (`repository.rst`) 68 | - Good coverage of core methods 69 | - Clear parameter documentation 70 | - Could benefit from more advanced examples 71 | 72 | 5. **Project Directory** (`project.rst`) 73 | - Adequate coverage of multi-repository features 74 | - Good examples of different initialization methods 75 | 76 | 6. **Use Cases** (`usecases.rst`) 77 | - Good practical examples 78 | - Now includes v2.5.0 features 79 | - Could use more visualization examples 80 | 81 | ### ✅ ADEQUATE - Basic Documentation 82 | 83 | 7. **Index Page** (`index.rst`) 84 | - Clear quick start examples 85 | - Good feature overview 86 | - Proper navigation structure 87 | 88 | 8. **Contributors Guide** (`contributors.rst`) 89 | - Standard contribution guidelines 90 | - Now reflects current development status 91 | 92 | ## Remaining Recommendations 93 | 94 | ### High Priority 95 | 96 | 1. **README.md Synchronization** 97 | - Update Python version requirements (currently claims 2.7+ support) 98 | - Add examples of new v2.5.0 features 99 | - Update installation instructions for optional dependencies 100 | 101 | 2. **API Documentation Verification** 102 | - Ensure all public methods have proper docstrings 103 | - Verify autodoc is picking up all new methods 104 | - Check that method signatures in docs match implementation 105 | 106 | ### Medium Priority 107 | 108 | 3. 
**Cross-Reference Verification** 109 | - Verify all internal links work correctly 110 | - Check that all referenced examples exist 111 | - Ensure consistent terminology across documents 112 | 113 | 4. **Example Code Testing** 114 | - Systematically test all code examples in documentation 115 | - Add automated testing for documentation examples 116 | - Ensure examples use realistic file paths and parameters 117 | 118 | ### Low Priority 119 | 120 | 5. **Enhancement Opportunities** 121 | - Add more visualization examples using matplotlib/seaborn 122 | - Include performance benchmarks in appropriate sections 123 | - Add troubleshooting section for common issues 124 | 125 | ## Testing Performed 126 | 127 | ### ✅ Verified Working 128 | - Basic imports work correctly 129 | - Cache management methods exist and are callable 130 | - New features are accessible through public APIs 131 | - Documentation structure builds correctly 132 | 133 | ### Manual Verification Needed 134 | - All code examples execute without errors 135 | - External links are valid and accessible 136 | - Cross-references resolve correctly 137 | 138 | ## Files Modified in This Audit 139 | 140 | 1. `docs/source/conf.py` - Version update to 2.5.0 141 | 2. `docs/source/index.rst` - Added performance.rst to toctree 142 | 3. `docs/source/repository.rst` - Updated bus_factor documentation 143 | 4. `docs/source/project.rst` - Updated bus_factor options 144 | 5. `docs/source/cache.rst` - Enhanced feature overview 145 | 6. `docs/source/usecases.rst` - Added v2.5.0 feature examples 146 | 7. `docs/source/contributors.rst` - Added recently completed features section 147 | 148 | ## Overall Assessment 149 | 150 | **Grade: B+ (Good with room for improvement)** 151 | 152 | The documentation is comprehensive and covers all major features well. The recent additions for v2.5.0 are well-documented, particularly the cache management and remote operations features. The main areas for improvement are: 153 | 154 | 1. Synchronizing README.md with current documentation 155 | 2. Ensuring all code examples are tested and working 156 | 3. Verifying cross-references and links 157 | 158 | The documentation successfully serves its purpose of helping users understand and use git-pandas effectively, with clear examples and comprehensive API coverage. 159 | 160 | ## Next Steps 161 | 162 | 1. **Immediate**: Update README.md to match documentation 163 | 2. **Short-term**: Test all documentation examples 164 | 3. **Medium-term**: Add automated testing for documentation examples 165 | 4. **Long-term**: Consider adding more advanced use case examples 166 | 167 | --- 168 | 169 | *Audit completed: January 2025* 170 | *Documentation version: 2.5.0* 171 | *Status: Ready for release with minor README updates needed* -------------------------------------------------------------------------------- /examples/cache_timestamps.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of accessing cache timestamp information. 3 | 4 | This example demonstrates how users can check when cache entries were populated 5 | without any changes to the Repository or ProjectDirectory API. 
6 | """ 7 | 8 | import os 9 | import time 10 | from datetime import datetime, timezone 11 | 12 | from definitions import GIT_PANDAS_DIR 13 | 14 | from gitpandas import Repository 15 | from gitpandas.cache import DiskCache, EphemeralCache 16 | 17 | 18 | def demonstrate_cache_timestamps(): 19 | """Demonstrate accessing cache timestamp information.""" 20 | print("Cache Timestamp Information Demo") 21 | print("=" * 40) 22 | 23 | # Create a repository with a cache backend 24 | cache = EphemeralCache(max_keys=100) 25 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 26 | 27 | print(f"Repository: {repo.repo_name}") 28 | print(f"Cache backend: {type(cache).__name__}") 29 | print() 30 | 31 | # Call some methods to populate the cache 32 | print("Populating cache with repository data...") 33 | 34 | print(" - Getting commit history...") 35 | repo.commit_history(limit=10) 36 | 37 | print(" - Getting file list...") 38 | repo.list_files() 39 | 40 | print(" - Getting blame information...") 41 | repo.blame() 42 | 43 | print(f"Cache now contains {len(cache._cache)} entries") 44 | print() 45 | 46 | # Show cache information 47 | print("Cache Contents and Timestamps:") 48 | print("-" * 40) 49 | 50 | cached_keys = cache.list_cached_keys() 51 | for entry in cached_keys: 52 | print(f"Key: {entry['key']}") 53 | print(f" Cached at: {entry['cached_at'].strftime('%Y-%m-%d %H:%M:%S UTC')}") 54 | print(f" Age: {entry['age_seconds']:.1f} seconds") 55 | print() 56 | 57 | # Wait a moment and call one method again 58 | print("Waiting 2 seconds and refreshing commit history...") 59 | time.sleep(2) 60 | 61 | # This should hit the cache 62 | repo.commit_history(limit=10) 63 | 64 | # This should create a new cache entry 65 | repo.commit_history(limit=20) 66 | 67 | print("\nUpdated cache contents:") 68 | print("-" * 40) 69 | 70 | cached_keys = cache.list_cached_keys() 71 | for entry in cached_keys: 72 | print(f"Key: {entry['key']}") 73 | print(f" Cached at: {entry['cached_at'].strftime('%Y-%m-%d %H:%M:%S UTC')}") 74 | print(f" Age: {entry['age_seconds']:.1f} seconds") 75 | print() 76 | 77 | # Demonstrate getting specific cache info 78 | print("Getting specific cache information:") 79 | print("-" * 40) 80 | 81 | # Find a commit_history cache key 82 | commit_keys = [k for k in cached_keys if "commit_history" in k["key"]] 83 | if commit_keys: 84 | key = commit_keys[0]["key"] 85 | info = cache.get_cache_info(key) 86 | if info: 87 | print(f"Cache info for key '{key}':") 88 | print(f" Cached at: {info['cached_at']}") 89 | print(f" Age: {info['age_minutes']:.2f} minutes") 90 | print(f" Age: {info['age_hours']:.4f} hours") 91 | 92 | 93 | def demonstrate_disk_cache_persistence(): 94 | """Demonstrate cache persistence with DiskCache.""" 95 | print("\n" + "=" * 50) 96 | print("Disk Cache Persistence Demo") 97 | print("=" * 50) 98 | 99 | cache_file = "/tmp/gitpandas_demo_cache.gz" 100 | 101 | # Clean up any existing cache file 102 | if os.path.exists(cache_file): 103 | os.remove(cache_file) 104 | 105 | print("Creating repository with DiskCache...") 106 | cache = DiskCache(filepath=cache_file, max_keys=50) 107 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 108 | 109 | # Populate cache 110 | print("Populating cache...") 111 | repo.commit_history(limit=5) 112 | repo.list_files() 113 | 114 | print(f"Cache file created: {cache_file}") 115 | print(f"Cache contains {len(cache._cache)} entries") 116 | 117 | # Show initial cache info 118 | cached_keys = 
cache.list_cached_keys() 119 | print("\nInitial cache entries:") 120 | for entry in cached_keys: 121 | print(f" {entry['key']}: {entry['cached_at'].strftime('%H:%M:%S')}") 122 | 123 | # Create a new cache instance from the same file 124 | print("\nCreating new cache instance from saved file...") 125 | cache2 = DiskCache(filepath=cache_file, max_keys=50) 126 | 127 | print(f"Loaded cache contains {len(cache2._cache)} entries") 128 | 129 | # Show loaded cache info 130 | cached_keys2 = cache2.list_cached_keys() 131 | print("\nLoaded cache entries:") 132 | for entry in cached_keys2: 133 | print(f" {entry['key']}: {entry['cached_at'].strftime('%H:%M:%S')} (age: {entry['age_seconds']:.1f}s)") 134 | 135 | # Clean up 136 | if os.path.exists(cache_file): 137 | os.remove(cache_file) 138 | print(f"\nCleaned up cache file: {cache_file}") 139 | 140 | 141 | def demonstrate_cache_with_force_refresh(): 142 | """Demonstrate cache behavior with force_refresh.""" 143 | print("\n" + "=" * 50) 144 | print("Force Refresh Demo") 145 | print("=" * 50) 146 | 147 | cache = EphemeralCache(max_keys=10) 148 | repo = Repository(working_dir=GIT_PANDAS_DIR, cache_backend=cache, default_branch="master") 149 | 150 | print("Getting commit history (first time)...") 151 | start_time = datetime.now(timezone.utc) 152 | repo.commit_history(limit=5) 153 | 154 | time.sleep(1) 155 | 156 | print("Getting commit history (should use cache)...") 157 | repo.commit_history(limit=5) 158 | 159 | time.sleep(1) 160 | 161 | print("Getting commit history with force_refresh=True...") 162 | repo.commit_history(limit=5, force_refresh=True) 163 | 164 | print("\nCache timeline:") 165 | cached_keys = cache.list_cached_keys() 166 | for entry in cached_keys: 167 | if "commit_history" in entry["key"]: 168 | age_from_start = (entry["cached_at"] - start_time).total_seconds() 169 | print(f" Commit history cached at: +{age_from_start:.1f}s from start") 170 | print(f" Current age: {entry['age_seconds']:.1f}s") 171 | 172 | 173 | if __name__ == "__main__": 174 | try: 175 | demonstrate_cache_timestamps() 176 | demonstrate_disk_cache_persistence() 177 | demonstrate_cache_with_force_refresh() 178 | 179 | print("\n" + "=" * 50) 180 | print("Summary:") 181 | print("- Cache backends now track when entries were created") 182 | print("- No changes to Repository or ProjectDirectory API") 183 | print("- Users can access cache info via cache_backend.get_cache_info()") 184 | print("- Users can list all cached keys via cache_backend.list_cached_keys()") 185 | print("- Backward compatibility maintained with existing caches") 186 | 187 | except Exception as e: 188 | print(f"Error running demo: {e}") 189 | print("Make sure you're running this from the git-pandas directory") 190 | -------------------------------------------------------------------------------- /tests/test_Repository/test_punchcard.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def local_repo(tmp_path, default_branch): 10 | """Fixture for a local repository with commits at different times.""" 11 | # Create a temporary directory 12 | repo_dir = tmp_path / "repository1" 13 | repo_dir.mkdir() 14 | 15 | # Initialize a git repo 16 | grepo = git.Repo.init(str(repo_dir)) 17 | 18 | # Configure git user 19 | grepo.git.config("user.name", "Test User") 20 | grepo.git.config("user.email", "test@example.com") 21 | 22 | # Create and checkout default branch 23 
| grepo.git.checkout("-b", default_branch) 24 | 25 | # Add a README file 26 | readme_path = repo_dir / "README.md" 27 | readme_path.write_text("Sample README for a sample project\n") 28 | 29 | # Commit it 30 | grepo.git.add("README.md") 31 | 32 | # Set the environment variables for the commit time 33 | # Monday morning 34 | env = { 35 | "GIT_AUTHOR_DATE": "2023-01-02T09:30:00", 36 | "GIT_COMMITTER_DATE": "2023-01-02T09:30:00", 37 | } 38 | grepo.git.commit(m="first commit", env=env) 39 | 40 | # Add some Python files at different times 41 | for idx, (day, hour) in enumerate( 42 | [ 43 | # Tuesday afternoon 44 | (3, 14), 45 | # Wednesday evening 46 | (4, 19), 47 | # Thursday night 48 | (5, 22), 49 | # Friday morning 50 | (6, 8), 51 | # Saturday afternoon 52 | (7, 15), 53 | ] 54 | ): 55 | py_file = repo_dir / f"file_{idx}.py" 56 | py_file.write_text("import sys\nimport os\n") 57 | 58 | grepo.git.add(all=True) 59 | 60 | # Set the environment variables for the commit time 61 | env = { 62 | "GIT_AUTHOR_DATE": f"2023-01-{day:02d}T{hour:02d}:00:00", 63 | "GIT_COMMITTER_DATE": f"2023-01-{day:02d}T{hour:02d}:00:00", 64 | } 65 | grepo.git.commit(m=f"adding file_{idx}.py", env=env) 66 | 67 | # Create the Repository object 68 | git_pandas_repo = Repository(working_dir=str(repo_dir), verbose=True, default_branch=default_branch) 69 | 70 | yield git_pandas_repo 71 | 72 | # Cleanup 73 | git_pandas_repo.__del__() 74 | 75 | 76 | class TestPunchcard: 77 | def test_punchcard_basic(self, local_repo, default_branch): 78 | """Test basic functionality of the punchcard method.""" 79 | punchcard = local_repo.punchcard(branch=default_branch) 80 | 81 | # Check the shape and columns 82 | assert isinstance(punchcard, pd.DataFrame) 83 | assert punchcard.shape[0] > 0 84 | assert "hour_of_day" in punchcard.columns 85 | assert "day_of_week" in punchcard.columns 86 | assert "lines" in punchcard.columns 87 | assert "insertions" in punchcard.columns 88 | assert "deletions" in punchcard.columns 89 | assert "net" in punchcard.columns 90 | 91 | # Check that we have data for the days and hours we committed 92 | days_hours = [ 93 | (0, 9), # Monday 9am 94 | (1, 14), # Tuesday 2pm 95 | (2, 19), # Wednesday 7pm 96 | (3, 22), # Thursday 10pm 97 | (4, 8), # Friday 8am 98 | (5, 15), # Saturday 3pm 99 | ] 100 | 101 | for day, hour in days_hours: 102 | matching_rows = punchcard[(punchcard["day_of_week"] == day) & (punchcard["hour_of_day"] == hour)] 103 | if len(matching_rows) > 0: 104 | assert matching_rows["net"].values[0] > 0 105 | 106 | def test_punchcard_normalize(self, local_repo, default_branch): 107 | """Test the normalize parameter of the punchcard method.""" 108 | # Get punchcard without normalization 109 | local_repo.punchcard(branch=default_branch) 110 | 111 | # Get punchcard with normalization by value 112 | punchcard_norm = local_repo.punchcard(branch=default_branch, normalize=1.0) 113 | 114 | # Check that the normalized values are between 0 and 1 115 | assert punchcard_norm["net"].max() <= 1.0 116 | assert punchcard_norm["net"].min() >= 0.0 117 | 118 | # Check that the row normalization works correctly 119 | for day in range(7): 120 | day_rows = punchcard_norm[punchcard_norm["day_of_week"] == day] 121 | if len(day_rows) > 0 and day_rows["net"].sum() > 0: 122 | # If there are commits on this day, the max value should be 1.0 or close to it 123 | assert day_rows["net"].max() <= 1.0 124 |
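# Aside: outside the test suite, a punchcard DataFrame like the ones asserted
# on above can be rendered with the bundled helper in
# gitpandas/utilities/plotting.py. A minimal sketch (assuming a local clone
# with a 'master' branch and matplotlib installed):
#
#     from gitpandas import Repository
#     from gitpandas.utilities.plotting import plot_punchcard
#
#     pc = Repository(working_dir=".", default_branch="master").punchcard(branch="master")
#     plot_punchcard(pc, metric="lines", title="punchcard")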
125 | def test_punchcard_by_parameter(self, local_repo, default_branch): 126 | """Test the 'by' parameter of the punchcard method.""" 127 | # Test with by='committer' 128 | punchcard_committer = local_repo.punchcard(branch=default_branch, by="committer") 129 | 130 | # Check that we have the committer column 131 | assert "committer" in punchcard_committer.columns 132 | 133 | # Test with by='repository' 134 | punchcard_repo = local_repo.punchcard(branch=default_branch, by="repository") 135 | 136 | # Check that we have the repository column 137 | assert "repository" in punchcard_repo.columns 138 | 139 | def test_punchcard_with_globs(self, local_repo, default_branch): 140 | """Test the ignore_globs and include_globs parameters.""" 141 | # Get punchcard for all files 142 | punchcard_all = local_repo.punchcard(branch=default_branch) 143 | 144 | # Get punchcard ignoring Python files 145 | punchcard_no_py = local_repo.punchcard(branch=default_branch, ignore_globs=["*.py"]) 146 | 147 | # Check that we have fewer lines in the filtered punchcard 148 | assert punchcard_no_py["lines"].sum() < punchcard_all["lines"].sum() 149 | 150 | # Get punchcard including only Python files 151 | punchcard_only_py = local_repo.punchcard(branch=default_branch, include_globs=["*.py"]) 152 | 153 | # Check that we have fewer lines than the full punchcard 154 | assert punchcard_only_py["lines"].sum() < punchcard_all["lines"].sum() 155 | 156 | # Check that the sum of the filtered punchcards equals the total 157 | assert punchcard_no_py["lines"].sum() + punchcard_only_py["lines"].sum() == punchcard_all["lines"].sum() 158 | 159 | def test_punchcard_with_limit(self, local_repo, default_branch): 160 | """Test the limit parameter of the punchcard method.""" 161 | # Get punchcard with all commits 162 | punchcard_all = local_repo.punchcard(branch=default_branch) 163 | 164 | # Get punchcard with limited commits 165 | punchcard_limited = local_repo.punchcard(branch=default_branch, limit=3) 166 | 167 | # Check that we have fewer lines in the limited punchcard 168 | assert punchcard_limited["lines"].sum() <= punchcard_all["lines"].sum() 169 | 170 | def test_punchcard_with_days(self, local_repo, default_branch): 171 | """Test the days parameter of the punchcard method.""" 172 | # Get punchcard with all commits 173 | local_repo.punchcard(branch=default_branch) 174 | 175 | # Get punchcard with commits from the last 2 days 176 | # Since our test data is from 2023, this should return an empty DataFrame 177 | punchcard_recent = local_repo.punchcard(branch=default_branch, days=2) 178 | 179 | # Check that we have no lines in the recent punchcard 180 | assert punchcard_recent["lines"].sum() == 0 181 | -------------------------------------------------------------------------------- /tests/test_Repository/test_file_operations.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def local_repo(tmp_path, default_branch): 10 | """Create a local git repository with various file types and structures.""" 11 | repo_path = tmp_path / "test_repo" 12 | repo_path.mkdir() 13 | repo = git.Repo.init(repo_path) 14 | 15 | # Configure git user 16 | repo.config_writer().set_value("user", "name", "Test User").release() 17 | repo.config_writer().set_value("user", "email", "test@example.com").release() 18 | 19 | # Create and checkout default branch 20 | repo.git.checkout("-b", default_branch) 21 | 22 | # Create and checkout master branch 23 | repo.git.checkout("-b", "master") 24 | 25 | # Create initial structure 26 | (repo_path / "src").mkdir() 27 | (repo_path / "docs").mkdir() 28 | (repo_path / "tests").mkdir() 29 | 30 | # Create various files 31 | files
= { 32 | "README.md": "# Test Repository\nA test repository for gitpandas.", 33 | "src/main.py": "def main():\n print('Hello, World!')\n return True", 34 | "src/utils.py": "def helper():\n return 'helper'", 35 | "docs/index.md": "# Documentation\nThis is the documentation.", 36 | "tests/test_main.py": "def test_main():\n assert True", 37 | ".gitignore": "*.pyc\n__pycache__/\n.DS_Store", 38 | } 39 | 40 | # Create and commit files 41 | for path, content in files.items(): 42 | file_path = repo_path / path 43 | file_path.write_text(content) 44 | repo.index.add([str(file_path)]) 45 | 46 | repo.index.commit("Initial commit") 47 | 48 | # Create some ignored files 49 | (repo_path / "src/main.pyc").write_text("compiled python") 50 | (repo_path / "src/__pycache__").mkdir() 51 | (repo_path / "src/__pycache__/main.cpython-39.pyc").write_text("cached python") 52 | 53 | # Make a change to test commit content 54 | main_py = repo_path / "src/main.py" 55 | main_py.write_text("def main():\n print('Hello, Universe!')\n return True") 56 | repo.index.add([str(main_py)]) 57 | commit = repo.index.commit("Update greeting") 58 | 59 | return {"repo_path": repo_path, "repo": Repository(working_dir=str(repo_path), default_branch=default_branch), "last_commit": commit.hexsha} 60 | 61 | 62 | class TestFileOperations: 63 | def test_list_files(self, local_repo, default_branch): 64 | """Test listing files in the repository.""" 65 | repo = local_repo["repo"] 66 | 67 | # Get all files 68 | files = repo.list_files() 69 | 70 | # Check basic DataFrame properties 71 | assert isinstance(files, pd.DataFrame) 72 | assert "file" in files.columns 73 | assert "mode" in files.columns 74 | assert "type" in files.columns 75 | assert "sha" in files.columns 76 | assert "repository" in files.columns 77 | 78 | # Check that we have the expected files 79 | file_paths = set(files["file"].values) 80 | expected_files = { 81 | "README.md", 82 | "src/main.py", 83 | "src/utils.py", 84 | "docs/index.md", 85 | "tests/test_main.py", 86 | ".gitignore", 87 | } 88 | assert file_paths == expected_files 89 | 90 | # Check that ignored files are not included 91 | assert "src/main.pyc" not in file_paths 92 | assert "src/__pycache__/main.cpython-39.pyc" not in file_paths 93 | 94 | # Check file types 95 | assert all(files["type"] == "blob") # All should be files, not trees 96 | 97 | # Check file modes (should be regular files) 98 | assert all(files["mode"].isin(["100644"])) 99 | 100 | def test_get_file_content(self, local_repo, default_branch): 101 | """Test getting file content from the repository.""" 102 | repo = local_repo["repo"] 103 | 104 | # Test getting content of an existing file 105 | content = repo.get_file_content("src/main.py") 106 | assert content == "def main():\n print('Hello, Universe!')\n return True" 107 | 108 | # Test getting content at a specific revision (first commit) 109 | first_content = repo.get_file_content("src/main.py", rev="HEAD^") 110 | assert first_content == "def main():\n print('Hello, World!')\n return True" 111 | 112 | # Test getting content of a non-existent file 113 | assert repo.get_file_content("nonexistent.txt") is None 114 | 115 | # Test getting content of an ignored file 116 | assert repo.get_file_content("src/main.pyc") is None 117 | 118 | # Test getting content with invalid revision 119 | assert repo.get_file_content("src/main.py", rev="invalid_rev") is None 120 | 121 | def test_get_commit_content(self, local_repo, default_branch): 122 | """Test getting detailed content changes from a commit.""" 123 | repo = 
local_repo["repo"] 124 | commit_sha = local_repo["last_commit"] 125 | 126 | # Get changes from the last commit 127 | changes = repo.get_commit_content(commit_sha) 128 | 129 | # Check basic DataFrame properties 130 | assert isinstance(changes, pd.DataFrame) 131 | assert "file" in changes.columns 132 | assert "change_type" in changes.columns 133 | assert "old_line_num" in changes.columns 134 | assert "new_line_num" in changes.columns 135 | assert "content" in changes.columns 136 | assert "repository" in changes.columns 137 | 138 | # Check that we have the expected changes 139 | assert len(changes) > 0 140 | file_changes = changes[changes["file"] == "src/main.py"] 141 | assert len(file_changes) > 0 142 | 143 | # Check for removed line 144 | removed = file_changes[file_changes["old_line_num"].notna()] 145 | assert len(removed) == 1 146 | assert "Hello, World!" in removed.iloc[0]["content"] 147 | 148 | # Check for added line 149 | added = file_changes[file_changes["new_line_num"].notna()] 150 | assert len(added) == 1 151 | assert "Hello, Universe!" in added.iloc[0]["content"] 152 | 153 | # Test with glob filters 154 | # Should find no changes when excluding .py files 155 | filtered = repo.get_commit_content(commit_sha, ignore_globs=["*.py"]) 156 | assert len(filtered) == 0 157 | 158 | # Should find changes when including only .py files 159 | filtered = repo.get_commit_content(commit_sha, include_globs=["*.py"]) 160 | assert len(filtered) > 0 161 | 162 | # Test with invalid commit 163 | invalid = repo.get_commit_content("invalid_sha") 164 | assert len(invalid) == 0 165 | 166 | def test_file_change_rates_index_ambiguity(self, local_repo, default_branch): 167 | """Test that file_change_rates handles file index/column ambiguity correctly.""" 168 | repo = local_repo["repo"] 169 | 170 | # Get file change rates 171 | change_rates = repo.file_change_rates() 172 | 173 | # Verify file is not both an index and column 174 | assert "file" not in change_rates.index.names, "file should not be in index" 175 | assert "file" in change_rates.columns, "file should be a column" 176 | 177 | # Try merging with another DataFrame to ensure no ambiguity 178 | file_details = repo.file_detail() 179 | 180 | # This merge should work without ambiguity 181 | try: 182 | pd.merge(file_details, change_rates, on="file", how="outer") 183 | except ValueError as e: 184 | pytest.fail(f"Merge failed due to ambiguity: {e}") 185 | -------------------------------------------------------------------------------- /gitpandas/utilities/plotting.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: plotting 3 | :platform: Unix, Windows 4 | :synopsis: helper functions for plotting tables from gitpandas 5 | 6 | .. 
moduleauthor:: Will McGinnis 7 | 8 | 9 | """ 10 | 11 | try: 12 | import matplotlib.pyplot as plt 13 | import matplotlib.style 14 | import pandas as pd 15 | 16 | matplotlib.style.use("ggplot") 17 | HAS_MPL = True 18 | except ImportError: 19 | HAS_MPL = False 20 | 21 | __author__ = "willmcginnis" 22 | 23 | 24 | def plot_punchcard(df, metric="lines", title="punchcard", by=None): 25 | """ 26 | Uses modified plotting code from https://bitbucket.org/birkenfeld/hgpunchcard 27 | 28 | :param df: punchcard DataFrame with 'hour_of_day' and 'day_of_week' columns, as produced by Repository.punchcard 29 | :param metric: name of the numeric column used to size the plotted points 30 | :param title: title to display above the plot 31 | :param by: optional column name to facet by, one figure per unique value 32 | """ 33 | 34 | if not HAS_MPL: 35 | raise ImportError("Must have matplotlib installed to use the plotting functions") 36 | 37 | # Validate input DataFrame 38 | required_columns = ["hour_of_day", "day_of_week", metric] 39 | if df.empty or not all(col in df.columns for col in required_columns): 40 | raise KeyError(f"DataFrame must contain columns: {required_columns}") 41 | 42 | # Validate data types and ranges 43 | if not pd.api.types.is_numeric_dtype(df[metric]): 44 | raise ValueError(f"Metric column '{metric}' must be numeric") 45 | 46 | if not all(0 <= x <= 6 for x in df["day_of_week"]): 47 | raise ValueError("day_of_week values must be between 0 and 6") 48 | 49 | if not all(0 <= x <= 23 for x in df["hour_of_day"]): 50 | raise ValueError("hour_of_day values must be between 0 and 23") 51 | 52 | # find how many plots we are making 53 | unique_vals = set(df[by].values.tolist()) if by is not None else ["foo"] 54 | for idx, val in enumerate(unique_vals): 55 | sub_df = df[df[by] == val] if by is not None else df 56 | fig = plt.figure(figsize=(8, 3 if title else 2.5), facecolor="#ffffff") 57 | ax = fig.add_subplot(111, facecolor="#ffffff") 58 | fig.subplots_adjust(left=0.06, bottom=0.04, right=0.98, top=0.95) 59 | if by is not None: 60 | ax.set_title(title + f" ({str(val)})", y=0.96).set_color("#333333") 61 | else: 62 | ax.set_title(title, y=0.96).set_color("#333333") 63 | ax.set_frame_on(False) 64 | ax.scatter( 65 | sub_df["hour_of_day"], 66 | sub_df["day_of_week"], 67 | s=sub_df[metric], 68 | c="#333333", 69 | edgecolor="#333333", 70 | ) 71 | for line in ax.get_xticklines() + ax.get_yticklines(): 72 | line.set_alpha(0.0) 73 | dist = -0.8 74 | ax.plot([dist, 23.5], [dist, dist], c="#555555") 75 | ax.plot([dist, dist], [dist, 6.4], c="#555555") 76 | ax.set_xlim(-1, 24) 77 | ax.set_ylim(-0.9, 6.9) 78 | ax.set_yticks(range(7)) 79 | for tx in ax.set_yticklabels(["Mon", "Tues", "Wed", "Thurs", "Fri", "Sat", "Sun"]): 80 | tx.set_color("#555555") 81 | tx.set_size("x-small") 82 | ax.set_xticks(range(24)) 83 | for tx in ax.set_xticklabels([f"{x:02d}" for x in range(24)]): 84 | tx.set_color("#555555") 85 | tx.set_size("x-small") 86 | ax.set_aspect("equal") 87 | if idx + 1 == len(unique_vals): 88 | plt.show(block=True) 89 | else: 90 | plt.show(block=False) 91 | 92 | 93 | def plot_cumulative_blame(df): 94 | """ 95 | Plot cumulative blame information as a stacked area chart.
96 | 97 | Args: 98 | df (pandas.DataFrame): DataFrame with dates as index and committers as columns 99 | 100 | Returns: 101 | matplotlib.figure.Figure: The generated figure 102 | """ 103 | 104 | if not HAS_MPL: 105 | raise ImportError("Must have matplotlib installed to use the plotting functions") 106 | 107 | # Validate input DataFrame 108 | if df.empty: 109 | raise ValueError("DataFrame cannot be empty") 110 | 111 | # Handle NaN values by filling with 0 112 | df = df.fillna(0) 113 | 114 | ax = df.plot(kind="area", stacked=True) 115 | plt.title("Cumulative Blame") 116 | plt.xlabel("date") 117 | plt.ylabel("LOC") 118 | box = ax.get_position() 119 | ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) 120 | ax.legend(loc="center left", bbox_to_anchor=(1, 0.5)) 121 | 122 | # Only try to show if not using Agg backend 123 | if plt.get_backend() != "Agg": 124 | plt.show() 125 | 126 | return plt.gcf() 127 | 128 | 129 | def plot_lifeline(changes, ownership_changes, refactoring): 130 | """ 131 | Plot file lifelines with ownership changes and refactoring events. 132 | 133 | Args: 134 | changes (pd.DataFrame): DataFrame containing file change history 135 | ownership_changes (pd.DataFrame): DataFrame containing ownership change events 136 | refactoring (pd.DataFrame): DataFrame containing refactoring events 137 | 138 | Returns: 139 | matplotlib.figure.Figure: The generated plot figure 140 | """ 141 | if not HAS_MPL: 142 | raise ImportError("matplotlib is required for plotting. Please install it first.") 143 | 144 | # Validate input DataFrames 145 | if changes.empty: 146 | raise ValueError("changes DataFrame cannot be empty") 147 | 148 | # Validate that all files in ownership_changes and refactoring exist in changes 149 | if not ownership_changes.empty: 150 | invalid_files = set(ownership_changes.filename) - set(changes.filename) 151 | if invalid_files: 152 | raise ValueError(f"Files in ownership_changes not found in changes: {invalid_files}") 153 | 154 | if not refactoring.empty: 155 | invalid_files = set(refactoring.filename) - set(changes.filename) 156 | if invalid_files: 157 | raise ValueError(f"Files in refactoring not found in changes: {invalid_files}") 158 | 159 | # Create a new figure 160 | fig, ax = plt.subplots(figsize=(12, 6)) 161 | 162 | # Get unique files and sort them 163 | files = changes.filename.unique() 164 | files.sort() 165 | 166 | # Create y-axis positions for each file 167 | file_positions = {file: i for i, file in enumerate(files)} 168 | 169 | # Plot lifelines 170 | for file in files: 171 | file_changes = changes[changes.filename == file] 172 | ax.plot( 173 | file_changes.index, 174 | [file_positions[file]] * len(file_changes), 175 | "-", 176 | label="_nolegend_", 177 | alpha=0.5, 178 | ) 179 | 180 | # Plot ownership changes 181 | if len(ownership_changes) > 0: 182 | ax.scatter( 183 | ownership_changes.index, 184 | [file_positions[f] for f in ownership_changes.filename], 185 | marker="o", 186 | c="red", 187 | label="Ownership Change", 188 | alpha=0.7, 189 | ) 190 | 191 | # Plot refactoring events 192 | if len(refactoring) > 0: 193 | ax.scatter( 194 | refactoring.index, 195 | [file_positions[f] for f in refactoring.filename], 196 | marker="s", 197 | c="blue", 198 | label="Refactoring", 199 | alpha=0.7, 200 | ) 201 | 202 | # Customize the plot 203 | ax.set_yticks(range(len(files))) 204 | ax.set_yticklabels(files) 205 | ax.set_xlabel("Time") 206 | ax.set_ylabel("Files") 207 | ax.set_title("File Lifelines with Ownership Changes and Refactoring Events") 208 | ax.legend() 209 
| ax.grid(True, alpha=0.3) 210 | 211 | # Rotate dates for better readability 212 | plt.xticks(rotation=45) 213 | 214 | # Adjust layout to prevent label cutoff 215 | plt.tight_layout() 216 | 217 | return fig 218 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make <target>' where <target> is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
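# Note: every target below follows the same pattern, one sphinx-build
# invocation per builder. With the variables defined above (BUILDDIR=build,
# empty SPHINXOPTS/PAPER), the html target, for instance, expands to roughly:
#
#   sphinx-build -b html -d build/doctrees source build/html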
63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/git-pandas.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/git-pandas.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/git-pandas" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/git-pandas" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 
151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /tests/test_repository_cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | from unittest import mock 5 | 6 | import pytest 7 | from git import Repo 8 | 9 | from gitpandas import Repository 10 | from gitpandas.cache import DiskCache 11 | 12 | 13 | @pytest.fixture 14 | def temp_git_repo(default_branch): 15 | """Create a temporary git repository for testing.""" 16 | temp_dir = tempfile.mkdtemp() 17 | 18 | # Initialize git repo 19 | repo = Repo.init(temp_dir) 20 | # Explicitly create and checkout default branch 21 | repo.git.checkout(b=default_branch) 22 | 23 | # Create a test file 24 | test_file_path = os.path.join(temp_dir, "test_file.txt") 25 | with open(test_file_path, "w") as f: 26 | f.write("Initial content") 27 | 28 | # Add and commit 29 | repo.git.add(test_file_path) 30 | repo.git.commit("-m", "Initial commit") 31 | 32 | # Create a second file and commit 33 | test_file2_path = os.path.join(temp_dir, "test_file2.txt") 34 | with open(test_file2_path, "w") as f: 35 | f.write("Second file content") 36 | 37 | # Add and commit 38 | repo.git.add(test_file2_path) 39 | repo.git.commit("-m", "Second commit") 40 | 41 | yield temp_dir 42 | 43 | # Cleanup 44 | shutil.rmtree(temp_dir) 45 | 46 | 47 | @pytest.fixture 48 | def temp_cache_file(): 49 | """Create a temporary cache file.""" 50 | fd, path = tempfile.mkstemp(suffix=".gz") 51 | os.close(fd) 52 | yield path 53 | if os.path.exists(path): 54 | os.unlink(path) 55 | 56 | 57 | class TestRepositoryCache: 58 | """Test caching behavior with the actual Repository class.""" 59 | 60 | def test_repository_list_files_cache(self, temp_git_repo, temp_cache_file, default_branch): 61 | """Test that list_files method properly uses cache.""" 62 | # Create cache and repository 63 | cache = 
DiskCache(filepath=temp_cache_file) 64 | 65 | # Mock the cache _get_entry and set methods to track calls 66 | with ( 67 | mock.patch.object(cache, "_get_entry", wraps=cache._get_entry) as mock_get_entry, 68 | mock.patch.object(cache, "set", wraps=cache.set) as mock_set, 69 | ): 70 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 71 | 72 | # First call - should set cache but not get from it 73 | result1 = repo.list_files() 74 | assert mock_set.call_count > 0, "Cache set should be called" 75 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called (but returns miss)" 76 | mock_set.reset_mock() 77 | mock_get_entry.reset_mock() 78 | 79 | # Second call - should get from cache 80 | result2 = repo.list_files() 81 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called" 82 | assert mock_set.call_count == 0, "Cache set should not be called" 83 | 84 | # Results should be identical 85 | assert result1.equals(result2), "Results should be identical when using cache" 86 | 87 | # Force refresh - should set cache again 88 | mock_set.reset_mock() 89 | mock_get_entry.reset_mock() 90 | result3 = repo.list_files(force_refresh=True) 91 | assert mock_set.call_count > 0, "Cache set should be called with force_refresh" 92 | 93 | # Results should match (unchanged repo) 94 | assert result1.equals(result3), "Results should match even with force_refresh" 95 | 96 | def test_repository_cache_with_different_params(self, temp_git_repo, temp_cache_file, default_branch): 97 | """Test caching with different parameters.""" 98 | cache = DiskCache(filepath=temp_cache_file) 99 | 100 | # Spy on the cache's set method 101 | with mock.patch.object(cache, "set", wraps=cache.set) as mock_set: 102 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 103 | 104 | # Call with default revision (HEAD) 105 | repo.list_files() 106 | first_call_count = mock_set.call_count 107 | assert first_call_count > 0, "Cache set should be called" 108 | mock_set.reset_mock() 109 | 110 | # Call with same parameters - should use cache 111 | repo.list_files() 112 | assert mock_set.call_count == 0, "Cache set should not be called for same parameters" 113 | 114 | # Call with different revision - should set new cache entry 115 | repo.list_files(rev="HEAD~1") 116 | assert mock_set.call_count > 0, "Cache set should be called for different parameters" 117 | 118 | def test_repository_cache_persistence(self, temp_git_repo, temp_cache_file, default_branch): 119 | """Test that cache persists between Repository instances.""" 120 | # First repository 121 | cache1 = DiskCache(filepath=temp_cache_file) 122 | repo1 = Repository(working_dir=temp_git_repo, cache_backend=cache1, default_branch=default_branch) 123 | 124 | # Call method and get result 125 | result1 = repo1.list_files() 126 | 127 | # Create second repository with same cache file 128 | cache2 = DiskCache(filepath=temp_cache_file) 129 | 130 | # Mock the _get_entry method to verify it's called 131 | with ( 132 | mock.patch.object(cache2, "_get_entry", wraps=cache2._get_entry) as mock_get_entry, 133 | mock.patch.object(cache2, "set", wraps=cache2.set) as mock_set, 134 | ): 135 | repo2 = Repository(working_dir=temp_git_repo, cache_backend=cache2, default_branch=default_branch) 136 | 137 | # Call same method - should use cache 138 | result2 = repo2.list_files() 139 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called" 140 | assert mock_set.call_count == 0, "Cache 
set should not be called" 141 | 142 | # Results should match 143 | assert result1.equals(result2), "Results should match between repository instances" 144 | 145 | def test_multiple_repository_methods_cache(self, temp_git_repo, temp_cache_file, default_branch): 146 | """Test caching behavior across different repository methods.""" 147 | cache = DiskCache(filepath=temp_cache_file) 148 | repo = Repository(working_dir=temp_git_repo, cache_backend=cache, default_branch=default_branch) 149 | 150 | # Create a dictionary to store original results 151 | results = {} 152 | 153 | # First calls - should create cache entries 154 | results["commits"] = repo.commit_history() 155 | results["files"] = repo.list_files() 156 | 157 | # Mock the cache _get_entry method to track calls 158 | with mock.patch.object(cache, "_get_entry", wraps=cache._get_entry) as mock_get_entry: 159 | # Second calls - should use cache 160 | commits2 = repo.commit_history() 161 | files2 = repo.list_files() 162 | 163 | # Both should match original results 164 | assert commits2.equals(results["commits"]), "Commit results should match" 165 | assert files2.equals(results["files"]), "File results should match" 166 | 167 | # Verify _get_entry was called for both 168 | assert mock_get_entry.call_count >= 2, "Cache _get_entry should be called for both methods" 169 | 170 | # Force refresh one method 171 | mock_get_entry.reset_mock() 172 | with mock.patch.object(cache, "set", wraps=cache.set) as mock_set: 173 | commits3 = repo.commit_history(force_refresh=True) 174 | assert commits3.equals(results["commits"]), "Results should still match after force refresh" 175 | assert mock_set.call_count > 0, "Cache set should be called for force refresh" 176 | 177 | # Other method should still use cache 178 | mock_get_entry.reset_mock() 179 | files3 = repo.list_files() 180 | assert files3.equals(results["files"]), "Files should still match" 181 | assert mock_get_entry.call_count > 0, "Cache _get_entry should be called for unchanged method" 182 | -------------------------------------------------------------------------------- /tests/test_Repository/test_file_detail.py: -------------------------------------------------------------------------------- 1 | import git 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | from gitpandas import Repository 7 | 8 | 9 | @pytest.fixture 10 | def local_repo(tmp_path, default_branch): 11 | """Fixture for a local repository with different file types.""" 12 | # Create a temporary directory 13 | repo_dir = tmp_path / "repository1" 14 | repo_dir.mkdir() 15 | 16 | # Initialize a git repo 17 | grepo = git.Repo.init(str(repo_dir)) 18 | 19 | # Configure git user 20 | grepo.git.config("user.name", "Test User") 21 | grepo.git.config("user.email", "test@example.com") 22 | 23 | # Create and checkout default branch 24 | grepo.git.checkout("-b", default_branch) 25 | 26 | # Add a README file 27 | readme_path = repo_dir / "README.md" 28 | readme_path.write_text("Sample README for a sample project\n") 29 | 30 | # Commit it 31 | grepo.git.add("README.md") 32 | grepo.git.commit(m="first commit") 33 | 34 | # Add some Python files with different content 35 | for idx in range(3): 36 | py_file = repo_dir / f"file_{idx}.py" 37 | py_file.write_text(f"import sys\nimport os\n\ndef function_{idx}():\n return {idx}\n") 38 | 39 | grepo.git.add(all=True) 40 | grepo.git.commit(m=f"adding file_{idx}.py") 41 | 42 | # Add a JavaScript file 43 | js_file = repo_dir / "script.js" 44 | js_file.write_text('function hello() {\n 
console.log("Hello, world!");\n}\n') 45 | 46 | grepo.git.add("script.js") 47 | grepo.git.commit(m="adding script.js") 48 | 49 | # Add a CSS file 50 | css_file = repo_dir / "style.css" 51 | css_file.write_text("body {\n margin: 0;\n padding: 0;\n}\n") 52 | 53 | grepo.git.add("style.css") 54 | grepo.git.commit(m="adding style.css") 55 | 56 | # Create a subdirectory 57 | subdir = repo_dir / "subdir" 58 | subdir.mkdir() 59 | 60 | # Add a file in the subdirectory 61 | subdir_file = subdir / "subfile.py" 62 | subdir_file.write_text('import sys\n\ndef subfunction():\n return "sub"\n') 63 | 64 | grepo.git.add(all=True) 65 | grepo.git.commit(m="adding subdir/subfile.py") 66 | 67 | # Create the Repository object 68 | git_pandas_repo = Repository(working_dir=str(repo_dir), verbose=True, default_branch=default_branch) 69 | 70 | yield git_pandas_repo 71 | 72 | # Cleanup 73 | git_pandas_repo.__del__() 74 | 75 | 76 | class TestFileDetail: 77 | def test_file_detail_basic(self, local_repo): 78 | """Test basic functionality of the file_detail method.""" 79 | file_detail = local_repo.file_detail() 80 | 81 | # Check the shape and columns 82 | assert isinstance(file_detail, pd.DataFrame) 83 | assert file_detail.shape[0] > 0 84 | 85 | # Check that we have the expected columns 86 | expected_columns = ["loc", "file_owner", "ext", "last_edit_date", "repository"] 87 | for col in expected_columns: 88 | assert col in file_detail.columns 89 | 90 | # Check that we have entries for each file type 91 | exts = file_detail["ext"].unique() 92 | assert "md" in exts 93 | assert "py" in exts 94 | assert "js" in exts 95 | assert "css" in exts 96 | 97 | # Check that the LOC counts are correct for different file types 98 | md_loc = file_detail.loc[file_detail["ext"] == "md", "loc"].sum() 99 | py_loc = file_detail.loc[file_detail["ext"] == "py", "loc"].sum() 100 | js_loc = file_detail.loc[file_detail["ext"] == "js", "loc"].sum() 101 | css_loc = file_detail.loc[file_detail["ext"] == "css", "loc"].sum() 102 | 103 | assert md_loc == 1 # README.md has 1 line 104 | assert py_loc == 19 # 3 Python files with 5 lines each + 1 with 4 lines 105 | assert js_loc == 3 # script.js has 3 lines 106 | assert css_loc == 4 # style.css has 4 lines 107 | 108 | def test_file_detail_with_globs(self, local_repo): 109 | """Test the ignore_globs and include_globs parameters.""" 110 | # Get file detail for all files 111 | file_detail_all = local_repo.file_detail() 112 | 113 | # Get file detail ignoring Python files 114 | file_detail_no_py = local_repo.file_detail(ignore_globs=["*.py"]) 115 | 116 | # Check that we have fewer files in the filtered file detail 117 | assert file_detail_no_py.shape[0] < file_detail_all.shape[0] 118 | 119 | # Check that no Python files are included 120 | assert "py" not in file_detail_no_py["ext"].values 121 | 122 | # Get file detail including only Python files 123 | file_detail_only_py = local_repo.file_detail(include_globs=["*.py"]) 124 | 125 | # Check that we have fewer files than the full file detail 126 | assert file_detail_only_py.shape[0] < file_detail_all.shape[0] 127 | 128 | # Check that only Python files are included 129 | for ext in file_detail_only_py["ext"].values: 130 | assert ext == "py" 131 | 132 | # Check that the sum of the filtered file details equals the total 133 | assert file_detail_no_py.shape[0] + file_detail_only_py.shape[0] == file_detail_all.shape[0] 134 | 135 | def test_file_detail_with_rev(self, local_repo, default_branch): 136 | """Test the rev parameter of the file_detail method.""" 137 | # Get file 
detail for the current revision 138 | file_detail_head = local_repo.file_detail(rev="HEAD") 139 | 140 | # Get file detail for the first commit 141 | # This should only include the README.md file 142 | first_commit = local_repo.revs(branch=default_branch).iloc[-1]["rev"] 143 | file_detail_first = local_repo.file_detail(rev=first_commit) 144 | 145 | # Check that we have fewer files in the first commit 146 | assert file_detail_first.shape[0] < file_detail_head.shape[0] 147 | 148 | # Check that only md extension is included in the first commit 149 | assert file_detail_first.shape[0] == 1 150 | assert "md" in file_detail_first["ext"].values 151 | 152 | def test_file_detail_committer(self, local_repo): 153 | """Test the committer parameter of the file_detail method.""" 154 | # Get file detail with committer=True 155 | file_detail_committer = local_repo.file_detail(committer=True) 156 | 157 | # Check that we have the file_owner column 158 | assert "file_owner" in file_detail_committer.columns 159 | 160 | # Check that all file owners are 'Test User' 161 | for owner in file_detail_committer["file_owner"]: 162 | if owner is not None: # Check if owner is not None before asserting 163 | assert isinstance(owner, str) 164 | assert owner == "Test User" 165 | 166 | # Get file detail with committer=False 167 | file_detail_no_committer = local_repo.file_detail(committer=False) 168 | 169 | # Check that we have the file_owner column 170 | assert "file_owner" in file_detail_no_committer.columns 171 | 172 | # Check that all file owners are 'Test User' 173 | for owner in file_detail_no_committer["file_owner"]: 174 | if owner is not None: # Check if owner is not None before asserting 175 | assert isinstance(owner, str) 176 | assert owner == "Test User" 177 | 178 | def test_file_detail_last_edit(self, local_repo): 179 | """Test that the last_edit_date column contains valid timestamps.""" 180 | file_detail = local_repo.file_detail() 181 | 182 | # Check that all last_edit_date values are datetime64 objects 183 | for last_edit in file_detail["last_edit_date"].values: 184 | assert isinstance(last_edit, np.datetime64 | pd.Timestamp) 185 | 186 | # Check that the last file added has the most recent timestamp 187 | # Find all Python files; the most recently added one is subdir/subfile.py 188 | py_file_indices = [idx for idx, ext in enumerate(file_detail["ext"].values) if ext == "py"] 189 | last_edit_dates = [file_detail["last_edit_date"].values[idx] for idx in py_file_indices] 190 | last_edit = max(last_edit_dates) 191 | 192 | # All other files should have earlier or equal timestamps 193 | for edit_time in file_detail["last_edit_date"].values: 194 | assert edit_time <= last_edit 195 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^<target^>` where ^<target^> is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo.
singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 
128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\git-pandas.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\git-pandas.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 
244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /.cursor/rules/project_overview.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: Overview of the gitpandas project 3 | globs: 4 | alwaysApply: false 5 | --- 6 | I'll do a thorough analysis of the gitpandas library to help new developers understand it. Let me start by exploring the codebase to get a comprehensive understanding. 7 | 8 | Ran tool 9 | Let's first look at the README to get an overview of the project: 10 | 11 | Read file: README.md 12 | Now let's look at the main module structure: 13 | 14 | Ran tool 15 | Let's check the __init__.py file to understand the main components: 16 | 17 | Read file: gitpandas/__init__.py 18 | Now let's look at the repository.py file, which seems to be one of the main components: 19 | 20 | Read file: gitpandas/repository.py 21 | Let's also look at the project.py file to understand the ProjectDirectory class: 22 | 23 | Read file: gitpandas/project.py 24 | Let's also check the cache.py file to understand the caching mechanism: 25 | 26 | Read file: gitpandas/cache.py 27 | Let's check the utilities directory to understand what helper functions are available: 28 | 29 | Ran tool 30 | Let's look at the utilities/__init__.py file: 31 | 32 | Read file: gitpandas/utilities/__init__.py 33 | Let's look at the plotting utilities: 34 | 35 | Read file: gitpandas/utilities/plotting.py 36 | Let's check the examples directory to see how the library is used: 37 | 38 | Ran tool 39 | Let's look at a couple of example files to understand how the library is used: 40 | 41 | Read file: examples/commit_history.py 42 | 43 | Read file: examples/bus_analysis.py 44 | 45 | Read file: examples/cumulative_blame.py 46 | Let's check the pyproject.toml file to understand the project dependencies and metadata: 47 | 48 | Read file: pyproject.toml 49 | Based on my thorough analysis of the gitpandas library, here's a comprehensive overview for new developers: 50 | 51 | # GitPandas: A Comprehensive Overview 52 | 53 | ## What is GitPandas? 54 | 55 | GitPandas is a Python library that provides a set of wrappers around GitPython for creating pandas DataFrames from git repository data. It enables data-driven analysis of git repositories, allowing developers and data scientists to extract meaningful insights from their codebase's history and structure. 56 | 57 | ## Core Components 58 | 59 | GitPandas is built around three primary objects: 60 | 61 | 1. **Repository**: Represents a single git repository and provides methods to analyze it. 62 | 2. **ProjectDirectory**: Represents a directory containing multiple git repositories, allowing for aggregated analysis across all repositories. 63 | 3. **GitHubProfile**: A specialized ProjectDirectory that analyzes all repositories owned by a GitHub user.
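For quick orientation, here is a minimal sketch of how each of the three entry points is constructed (the paths and username are placeholders); each object is covered in more depth under Usage Patterns below:

```python
from gitpandas import Repository, ProjectDirectory, GitHubProfile

# Analyze a single git repository on disk
repo = Repository(working_dir='/path/to/repo')

# Aggregate analysis across every repository found under a directory
project = ProjectDirectory(working_dir='/path/to/repos')

# Aggregate analysis across all repositories owned by a GitHub user
profile = GitHubProfile(username='username')
```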
64 | 65 | ## Key Features 66 | 67 | ### Repository Analysis 68 | - **Commit History**: Extract detailed commit history with filtering by file extensions and directories 69 | - **File Change History**: Track how files have changed over time 70 | - **Blame Analysis**: Determine who wrote each line of code in the repository 71 | - **Branch and Tag Information**: Extract metadata about branches and tags 72 | - **Bus Factor Analysis**: Estimate the project's "bus factor" (risk if key contributors leave) 73 | - **Cumulative Blame**: Track how code ownership changes over time 74 | - **Punchcard Analysis**: Visualize when code is written (day/hour patterns) 75 | - **Hours Estimation**: Estimate time spent on the project by contributors 76 | 77 | ### Project Directory Analysis 78 | - Aggregate all the above analyses across multiple repositories 79 | - Compare metrics across repositories 80 | - Generate project-wide statistics and visualizations 81 | 82 | ### Visualization Utilities 83 | - Plot cumulative blame over time 84 | - Create punchcard visualizations of commit patterns 85 | 86 | ### Caching System 87 | - In-memory caching via `EphemeralCache` 88 | - Redis-based caching via `RedisDFCache` 89 | - Decorator-based caching system for expensive operations 90 | 91 | ## Core Concepts 92 | 93 | ### Data Representation 94 | All data in GitPandas is represented as pandas DataFrames, making it easy to: 95 | - Filter and transform the data 96 | - Visualize results 97 | - Export to various formats 98 | - Integrate with other data analysis workflows 99 | 100 | ### Filtering Mechanisms 101 | GitPandas provides consistent filtering options across most methods: 102 | - `ignore_globs`: Glob patterns for files to exclude (e.g., `['tests/*', 'docs/*']`) 103 | - `include_globs`: Glob patterns for files to include (e.g., `['*.py', '*.js']`) 104 | - `limit`: Limit the number of commits to analyze 105 | - `days`: Limit analysis to commits within a certain number of days 106 | - `branch`: Specify which branch to analyze 107 | 108 | ### Caching 109 | GitPandas implements a flexible caching system to improve performance: 110 | - The `multicache` decorator allows caching results of expensive operations 111 | - Cache backends can be swapped (memory or Redis) 112 | - Cache keys are generated based on repository name and method parameters 113 | 114 | ## Usage Patterns 115 | 116 | ### Single Repository Analysis 117 | ```python 118 | from gitpandas import Repository 119 | 120 | # Create a Repository object 121 | repo = Repository(working_dir='/path/to/repo') 122 | 123 | # Get commit history 124 | commit_history = repo.commit_history(branch='master', include_globs=['*.py']) 125 | 126 | # Analyze blame 127 | blame = repo.blame(committer=True) 128 | 129 | # Estimate bus factor 130 | bus_factor = repo.bus_factor() 131 | ``` 132 | 133 | ### Multi-Repository Analysis 134 | ```python 135 | from gitpandas import ProjectDirectory 136 | 137 | # Create a ProjectDirectory object 138 | project = ProjectDirectory(working_dir='/path/to/project') 139 | 140 | # Get aggregated commit history 141 | commit_history = project.commit_history(branch='master') 142 | 143 | # Get aggregated blame 144 | blame = project.blame() 145 | ``` 146 | 147 | ### GitHub Profile Analysis 148 | ```python 149 | from gitpandas import GitHubProfile 150 | 151 | # Analyze a GitHub user's repositories 152 | profile = GitHubProfile(username='username') 153 | 154 | # Get repository information 155 | repo_info = profile.repo_information() 156 | ``` 157 | 158 | ### 
Visualization 159 | ```python 160 | from gitpandas import Repository 161 | from gitpandas.utilities.plotting import plot_cumulative_blame 162 | 163 | repo = Repository(working_dir='/path/to/repo') 164 | blame = repo.cumulative_blame(branch='master') 165 | plot_cumulative_blame(blame) 166 | ``` 167 | 168 | ## Technical Implementation Details 169 | 170 | ### Git Integration 171 | - Uses GitPython under the hood to interact with git repositories 172 | - Can work with local repositories or clone remote ones 173 | - Handles bare repositories appropriately 174 | 175 | ### Performance Considerations 176 | - Caching system for expensive operations 177 | - Optional parallel processing for certain operations (requires joblib) 178 | - Filtering mechanisms to limit analysis scope 179 | 180 | ### Data Processing 181 | - Heavy use of pandas for data manipulation and aggregation 182 | - Consistent DataFrame structures across different analyses 183 | - Time series analysis capabilities 184 | 185 | ## Development Conventions 186 | 187 | ### Code Organization 188 | - Core classes in separate modules (`repository.py`, `project.py`) 189 | - Utility functions in the `utilities` package 190 | - Caching mechanisms in `cache.py` 191 | 192 | ### Error Handling 193 | - Graceful handling of missing repositories 194 | - Proper cleanup of temporary repositories 195 | - Appropriate warnings for deprecated functionality 196 | 197 | ### Dependencies 198 | - Core dependencies: GitPython, pandas, numpy 199 | - Optional dependencies: matplotlib (for plotting), redis (for Redis caching), joblib (for parallel processing) 200 | 201 | ## Practical Applications 202 | 203 | GitPandas can be used for: 204 | 1. **Project Management**: Track contributor activity and estimate effort 205 | 2. **Code Quality Analysis**: Identify areas with high churn or single-owner code 206 | 3. **Team Dynamics**: Understand how team members collaborate and contribute 207 | 4. **Historical Analysis**: Track how a project evolves over time 208 | 5. **Risk Assessment**: Identify bus factor and knowledge silos 209 | 6. **Performance Metrics**: Generate developer and team performance metrics 210 | 211 | ## Extending GitPandas 212 | 213 | Developers can extend GitPandas by: 214 | 1. Adding new analysis methods to the Repository or ProjectDirectory classes 215 | 2. Creating new visualization utilities 216 | 3. Implementing additional caching backends 217 | 4. Building higher-level abstractions for specific use cases 218 | 219 | ## Best Practices 220 | 221 | When working with GitPandas: 222 | 1. Use appropriate filtering to limit analysis scope for large repositories 223 | 2. Implement caching for repeated analyses 224 | 3. Consider using parallel processing for intensive operations 225 | 4. Combine multiple analyses for comprehensive insights 226 | 5. Visualize results for easier interpretation 227 | 228 | This overview should provide new developers with a solid understanding of GitPandas, its capabilities, and how to effectively use it for git repository analysis. 
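As a closing illustration, a short sketch tying several of these best practices together, combining scope filtering with an in-memory cache (the path and globs are placeholders, and `master` is assumed to be the branch under analysis):

```python
from gitpandas import Repository
from gitpandas.cache import EphemeralCache

# Reuse one cache instance so repeated analyses hit cached results
cache = EphemeralCache(max_keys=500)
repo = Repository(working_dir='/path/to/repo', cache_backend=cache)

# Limit scope up front: Python sources only, last 90 days of history
history = repo.commit_history(branch='master', days=90, include_globs=['*.py'])

# Blame restricted to the same file set
blame = repo.blame(include_globs=['*.py'])
```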
229 | -------------------------------------------------------------------------------- /tests/test_Project/test_properties.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pytest 3 | 4 | from gitpandas import ProjectDirectory 5 | 6 | __author__ = "willmcginnis" 7 | 8 | 9 | @pytest.fixture 10 | def remote_project(): 11 | """Fixture for a remote project directory.""" 12 | project = ProjectDirectory(working_dir=["https://github.com/wdm0006/git-pandas.git"], verbose=True) 13 | yield project 14 | project.__del__() 15 | 16 | 17 | @pytest.fixture 18 | def local_project(tmp_path): 19 | """Fixture for a local project directory with multiple repositories.""" 20 | # Create a temporary directory for the project 21 | project_dir = tmp_path / "repos" 22 | project_dir.mkdir() 23 | 24 | # Create repository directories 25 | repo1_dir = project_dir / "repository1" 26 | repo2_dir = project_dir / "repository2" 27 | repo1_dir.mkdir() 28 | repo2_dir.mkdir() 29 | 30 | # Initialize git repos 31 | grepo1 = git.Repo.init(str(repo1_dir)) 32 | grepo2 = git.Repo.init(str(repo2_dir)) 33 | 34 | # Configure git user 35 | grepo1.git.config("user.name", "Test User") 36 | grepo1.git.config("user.email", "test@example.com") 37 | grepo2.git.config("user.name", "Test User") 38 | grepo2.git.config("user.email", "test@example.com") 39 | 40 | # Rename master to main 41 | grepo1.git.branch("-M", "main") 42 | grepo2.git.branch("-M", "main") 43 | 44 | # Add README files 45 | with open(f"{repo1_dir}/README.md", "w") as f: 46 | f.write("Sample README for a sample python project\n") 47 | 48 | with open(f"{repo2_dir}/README.md", "w") as f: 49 | f.write("Sample README for a sample js project\n") 50 | 51 | # Commit them 52 | grepo1.git.add("README.md") 53 | grepo1.git.commit(m="first commit") 54 | 55 | grepo2.git.add("README.md") 56 | grepo2.git.commit(m="first commit") 57 | 58 | # Add Python files to repo1 59 | for idx in range(5): # Increased from 3 to 5 files 60 | with open(f"{repo1_dir}/file_{idx}.py", "w") as f: 61 | f.write("import sys\nimport os\n") 62 | 63 | grepo1.git.add(all=True) 64 | grepo1.git.commit(m=f"adding file_{idx}.py") 65 | 66 | # Add JS files to repo2 67 | for idx in range(5): # Increased from 3 to 5 files 68 | with open(f"{repo2_dir}/file_{idx}.js", "w") as f: 69 | f.write('document.write("hello world!");\n') 70 | 71 | grepo2.git.add(all=True) 72 | grepo2.git.commit(m=f"adding file_{idx}.js") 73 | 74 | # Create ProjectDirectory objects 75 | projectd_1 = ProjectDirectory(working_dir=[str(repo1_dir), str(repo2_dir)], verbose=True) 76 | projectd_2 = ProjectDirectory(working_dir=str(project_dir), verbose=True) 77 | 78 | yield {"projectd_1": projectd_1, "projectd_2": projectd_2} 79 | 80 | # Cleanup 81 | projectd_1.__del__() 82 | projectd_2.__del__() 83 | 84 | 85 | # Remote project tests 86 | class TestRemoteProperties: 87 | @pytest.mark.remote 88 | def test_repo_name(self, remote_project): 89 | assert "git-pandas" in list(remote_project.repo_name()["repository"].values) 90 | 91 | @pytest.mark.remote 92 | def test_branches(self, remote_project): 93 | branches = list(remote_project.branches()["branch"].values) 94 | assert "master" in branches 95 | assert "gh-pages" in branches 96 | 97 | @pytest.mark.remote 98 | def test_tags(self, remote_project): 99 | tags = list(remote_project.tags()["tag"].values) 100 | assert "0.0.1" in tags 101 | assert "0.0.2" in tags 102 | 103 | @pytest.mark.remote 104 | def test_is_bare(self, remote_project): 105 | for x in 
remote_project.is_bare()["is_bare"].values: 106 | assert not x 107 | 108 | 109 | # Local project tests 110 | class TestLocalProperties: 111 | def test_repo_name(self, local_project): 112 | projectd_1 = local_project["projectd_1"] 113 | projectd_2 = local_project["projectd_2"] 114 | 115 | assert "repository1" in list(projectd_1.repo_name()["repository"].values) 116 | assert "repository2" in list(projectd_1.repo_name()["repository"].values) 117 | assert "repository1" in list(projectd_2.repo_name()["repository"].values) 118 | assert "repository2" in list(projectd_2.repo_name()["repository"].values) 119 | 120 | def test_branches(self, local_project): 121 | projectd_1 = local_project["projectd_1"] 122 | projectd_2 = local_project["projectd_2"] 123 | 124 | branches = list(projectd_1.branches()["branch"].values) 125 | assert "main" in branches 126 | 127 | branches = list(projectd_2.branches()["branch"].values) 128 | assert "main" in branches 129 | 130 | def test_tags(self, local_project): 131 | projectd_1 = local_project["projectd_1"] 132 | projectd_2 = local_project["projectd_2"] 133 | 134 | tags = projectd_1.tags() 135 | assert len(tags) == 0 136 | 137 | tags = projectd_2.tags() 138 | assert len(tags) == 0 139 | 140 | def test_is_bare(self, local_project): 141 | projectd_1 = local_project["projectd_1"] 142 | projectd_2 = local_project["projectd_2"] 143 | 144 | bares = projectd_1.is_bare()["is_bare"].values 145 | for bare in bares: 146 | assert not bare 147 | 148 | bares = projectd_2.is_bare()["is_bare"].values 149 | for bare in bares: 150 | assert not bare 151 | 152 | def test_commit_history(self, local_project): 153 | projectd_1 = local_project["projectd_1"] 154 | 155 | ch = projectd_1.commit_history(branch="main") 156 | assert ch.shape[0] == 12 157 | 158 | ch2 = projectd_1.commit_history(branch="main", ignore_globs=["*.[!p][!y]"]) 159 | assert ch2.shape[0] == 5 160 | 161 | ch3 = projectd_1.commit_history(branch="main", limit=4) 162 | assert ch3.shape[0] == 4 163 | 164 | ch4 = projectd_1.commit_history(branch="main", days=5) 165 | assert ch4.shape[0] == 12 166 | 167 | def test_file_change_history(self, local_project): 168 | projectd_1 = local_project["projectd_1"] 169 | 170 | fch = projectd_1.file_change_history(branch="main") 171 | assert fch.shape[0] == 12 # 2 READMEs + 5 py files + 5 js files 172 | 173 | fch2 = projectd_1.file_change_history(branch="main", ignore_globs=["*.[!p][!y]"]) 174 | assert fch2.shape[0] == 5 # 5 py files 175 | 176 | fch4 = projectd_1.file_change_history(branch="main", ignore_globs=["*.[!j][!s]"]) 177 | assert fch4.shape[0] == 5 # 5 js files 178 | 179 | fch3 = projectd_1.file_change_history(branch="main", limit=4) 180 | assert fch3.shape[0] == 4 181 | 182 | def test_file_change_rates(self, local_project): 183 | projectd_1 = local_project["projectd_1"] 184 | 185 | fcr = projectd_1.file_change_rates(branch="main") 186 | assert fcr.shape[0] == 12 187 | assert "unique_committers" in fcr.columns 188 | assert "abs_rate_of_change" in fcr.columns 189 | assert "net_rate_of_change" in fcr.columns 190 | assert "net_change" in fcr.columns 191 | assert "abs_change" in fcr.columns 192 | assert "edit_rate" in fcr.columns 193 | assert "file" in fcr.columns 194 | assert "lines" in fcr.columns 195 | assert "repository" in fcr.columns 196 | # Check that data is populated 197 | assert fcr["unique_committers"].sum() > 0 198 | 199 | def test_has_coverage(self, local_project): 200 | projectd_1 = local_project["projectd_1"] 201 | 202 | # We know this repo doesn't have coverage 203 | 
coverages = projectd_1.has_coverage()["has_coverage"].values 204 | for coverage in coverages: 205 | assert not coverage 206 | 207 | def test_bus_factor(self, local_project): 208 | projectd_1 = local_project["projectd_1"] 209 | 210 | # We know this repo only has one committer 211 | bf = projectd_1.bus_factor(by="projectd") 212 | assert bf["bus factor"].values[0] == 1 213 | 214 | def test_blame(self, local_project): 215 | projectd_1 = local_project["projectd_1"] 216 | 217 | blame = projectd_1.blame(ignore_globs=["*.[!p][!y]"]) 218 | assert blame["loc"].sum() == 10 # 5 files * 2 lines each 219 | assert blame.shape[0] == 1 220 | 221 | def test_cumulative_blame(self, local_project): 222 | projectd_1 = local_project["projectd_1"] 223 | 224 | cblame = projectd_1.cumulative_blame(by="committer", branch="main") 225 | assert cblame.shape[0] > 0 # Just check that we have some rows 226 | assert cblame[cblame.columns.values[0]].sum() > 0 # Check that we have some blame data 227 | 228 | def test_revs(self, local_project): 229 | projectd_1 = local_project["projectd_1"] 230 | 231 | revs = projectd_1.revs(branch="main", num_datapoints=2) 232 | assert revs.shape[0] == 2 233 | 234 | revs = projectd_1.revs(branch="main", limit=2) 235 | assert revs.shape[0] == 2 236 | 237 | revs = projectd_1.revs(branch="main") 238 | assert revs.shape[0] == 12 # 2 READMEs + 5 py files + 5 js files 239 | -------------------------------------------------------------------------------- /docs/source/cache.rst: -------------------------------------------------------------------------------- 1 | Caching 2 | ======= 3 | 4 | Git-Pandas supports pluggable cache backends to optimize performance for expensive, repetitive operations. This is particularly useful for large repositories or when running multiple analyses. 5 | 6 | Overview 7 | -------- 8 | 9 | The caching system provides: 10 | * In-memory caching for temporary results 11 | * Disk-based caching for persistent storage across sessions 12 | * Redis-based caching for distributed storage 13 | * Cache management and invalidation methods 14 | * Decorator-based caching for expensive operations 15 | * **Cache timestamp tracking** - know when cache entries were populated 16 | * **Cache statistics and monitoring** - track cache performance and usage 17 | 18 | Available Cache Backends 19 | ------------------------ 20 | 21 | In-Memory Cache (EphemeralCache) 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | The default in-memory cache is ephemeral and will be cleared when the process ends: 25 | 26 | .. code-block:: python 27 | 28 | from gitpandas import Repository 29 | from gitpandas.cache import EphemeralCache 30 | 31 | # Create an in-memory cache with default settings 32 | cache = EphemeralCache() 33 | 34 | # Or customize the cache size 35 | cache = EphemeralCache(max_keys=500) 36 | 37 | # Use the cache with a repository 38 | repo = Repository('/path/to/repo', cache_backend=cache) 39 | 40 | Disk Cache (DiskCache) 41 | ~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | For persistent caching that survives between sessions: 44 | 45 | .. code-block:: python 46 | 47 | from gitpandas import Repository 48 | from gitpandas.cache import DiskCache 49 | 50 | # Create a disk cache 51 | cache = DiskCache(filepath='/path/to/cache.gz', max_keys=1000) 52 | 53 | # Use the cache with a repository 54 | repo = Repository('/path/to/repo', cache_backend=cache) 55 | 56 | Redis Cache (RedisDFCache) 57 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 58 | 59 | For persistent caching across sessions, use Redis: 60 | 61 | .. 
code-block:: python 62 | 63 | from gitpandas import Repository 64 | from gitpandas.cache import RedisDFCache 65 | 66 | # Create a Redis cache with default settings 67 | cache = RedisDFCache() 68 | 69 | # Or customize Redis connection and cache settings 70 | cache = RedisDFCache( 71 | host='localhost', 72 | port=6379, 73 | db=12, 74 | max_keys=1000, 75 | ttl=3600 # Cache entries expire after 1 hour 76 | ) 77 | 78 | # Use the cache with a repository 79 | repo = Repository('/path/to/repo', cache_backend=cache) 80 | 81 | Cache Timestamp Information 82 | --------------------------- 83 | 84 | All cache backends now track when cache entries were populated. You can access this information 85 | without any changes to the Repository or ProjectDirectory API: 86 | 87 | .. code-block:: python 88 | 89 | from gitpandas import Repository 90 | from gitpandas.cache import EphemeralCache 91 | 92 | # Create repository with cache 93 | cache = EphemeralCache() 94 | repo = Repository('/path/to/repo', cache_backend=cache) 95 | 96 | # Populate cache with some operations 97 | commit_history = repo.commit_history(limit=10) 98 | file_list = repo.list_files() 99 | 100 | # Check what's in the cache and when it was cached 101 | cached_keys = cache.list_cached_keys() 102 | for entry in cached_keys: 103 | print(f"Key: {entry['key']}") 104 | print(f"Cached at: {entry['cached_at']}") 105 | print(f"Age: {entry['age_seconds']:.1f} seconds") 106 | 107 | # Get specific cache information 108 | key = "commit_history_main_10_None_None_None_None" 109 | info = cache.get_cache_info(key) 110 | if info: 111 | print(f"Cache entry age: {info['age_minutes']:.2f} minutes") 112 | 113 | Cache Information Methods 114 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 115 | 116 | All cache backends support these methods for accessing timestamp information: 117 | 118 | * ``list_cached_keys()`` - Returns list of all cached keys with metadata 119 | * ``get_cache_info(key)`` - Returns detailed information about a specific cache entry 120 | 121 | The returned information includes: 122 | 123 | * ``cached_at`` - UTC timestamp when the entry was cached 124 | * ``age_seconds`` - Age of the cache entry in seconds 125 | * ``age_minutes`` - Age of the cache entry in minutes 126 | * ``age_hours`` - Age of the cache entry in hours 127 | * ``cache_key`` - The original cache key 128 | 129 | Using the Cache Decorator 130 | ------------------------- 131 | 132 | The `@multicache` decorator can be used to cache method results: 133 | 134 | .. 
code-block:: python 135 | 136 | from gitpandas.cache import multicache 137 | 138 | @multicache( 139 | key_prefix="method_name", 140 | key_list=["param1", "param2"], 141 | skip_if=lambda x: x.get("param1") is None 142 | ) 143 | def expensive_method(self, param1, param2): 144 | # Method implementation 145 | pass 146 | 147 | Configuration 148 | ------------- 149 | 150 | Cache backends can be configured with various parameters: 151 | 152 | EphemeralCache: 153 | * ``max_keys``: Maximum number of keys to store in memory (default: 1000) 154 | 155 | DiskCache: 156 | * ``filepath``: Path to the cache file (required) 157 | * ``max_keys``: Maximum number of keys to store (default: 1000) 158 | 159 | RedisDFCache: 160 | * ``host``: Redis host (default: 'localhost') 161 | * ``port``: Redis port (default: 6379) 162 | * ``db``: Redis database number (default: 12) 163 | * ``max_keys``: Maximum number of keys to store (default: 1000) 164 | * ``ttl``: Time-to-live in seconds for cache entries (default: None, no expiration) 165 | * Additional keyword arguments are passed to redis.StrictRedis 166 | 167 | Backward Compatibility 168 | ---------------------- 169 | 170 | The cache timestamp functionality is fully backward compatible: 171 | 172 | * Existing cache files will continue to work 173 | * Old cache entries without timestamps will be automatically converted 174 | * No changes to Repository or ProjectDirectory APIs 175 | * All existing code continues to work unchanged 176 | 177 | Best Practices 178 | -------------- 179 | 180 | Shared Cache Usage 181 | ~~~~~~~~~~~~~~~~~~ 182 | 183 | .. warning:: 184 | **Recommendation: Use Separate Cache Instances** 185 | 186 | While it's technically possible to share the same cache object across multiple Repository instances, 187 | we **strongly recommend using separate cache instances** for each repository for the following reasons: 188 | 189 | **Recommended Approach - Separate Caches:** 190 | 191 | .. code-block:: python 192 | 193 | from gitpandas import Repository 194 | from gitpandas.cache import DiskCache 195 | 196 | # Create separate cache instances for each repository 197 | cache1 = DiskCache(filepath='repo1_cache.gz') 198 | cache2 = DiskCache(filepath='repo2_cache.gz') 199 | 200 | repo1 = Repository('/path/to/repo1', cache_backend=cache1) 201 | repo2 = Repository('/path/to/repo2', cache_backend=cache2) 202 | 203 | **Benefits of Separate Caches:** 204 | 205 | * **Complete Isolation**: No risk of cache eviction conflicts between repositories 206 | * **Predictable Memory Usage**: Each repository has its own memory budget 207 | * **Easier Debugging**: Cache issues are isolated to specific repositories 208 | * **Better Performance**: No lock contention in multi-threaded scenarios 209 | * **Clear Cache Management**: You can clear or manage each repository's cache independently 210 | 211 | **If You Must Share Caches:** 212 | 213 | If you need to share a cache object across multiple repositories (e.g., for memory constraints), 214 | the system is designed to handle this safely: 215 | 216 | .. 
code-block:: python 217 | 218 | from gitpandas import Repository 219 | from gitpandas.cache import EphemeralCache 220 | 221 | # Shared cache (not recommended but supported) 222 | shared_cache = EphemeralCache(max_keys=1000) 223 | 224 | repo1 = Repository('/path/to/repo1', cache_backend=shared_cache) 225 | repo2 = Repository('/path/to/repo2', cache_backend=shared_cache) 226 | 227 | # Each repository gets separate cache entries 228 | files1 = repo1.list_files() # Creates cache key: list_files||repo1||None 229 | files2 = repo2.list_files() # Creates cache key: list_files||repo2||None 230 | 231 | **Shared Cache Considerations:** 232 | 233 | * Repository names are included in cache keys to prevent collisions 234 | * Cache eviction affects all repositories sharing the cache 235 | * Memory usage is shared across all repositories 236 | * Very active repositories may evict cache entries from less active ones 237 | 238 | Cache Size Planning 239 | ~~~~~~~~~~~~~~~~~~~ 240 | 241 | When planning cache sizes, consider: 242 | 243 | * **Repository Size**: Larger repositories generate more cache entries 244 | * **Operation Types**: Some operations (like ``cumulative_blame``) create many cache entries 245 | * **Memory Constraints**: Balance cache size with available system memory 246 | * **Analysis Patterns**: Frequently repeated analyses benefit from larger caches 247 | 248 | **Recommended Cache Sizes:** 249 | 250 | .. code-block:: python 251 | 252 | # Small repositories (< 1000 commits) 253 | cache = EphemeralCache(max_keys=100) 254 | 255 | # Medium repositories (1000-10000 commits) 256 | cache = EphemeralCache(max_keys=500) 257 | 258 | # Large repositories (> 10000 commits) 259 | cache = EphemeralCache(max_keys=1000) 260 | 261 | # For disk/Redis caches, you can use larger sizes 262 | cache = DiskCache(filepath='cache.gz', max_keys=5000) 263 | 264 | API Reference 265 | ------------- 266 | 267 | .. currentmodule:: gitpandas.cache 268 | 269 | .. autoclass:: EphemeralCache 270 | :members: 271 | :undoc-members: 272 | :show-inheritance: 273 | :inherited-members: 274 | :special-members: __init__ 275 | 276 | .. autoclass:: DiskCache 277 | :members: 278 | :undoc-members: 279 | :show-inheritance: 280 | :inherited-members: 281 | :special-members: __init__ 282 | 283 | .. autoclass:: RedisDFCache 284 | :members: 285 | :undoc-members: 286 | :show-inheritance: 287 | :inherited-members: 288 | :special-members: __init__ 289 | 290 | .. autofunction:: multicache 291 | 292 | .. 
autoclass:: CacheEntry 293 | :members: 294 | :undoc-members: 295 | :show-inheritance: 296 | :special-members: __init__ 297 | -------------------------------------------------------------------------------- /tests/test_Repository/test_bus_factor.py: -------------------------------------------------------------------------------- 1 | import git 2 | import pandas as pd 3 | import pytest 4 | 5 | from gitpandas import Repository 6 | 7 | 8 | @pytest.fixture 9 | def multi_committer_repo(tmp_path, default_branch): 10 | """Fixture for a local repository with multiple committers.""" 11 | # Create a temporary directory 12 | repo_dir = tmp_path / "repository1" 13 | repo_dir.mkdir() 14 | 15 | # Initialize a git repo 16 | grepo = git.Repo.init(str(repo_dir)) 17 | 18 | # Add a README file with first committer 19 | readme_path = repo_dir / "README.md" 20 | readme_path.write_text("Sample README for a sample project\n") 21 | 22 | # Commit it with first committer 23 | grepo.git.config("user.name", "User One") 24 | grepo.git.config("user.email", "user1@example.com") 25 | grepo.git.add("README.md") 26 | grepo.git.commit(m="first commit") 27 | 28 | # Add Python files with different committers 29 | committers = [ 30 | ("User One", "user1@example.com"), 31 | ("User Two", "user2@example.com"), 32 | ("User Three", "user3@example.com"), 33 | ] 34 | 35 | # Each committer adds 3 files 36 | for committer_idx, (name, email) in enumerate(committers): 37 | grepo.git.config("user.name", name) 38 | grepo.git.config("user.email", email) 39 | 40 | for file_idx in range(3): 41 | py_file = repo_dir / f"file_{committer_idx}_{file_idx}.py" 42 | py_file.write_text( 43 | f"import sys\nimport os\n\ndef function_{committer_idx}_{file_idx}():\n return {committer_idx * 10 + file_idx}\n" # noqa: E501 44 | ) 45 | 46 | grepo.git.add(all=True) 47 | grepo.git.commit(m=f"adding file_{committer_idx}_{file_idx}.py") 48 | 49 | # Create a shared file that all committers contribute to 50 | shared_file = repo_dir / "shared.py" 51 | 52 | # First committer creates the file 53 | grepo.git.config("user.name", committers[0][0]) 54 | grepo.git.config("user.email", committers[0][1]) 55 | shared_file.write_text("import sys\nimport os\n\n# Shared file\n") 56 | grepo.git.add("shared.py") 57 | grepo.git.commit(m="creating shared file") 58 | 59 | # Second committer adds to the file 60 | grepo.git.config("user.name", committers[1][0]) 61 | grepo.git.config("user.email", committers[1][1]) 62 | with open(shared_file, "a") as f: 63 | f.write('\ndef shared_function_1():\n return "shared1"\n') 64 | grepo.git.add("shared.py") 65 | grepo.git.commit(m="adding to shared file") 66 | 67 | # Third committer adds to the file 68 | grepo.git.config("user.name", committers[2][0]) 69 | grepo.git.config("user.email", committers[2][1]) 70 | with open(shared_file, "a") as f: 71 | f.write('\ndef shared_function_2():\n return "shared2"\n') 72 | grepo.git.add("shared.py") 73 | grepo.git.commit(m="adding more to shared file") 74 | 75 | # Create the Repository object 76 | git_pandas_repo = Repository(working_dir=str(repo_dir), default_branch=default_branch, verbose=True) 77 | 78 | yield git_pandas_repo 79 | 80 | # Cleanup 81 | git_pandas_repo.__del__() 82 | 83 | 84 | class TestBusFactor: 85 | def test_bus_factor_by_repository(self, multi_committer_repo, default_branch): 86 | """Test the bus_factor method with by='repository'.""" 87 | bus_factor = multi_committer_repo.bus_factor(by="repository") 88 | 89 | # Check the shape and columns 90 | assert isinstance(bus_factor, pd.DataFrame) 91 
| assert bus_factor.shape[0] == 1 92 | 93 | # Check that we have the expected columns 94 | expected_columns = ["repository", "bus factor"] 95 | for col in expected_columns: 96 | assert col in bus_factor.columns 97 | 98 | # With 3 committers, the bus factor should be at least 1 and at most 3 99 | assert 1 <= bus_factor["bus factor"].values[0] <= 3 100 | 101 | # Since each committer has contributed roughly equally, the bus factor should be close to 3 102 | # But we'll just check it's at least 1 to be safe 103 | assert bus_factor["bus factor"].values[0] >= 1 104 | 105 | def test_bus_factor_with_globs(self, multi_committer_repo, default_branch): 106 | """Test the ignore_globs and include_globs parameters.""" 107 | # Get bus factor for all files 108 | bus_factor_all = multi_committer_repo.bus_factor(by="repository") 109 | 110 | # Get bus factor ignoring files from the first committer 111 | bus_factor_no_user1 = multi_committer_repo.bus_factor(by="repository", ignore_globs=["file_0_*.py"]) 112 | 113 | # Get bus factor including only files from the first committer 114 | bus_factor_only_user1 = multi_committer_repo.bus_factor(by="repository", include_globs=["file_0_*.py"]) 115 | 116 | # The bus factor should be different when we filter files 117 | assert bus_factor_no_user1["bus factor"].values[0] <= bus_factor_all["bus factor"].values[0] 118 | assert bus_factor_only_user1["bus factor"].values[0] <= bus_factor_all["bus factor"].values[0] 119 | 120 | def test_bus_factor_calculation(self, multi_committer_repo, default_branch): 121 | """Test the bus factor calculation logic.""" 122 | # Get the blame data to understand the distribution of contributions 123 | blame = multi_committer_repo.blame(by="repository") 124 | 125 | # Calculate the bus factor manually for the repository 126 | committer_loc = blame.groupby("committer")["loc"].sum() 127 | total_loc = committer_loc.sum() 128 | 129 | # Sort committers by LOC in descending order 130 | sorted_committers = committer_loc.sort_values(ascending=False) 131 | 132 | # Calculate cumulative percentage 133 | cumulative_pct = sorted_committers.cumsum() / total_loc 134 | 135 | # Find the number of committers needed to reach 50% 136 | # The implementation counts the number of committers needed to reach >= 50% 137 | manual_bus_factor = (cumulative_pct < 0.5).sum() + 1 138 | 139 | # Get the bus factor from the method 140 | bus_factor = multi_committer_repo.bus_factor(by="repository")["bus factor"].values[0] 141 | 142 | # The calculated bus factor should match our manual calculation 143 | assert bus_factor == manual_bus_factor 144 | 145 | def test_bus_factor_by_file(self, multi_committer_repo, default_branch): 146 | """Test the bus_factor method with by='file'.""" 147 | bus_factor_df = multi_committer_repo.bus_factor(by="file") 148 | 149 | # Check the shape and columns 150 | assert isinstance(bus_factor_df, pd.DataFrame) 151 | assert bus_factor_df.shape[0] > 0 # Should have at least one file 152 | 153 | # Check that we have the expected columns 154 | expected_columns = ["file", "bus factor", "repository"] 155 | for col in expected_columns: 156 | assert col in bus_factor_df.columns 157 | 158 | # All bus factors should be at least 1 (minimum one contributor per file) 159 | assert (bus_factor_df["bus factor"] >= 1).all() 160 | 161 | # Bus factors should be reasonable (not exceed total number of committers) 162 | max_committers = 3 # We created 3 committers in the fixture 163 | assert (bus_factor_df["bus factor"] <= max_committers).all() 164 | 165 | # Repository column 
should be consistent 166 | assert len(bus_factor_df["repository"].unique()) == 1 167 | 168 | # Check that we have results for the expected files 169 | file_list = bus_factor_df["file"].tolist() 170 | 171 | # We should have Python files from our test fixture 172 | python_files = [f for f in file_list if f.endswith(".py")] 173 | assert len(python_files) > 0, "Should have Python files in results" 174 | 175 | def test_bus_factor_by_file_with_globs(self, multi_committer_repo, default_branch): 176 | """Test the file-wise bus factor with glob filtering.""" 177 | # Get bus factor for all files 178 | bus_factor_all = multi_committer_repo.bus_factor(by="file") 179 | 180 | # Get bus factor for only Python files 181 | bus_factor_py = multi_committer_repo.bus_factor(by="file", include_globs=["*.py"]) 182 | 183 | # Get bus factor excluding Python files 184 | bus_factor_no_py = multi_committer_repo.bus_factor(by="file", ignore_globs=["*.py"]) 185 | 186 | # Python-only results should be a subset of all results 187 | assert len(bus_factor_py) <= len(bus_factor_all) 188 | 189 | # All files in Python-only results should end with .py 190 | if not bus_factor_py.empty: 191 | assert bus_factor_py["file"].str.endswith(".py").all() 192 | 193 | # No files in no-Python results should end with .py 194 | if not bus_factor_no_py.empty: 195 | assert not bus_factor_no_py["file"].str.endswith(".py").any() 196 | 197 | def test_bus_factor_by_file_single_committer_files(self, multi_committer_repo, default_branch): 198 | """Test file-wise bus factor for files with single committers.""" 199 | # Get all file-wise bus factors 200 | bus_factor_df = multi_committer_repo.bus_factor(by="file") 201 | 202 | # Filter for files that have only one committer (bus factor of 1) 203 | single_committer_files = bus_factor_df[bus_factor_df["bus factor"] == 1] 204 | 205 | # Should have some single-committer files from our fixture 206 | # (each committer created their own files) 207 | assert len(single_committer_files) > 0 208 | 209 | # Verify the bus factor calculation for a single-committer file 210 | if len(single_committer_files) > 0: 211 | sample_file = single_committer_files.iloc[0]["file"] 212 | 213 | # Get blame data for this specific file 214 | blame = multi_committer_repo.blame(by="file") 215 | if isinstance(blame.index, pd.MultiIndex): 216 | blame = blame.reset_index() 217 | 218 | file_blame = blame[blame["file"] == sample_file] 219 | unique_committers = ( 220 | file_blame["committer"].nunique() 221 | if "committer" in file_blame.columns 222 | else file_blame["author"].nunique() 223 | ) 224 | 225 | # A file with bus factor 1 should have contributions that are >=50% from one person 226 | # But due to rounding and the nature of our test data, we'll just verify it's reasonable 227 | assert unique_committers >= 1 228 | --------------------------------------------------------------------------------
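The repository fixtures above request a `default_branch` fixture that is defined elsewhere in the test suite (a shared conftest not shown in this excerpt). A minimal sketch of what such a fixture could look like, offered as a hypothetical that may differ from the real definition:

```python
# tests/conftest.py (hypothetical sketch; the real fixture may differ)
import pytest


@pytest.fixture
def default_branch():
    """Branch name the repository fixtures use when creating test repos."""
    return "main"
```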