├── requirements.txt
├── MANIFEST.in
├── github_backup
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli.py
│   └── github_backup.py
├── tests
│   ├── __init__.py
│   ├── test_case_sensitivity.py
│   ├── test_pagination.py
│   ├── test_http_451.py
│   ├── test_all_starred.py
│   ├── test_json_dump_if_changed.py
│   ├── test_skip_assets_on.py
│   └── test_attachments.py
├── python-github-backup.code-workspace
├── pytest.ini
├── release-requirements.txt
├── .github
│   ├── dependabot.yml
│   ├── workflows
│   │   ├── tagged-release.yml
│   │   ├── test.yml
│   │   ├── lint.yml
│   │   ├── automatic-release.yml
│   │   └── docker.yml
│   ├── PULL_REQUEST.md
│   └── ISSUE_TEMPLATE
│       ├── bug.yaml
│       └── feature.yaml
├── .gitignore
├── bin
│   └── github-backup
├── Dockerfile
├── .dockerignore
├── LICENSE.txt
├── setup.py
├── .gitchangelog.rc
├── release
└── README.rst

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
include *.rst

--------------------------------------------------------------------------------
/github_backup/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.57.0"

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
"""Tests for python-github-backup."""

--------------------------------------------------------------------------------
/python-github-backup.code-workspace:
--------------------------------------------------------------------------------
{
    "folders": [
        {
            "path": "."
        }
    ]
}

--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts = -v

--------------------------------------------------------------------------------
/release-requirements.txt:
--------------------------------------------------------------------------------
# Linting & Formatting
autopep8==2.3.2
black==25.12.0
flake8==7.3.0

# Testing
pytest==9.0.2

# Release & Publishing
twine==6.2.0
gitchangelog==3.0.4
setuptools==80.9.0

# Documentation
restructuredtext-lint==2.0.2

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: daily
      time: "13:00"
    groups:
      python-packages:
        patterns:
          - "*"
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

--------------------------------------------------------------------------------
/github_backup/__main__.py:
--------------------------------------------------------------------------------
"""Allow running as: python -m github_backup"""

import sys

from github_backup.cli import main
from github_backup.github_backup import logger

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.github/workflows/tagged-release.yml:
--------------------------------------------------------------------------------
---
name: "tagged-release"

# yamllint disable-line rule:truthy
on:
  push:
    tags:
      - '*'

jobs:
  tagged-release:
    name: tagged-release
    runs-on: ubuntu-24.04

    steps:
      - uses: "marvinpinto/action-automatic-releases@v1.2.1"
        with:
          repo_token: "${{ secrets.GITHUB_TOKEN }}"
          prerelease: false

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# Temp files
*~
~*
.*~
\#*
.#*
*#
dist

# Build files
build
dist
pkg
*.egg
*.egg-info

# Debian Files
debian/files
debian/python-github-backup*

# Sphinx build
doc/_build

# Generated man page
doc/github_backup.1

# Annoying macOS files
.DS_Store
._*

# IDE configuration files
.vscode
.atom
.idea

README

# RSA
id_rsa
id_rsa.pub

# Virtual env
venv
.venv

--------------------------------------------------------------------------------
/bin/github-backup:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Backwards-compatible wrapper script.

The recommended way to run github-backup is via the installed command
(pip install github-backup) or python -m github_backup.
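A typical invocation looks something like this (illustrative only; apart
from -t/--token, the flag spellings here are assumptions, so check
github-backup --help for the authoritative list):

    github-backup <username> -t <token> -o /path/to/backup --all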

This script is kept for backwards compatibility with existing installations
that may reference this path directly.
"""

import sys

from github_backup.cli import main
from github_backup.github_backup import logger

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.github/PULL_REQUEST.md:
--------------------------------------------------------------------------------
# Important notice regarding filed pull requests

This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

I will attempt to review pull requests at _my_ earliest convenience. If I am unable to get to your pull request in a timely fashion, it is what it is. This repository does not pay any bills, and I am not required to merge any pull request from any individual.

If you wish to jump my personal priority queue, you may pay me for my time to review. My rate is $200 an hour - minimum 1 hour - feel free to contact me via my github email address if you want to go this route.

--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
---
name: "test"

# yamllint disable-line rule:truthy
on:
  pull_request:
    branches:
      - "*"
  push:
    branches:
      - "main"
      - "master"

jobs:
  test:
    name: test
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      - run: pip install -r release-requirements.txt
      - run: pytest tests/ -v

--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
---
name: "lint"

# yamllint disable-line rule:truthy
on:
  pull_request:
    branches:
      - "*"
  push:
    branches:
      - "main"
      - "master"

jobs:
  lint:
    name: lint
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      - run: pip install -r release-requirements.txt && pip install wheel
      - run: flake8 --ignore=E501,E203,W503
      - run: black .
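      # Note: "black ." rewrites files and exits 0 even when it reformats;
      # if this step is meant to fail the job on formatting drift,
      # "black --check ." (an assumption about the intent) is the usual spelling.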
      - run: rst-lint README.rst
      - run: python setup.py sdist bdist_wheel && twine check dist/*

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-alpine3.22 AS builder

RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir uv

WORKDIR /app

RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    --mount=type=bind,source=release-requirements.txt,target=release-requirements.txt \
    uv venv \
    && uv pip install -r release-requirements.txt

COPY . .

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install .


FROM python:3.12-alpine3.22
ENV PYTHONUNBUFFERED=1

RUN apk add --no-cache \
    ca-certificates \
    git \
    git-lfs \
    && addgroup -g 1000 appuser \
    && adduser -D -u 1000 -G appuser appuser

COPY --from=builder --chown=appuser:appuser /app /app

WORKDIR /app

USER appuser

ENV PATH="/app/.venv/bin:$PATH"

ENTRYPOINT ["github-backup"]

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
# Docker ignore file to reduce build context size

# Temp files
*~
~*
.*~
\#*
.#*
*#
dist

# Build files
build
dist
pkg
*.egg
*.egg-info

# Debian Files
debian/files
debian/python-github-backup*

# Sphinx build
doc/_build

# Generated man page
doc/github_backup.1

# Annoying macOS files
.DS_Store
._*

# IDE configuration files
.vscode
.atom
.idea
*.code-workspace

# RSA
id_rsa
id_rsa.pub

# Virtual env
venv
.venv

# Git
.git
.gitignore
.gitchangelog.rc
.github

# Documentation
*.md
!README.md

# Environment variables files
.env
.env.*
!.env.example
*.log

# Cache files
**/__pycache__/
*.py[cod]

# Docker files
docker-compose.yml
Dockerfile*

# Other files
release
*.tar
*.zip
*.gzip

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Jose Diaz-Gonzalez

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.yaml:
--------------------------------------------------------------------------------
---
name: Bug Report
description: File a bug report.
body:
  - type: markdown
    attributes:
      value: |
        # Important notice regarding filed issues

        This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

        If pull requests implementing bug fixes or enhancements are pushed, I am happy to review and merge them (time permitting).

        If you wish to have a bug fixed, you have a few options:

        - Fix it yourself and file a pull request.
        - File a bug and hope someone else fixes it for you.
        - Pay me to fix it (my rate is $200 an hour, minimum 1 hour, contact me via my [github email address](https://github.com/josegonzalez) if you want to go this route).

        In all cases, feel free to file an issue; they may be of help to others in the future.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature.yaml:
--------------------------------------------------------------------------------
---
name: Feature Request
description: File a feature request.
body:
  - type: markdown
    attributes:
      value: |
        # Important notice regarding filed issues

        This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

        If pull requests implementing bug fixes or enhancements are pushed, I am happy to review and merge them (time permitting).

        If you wish to have a feature implemented, you have a few options:

        - Implement it yourself and file a pull request.
        - File an issue and hope someone else implements it for you.
        - Pay me to implement it (my rate is $200 an hour, minimum 1 hour, contact me via my [github email address](https://github.com/josegonzalez) if you want to go this route).

        In all cases, feel free to file an issue; they may be of help to others in the future.
  - type: textarea
    id: what-would-you-like-to-happen
    attributes:
      label: What would you like to happen?
      description: Please describe in detail how the new functionality should work as well as any issues with existing functionality.
    validations:
      required: true

--------------------------------------------------------------------------------
/.github/workflows/automatic-release.yml:
--------------------------------------------------------------------------------
name: automatic-release

on:
  workflow_dispatch:
    inputs:
      release_type:
        description: Release type
        required: true
        type: choice
        options:
          - patch
          - minor
          - major

jobs:
  release:
    name: Release
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ssh-key: ${{ secrets.DEPLOY_PRIVATE_KEY }}
      - name: Setup Git
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install prerequisites
        run: pip install -r release-requirements.txt
      - name: Execute release
        env:
          SEMVER_BUMP: ${{ github.event.inputs.release_type }}
          TWINE_REPOSITORY: ${{ vars.TWINE_REPOSITORY }}
          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
        run: ./release $SEMVER_BUMP

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from github_backup import __version__

try:
    from setuptools import setup

    setup  # workaround for pyflakes issue #13
except ImportError:
    from distutils.core import setup

# Hack to prevent stupid TypeError: 'NoneType' object is not callable error on
# exit of python setup.py test # in multiprocessing/util.py _exit_function when
# running python setup.py test (see
# http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html)
try:
    import multiprocessing

    multiprocessing
except ImportError:
    pass


def open_file(fname):
    return open(os.path.join(os.path.dirname(__file__), fname))


setup(
    name="github-backup",
    version=__version__,
    author="Jose Diaz-Gonzalez",
    author_email="github-backup@josediazgonzalez.com",
    packages=["github_backup"],
    entry_points={
        "console_scripts": [
            "github-backup=github_backup.cli:main",
        ],
    },
    url="http://github.com/josegonzalez/python-github-backup",
    license="MIT",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Topic :: System :: Archiving :: Backup",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
    ],
    description="backup a github user or organization",
    long_description=open_file("README.rst").read(),
    long_description_content_type="text/x-rst",
    install_requires=open_file("requirements.txt").readlines(),
    python_requires=">=3.10",
    zip_safe=True,
)

--------------------------------------------------------------------------------
/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Create and publish a Docker image

on:
  push:
    branches:
      - 'master'
      - 'main'
      - 'dev'

    tags:
      - 'v*'
      - 'v*.*'
      - 'v*.*.*'
      - '*'
      - '*.*'
      - '*.*.*'
  pull_request:
    branches:
      - 'main'
      - 'dev'


env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          persist-credentials: false

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            type=sha
            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: linux/amd64,linux/arm64
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

--------------------------------------------------------------------------------
/github_backup/cli.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Command-line interface for github-backup."""

import logging
import os
import sys

from github_backup.github_backup import (
    backup_account,
    backup_repositories,
    check_git_lfs_install,
    filter_repositories,
    get_auth,
    get_authenticated_user,
    logger,
    mkdir_p,
    parse_args,
    retrieve_repositories,
)

# INFO and DEBUG go to stdout, WARNING and above go to stderr
log_format = logging.Formatter(
    fmt="%(asctime)s.%(msecs)03d: %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.DEBUG)
stdout_handler.addFilter(lambda r: r.levelno < logging.WARNING)
stdout_handler.setFormatter(log_format)

stderr_handler = logging.StreamHandler(sys.stderr)
stderr_handler.setLevel(logging.WARNING)
stderr_handler.setFormatter(log_format)

logging.basicConfig(level=logging.INFO, handlers=[stdout_handler, stderr_handler])


def main():
    """Main entry point for github-backup CLI."""
    args = parse_args()

    if args.private and not get_auth(args):
        logger.warning(
            "The --private flag has no effect without authentication. "
            "Use -t/--token, -f/--token-fine, or -u/--username to authenticate."
        )

    if args.quiet:
        logger.setLevel(logging.WARNING)

    output_directory = os.path.realpath(args.output_directory)
    if not os.path.isdir(output_directory):
        logger.info("Create output directory {0}".format(output_directory))
        mkdir_p(output_directory)

    if args.lfs_clone:
        check_git_lfs_install()

    if args.log_level:
        log_level = logging.getLevelName(args.log_level.upper())
        if isinstance(log_level, int):
            logger.root.setLevel(log_level)

    if not args.as_app:
        logger.info("Backing up user {0} to {1}".format(args.user, output_directory))
        authenticated_user = get_authenticated_user(args)
    else:
        authenticated_user = {"login": None}

    repositories = retrieve_repositories(args, authenticated_user)
    repositories = filter_repositories(args, repositories)
    backup_repositories(args, output_directory, repositories)
    backup_account(args, output_directory)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.gitchangelog.rc:
--------------------------------------------------------------------------------
#
# Format
#
#   ACTION: [AUDIENCE:] COMMIT_MSG [@TAG ...]
#
# Description
#
#   ACTION is one of 'chg', 'fix', 'new'
#
#   This is WHAT the change is about.
#
#   'chg' is for refactor, small improvement, cosmetic changes...
#   'fix' is for bug fixes
#   'new' is for new features, big improvement
#
#   AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc'
#
#   This is WHO is concerned by the change.
#
#   'dev'  is for developers (API changes, refactors...)
#   'usr'  is for final users (UI changes)
#   'pkg'  is for packagers (packaging changes)
#   'test' is for testers (test only related changes)
#   'doc'  is for doc guys (doc only changes)
#
#   COMMIT_MSG is ... well ... the commit message itself.
#
#   TAGs are additional adjectives such as 'refactor', 'minor', 'cosmetic'
#
#   'refactor' is obviously for refactoring code only
#   'minor' is for a very meaningless change (a typo, adding a comment)
#   'cosmetic' is for cosmetic driven change (re-indentation, 80-col...)
#
# Example:
#
#   new: usr: support of bazaar implemented
#   chg: re-indented some lines @cosmetic
#   new: dev: updated code to be compatible with last version of killer lib.
#   fix: pkg: updated year of licence coverage.
#   new: test: added a bunch of test around user usability of feature X.
#   fix: typo in spelling my name in comment. @minor
#
#   Please note that multi-line commit messages are supported, and only the
#   first line will be considered as the "summary" of the commit message. So
#   tags, and other rules only apply to the summary. The body of the commit
#   message will be displayed in the changelog with minor reformatting.

#
# ``ignore_regexps`` is a list of regexps
#
# Any commit having its full commit message matching any regexp listed here
# will be ignored and won't be reported in the changelog.
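#
# For example, with the pattern below, commits whose summaries start with
# "Merge pull request", "Merge branch", "Release", or "Update" are dropped
# from the generated changelog.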
#
ignore_regexps = [
    r'(?i)^(Merge pull request|Merge branch|Release|Update)',
]


#
# ``replace_regexps`` is a dict associating a regexp pattern and its replacement
#
# It will be applied to get the summary line from the full commit message.
#
# Note that you can provide multiple replacement patterns, they will all be
# tried. If none matches, the summary line will be the full commit message.
#
replace_regexps = {
    # current format (ie: 'chg: dev: my commit msg @tag1 @tag2')

    r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$':
        r'\4',
}


# ``section_regexps`` is a list of 2-tuples associating a string label and a
# list of regexp
#
# Commit messages will be classified in sections thanks to this. Section
# titles are the label, and a commit is classified under this section if any
# of the regexps associated is matching.
#
section_regexps = [
    ('New', [
        r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Changes', [
        r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Fix', [
        r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Other', None  # Match all lines
     ),

]

# ``body_split_regexp`` is a regexp
#
# Commit message body (not the summary) if existing will be split
# (new line) on this regexp
#
body_split_regexp = r'[\n-]'


# ``tag_filter_regexp`` is a regexp
#
# Tags that will be used for the changelog must match this regexp.
#
# tag_filter_regexp = r'^[0-9]+$'
tag_filter_regexp = r'^(?:[vV])?[0-9\.]+$'


# ``unreleased_version_label`` is a string
#
# This label will be used as the changelog Title of the last set of changes
# between last valid tag and HEAD if any.
unreleased_version_label = "%%version%% (unreleased)"

--------------------------------------------------------------------------------
/tests/test_case_sensitivity.py:
--------------------------------------------------------------------------------
"""Tests for case-insensitive username/organization filtering."""

import pytest
from unittest.mock import Mock

from github_backup import github_backup


class TestCaseSensitivity:
    """Test suite for case-insensitive username matching in filter_repositories."""

    def test_filter_repositories_case_insensitive_user(self):
        """Should filter repositories case-insensitively for usernames.

        Reproduces issue #198 where typing 'iamrodos' fails to match
        repositories with owner.login='Iamrodos' (the canonical case from GitHub API).
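
        The fix under test is assumed to lowercase both sides of the
        comparison, e.g. repo["owner"]["login"].lower() == args.user.lower().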
        """
        # Simulate user typing lowercase username
        args = Mock()
        args.user = "iamrodos"  # lowercase (what user typed)
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        # Simulate GitHub API returning canonical case
        repos = [
            {
                "name": "repo1",
                "owner": {"login": "Iamrodos"},  # Capital I (canonical from API)
                "private": False,
                "fork": False,
            },
            {
                "name": "repo2",
                "owner": {"login": "Iamrodos"},
                "private": False,
                "fork": False,
            },
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match despite case difference
        assert len(filtered) == 2
        assert filtered[0]["name"] == "repo1"
        assert filtered[1]["name"] == "repo2"

    def test_filter_repositories_case_insensitive_org(self):
        """Should filter repositories case-insensitively for organizations.

        Tests the example from issue #198 where 'prai-org' doesn't match 'PRAI-Org'.
        """
        args = Mock()
        args.user = "prai-org"  # lowercase (what user typed)
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        repos = [
            {
                "name": "repo1",
                "owner": {"login": "PRAI-Org"},  # Different case (canonical from API)
                "private": False,
                "fork": False,
            },
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match despite case difference
        assert len(filtered) == 1
        assert filtered[0]["name"] == "repo1"

    def test_filter_repositories_case_variations(self):
        """Should handle various case combinations correctly."""
        args = Mock()
        args.user = "TeSt-UsEr"  # Mixed case
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        repos = [
            {"name": "repo1", "owner": {"login": "test-user"}, "private": False, "fork": False},
            {"name": "repo2", "owner": {"login": "TEST-USER"}, "private": False, "fork": False},
            {"name": "repo3", "owner": {"login": "TeSt-UsEr"}, "private": False, "fork": False},
            {"name": "repo4", "owner": {"login": "other-user"}, "private": False, "fork": False},
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match first 3 (all case variations of same user)
        assert len(filtered) == 3
        assert set(r["name"] for r in filtered) == {"repo1", "repo2", "repo3"}


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/release:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -eo pipefail
[[ $RELEASE_TRACE ]] && set -x

if [[ ! -f setup.py ]]; then
    echo -e "${RED}WARNING: Missing setup.py${COLOR_OFF}\n"
    exit 1
fi

PACKAGE_NAME="$(cat setup.py | grep 'name="' | head -n1 | cut -d '"' -f2)"
INIT_PACKAGE_NAME="$(echo "${PACKAGE_NAME//-/_}")"
PUBLIC="true"

# Colors
COLOR_OFF="\033[0m"  # unsets color to term fg color
RED="\033[0;31m"     # red
GREEN="\033[0;32m"   # green
YELLOW="\033[0;33m"  # yellow
MAGENTA="\033[0;35m" # magenta
CYAN="\033[0;36m"    # cyan

# ensure wheel is available
pip install wheel >/dev/null

command -v gitchangelog >/dev/null 2>&1 || {
    echo -e "${RED}WARNING: Missing gitchangelog binary, please run: pip install gitchangelog==3.0.4${COLOR_OFF}\n"
    exit 1
}

command -v rst-lint >/dev/null || {
    echo -e "${RED}WARNING: Missing rst-lint binary, please run: pip install restructuredtext_lint${COLOR_OFF}\n"
    exit 1
}

command -v twine >/dev/null || {
    echo -e "${RED}WARNING: Missing twine binary, please run: pip install twine==3.2.0${COLOR_OFF}\n"
    exit 1
}

if [[ "$@" != "major" ]] && [[ "$@" != "minor" ]] && [[ "$@" != "patch" ]]; then
    echo -e "${RED}WARNING: Invalid release type, must specify 'major', 'minor', or 'patch'${COLOR_OFF}\n"
    exit 1
fi

echo -e "\n${GREEN}STARTING RELEASE PROCESS${COLOR_OFF}\n"

set +e
git status | grep -Eo "working (directory|tree) clean" &>/dev/null
if [ ! $? -eq 0 ]; then # working directory is NOT clean
    echo -e "${RED}WARNING: You have uncommitted changes, you may have forgotten something${COLOR_OFF}\n"
    exit 1
fi
set -e

echo -e "${YELLOW}--->${COLOR_OFF} Updating local copy"
git pull -q origin master

echo -e "${YELLOW}--->${COLOR_OFF} Retrieving release versions"

current_version=$(cat ${INIT_PACKAGE_NAME}/__init__.py | grep '__version__ =' | sed 's/[^0-9.]//g')
major=$(echo $current_version | awk '{split($0,a,"."); print a[1]}')
minor=$(echo $current_version | awk '{split($0,a,"."); print a[2]}')
patch=$(echo $current_version | awk '{split($0,a,"."); print a[3]}')

if [[ "$@" == "major" ]]; then
    major=$(($major + 1))
    minor="0"
    patch="0"
elif [[ "$@" == "minor" ]]; then
    minor=$(($minor + 1))
    patch="0"
elif [[ "$@" == "patch" ]]; then
    patch=$(($patch + 1))
fi

next_version="${major}.${minor}.${patch}"

echo -e "${YELLOW}  >${COLOR_OFF} ${MAGENTA}${current_version}${COLOR_OFF} -> ${MAGENTA}${next_version}${COLOR_OFF}"

echo -e "${YELLOW}--->${COLOR_OFF} Ensuring readme passes lint checks (if this fails, run rst-lint)"
rst-lint README.rst || exit 1

echo -e "${YELLOW}--->${COLOR_OFF} Creating necessary temp file"
tempfoo=$(basename $0)
TMPFILE=$(mktemp /tmp/${tempfoo}.XXXXXX) || {
    echo -e "${RED}WARNING: Cannot create temp file using mktemp in /tmp dir ${COLOR_OFF}\n"
    exit 1
}

find_this="__version__ = \"$current_version\""
replace_with="__version__ = \"$next_version\""

echo -e "${YELLOW}--->${COLOR_OFF} Updating ${INIT_PACKAGE_NAME}/__init__.py"
sed "s/$find_this/$replace_with/" ${INIT_PACKAGE_NAME}/__init__.py >$TMPFILE && mv $TMPFILE ${INIT_PACKAGE_NAME}/__init__.py

if [ -f docs/conf.py ]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Updating docs"
    find_this="version = '${current_version}'"
    replace_with="version = '${next_version}'"
    sed "s/$find_this/$replace_with/" docs/conf.py >$TMPFILE && mv $TMPFILE docs/conf.py

    find_this="release = '${current_version}'"
    replace_with="release = '${next_version}'"
    sed "s/$find_this/$replace_with/" docs/conf.py >$TMPFILE && mv $TMPFILE docs/conf.py
fi

echo -e "${YELLOW}--->${COLOR_OFF} Updating CHANGES.rst for new release"
version_header="$next_version ($(date +%F))"
set +e
dashes=$(yes '-' | head -n ${#version_header} | tr -d '\n')
set -e
gitchangelog | sed "4s/.*/$version_header/" | sed "5s/.*/$dashes/" >$TMPFILE && mv $TMPFILE CHANGES.rst

echo -e "${YELLOW}--->${COLOR_OFF} Adding changed files to git"
git add CHANGES.rst README.rst ${INIT_PACKAGE_NAME}/__init__.py
if [ -f docs/conf.py ]; then git add docs/conf.py; fi

echo -e "${YELLOW}--->${COLOR_OFF} Creating release"
git commit -q -m "Release version $next_version"

if [[ "$PUBLIC" == "true" ]]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Creating python release files"
    cp README.rst README
    python setup.py sdist bdist_wheel >/dev/null

    echo -e "${YELLOW}--->${COLOR_OFF} Validating long_description"
    twine check dist/*
fi

echo -e "${YELLOW}--->${COLOR_OFF} Tagging release"
git tag -a $next_version -m "Release version $next_version"

echo -e "${YELLOW}--->${COLOR_OFF} Pushing release and tags to github"
git push -q origin master && git push -q --tags

if [[ "$PUBLIC" == "true" ]]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Uploading python release"
    twine upload dist/*
    rm README
fi

echo -e "\n${CYAN}RELEASED VERSION ${next_version}!${COLOR_OFF}\n"

--------------------------------------------------------------------------------
/tests/test_pagination.py:
--------------------------------------------------------------------------------
"""Tests for Link header pagination handling."""

import json
from unittest.mock import Mock, patch

import pytest

from github_backup import github_backup


class MockHTTPResponse:
    """Mock HTTP response for paginated API calls."""

    def __init__(self, data, link_header=None):
        self._content = json.dumps(data).encode("utf-8")
        self._link_header = link_header
        self._read = False
        self.reason = "OK"

    def getcode(self):
        return 200

    def read(self):
        if self._read:
            return b""
        self._read = True
        return self._content

    def get_header(self, name, default=None):
        """Mock method for headers.get()."""
        return self.headers.get(name, default)

    @property
    def headers(self):
        headers = {"x-ratelimit-remaining": "5000"}
        if self._link_header:
            headers["Link"] = self._link_header
        return headers


@pytest.fixture
def mock_args():
    """Mock args for retrieve_data_gen."""
    args = Mock()
    args.as_app = False
    args.token_fine = None
    args.token_classic = "fake_token"
    args.username = None
    args.password = None
    args.osx_keychain_item_name = None
    args.osx_keychain_item_account = None
    args.throttle_limit = None
    args.throttle_pause = 0
    return args


def test_cursor_based_pagination(mock_args):
    """Link header with 'after' cursor parameter works correctly."""

    # Simulate issues endpoint behavior: returns cursor in Link header
    responses = [
        # Issues endpoint returns 'after' cursor parameter (not 'page')
        MockHTTPResponse(
            data=[{"issue": i} for i in range(1, 101)],  # Page 1 contents
            link_header='<https://api.github.com/repos/owner/repo/issues?after=ABC123>; rel="next"',
        ),
        MockHTTPResponse(
            data=[{"issue": i} for i in range(101, 151)],  # Page 2 contents
            link_header=None,  # No Link header - signals end of pagination
        ),
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        url = request.get_full_url()
        requests_made.append(url)
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/issues"
            )
        )

    # Verify all items retrieved and cursor was used in second request
    assert len(results) == 150
    assert len(requests_made) == 2
    assert "after=ABC123" in requests_made[1]


def test_page_based_pagination(mock_args):
    """Link header with 'page' parameter works correctly."""

    # Simulate pulls/repos endpoint behavior: returns page numbers in Link header
    responses = [
        # Pulls endpoint uses traditional 'page' parameter (not cursor)
        MockHTTPResponse(
            data=[{"pull": i} for i in range(1, 101)],  # Page 1 contents
            link_header='<https://api.github.com/repos/owner/repo/pulls?page=2>; rel="next"',
        ),
        MockHTTPResponse(
            data=[{"pull": i} for i in range(101, 181)],  # Page 2 contents
            link_header=None,  # No Link header - signals end of pagination
        ),
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        url = request.get_full_url()
        requests_made.append(url)
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/pulls"
            )
        )

    # Verify all items retrieved and page parameter was used (not cursor)
    assert len(results) == 180
    assert len(requests_made) == 2
    assert "page=2" in requests_made[1]
    assert "after" not in requests_made[1]


def test_no_link_header_stops_pagination(mock_args):
    """Pagination stops when Link header is absent."""

    # Simulate endpoint with results that fit in a single page
    responses = [
        MockHTTPResponse(
            data=[{"label": i} for i in range(1, 51)],  # Page contents
            link_header=None,  # No Link header - signals end of pagination
        )
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        requests_made.append(request.get_full_url())
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/labels"
            )
        )

    # Verify pagination stopped after first request
    assert len(results) == 50
    assert len(requests_made) == 1

--------------------------------------------------------------------------------
/tests/test_http_451.py:
--------------------------------------------------------------------------------
"""Tests for HTTP 451 (DMCA takedown) handling."""

import json
from unittest.mock import Mock, patch

import pytest

from github_backup import github_backup


class TestHTTP451Exception:
    """Test suite for HTTP 451 DMCA takedown exception handling."""

    def test_repository_unavailable_error_raised(self):
        """HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
        # Create mock args
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        # Mock HTTPError 451 response
        mock_response = Mock()
        mock_response.getcode.return_value = 451

        dmca_data = {
            "message": "Repository access blocked",
            "block": {
                "reason": "dmca",
                "created_at": "2024-11-12T14:38:04Z",
                "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
            }
        }
        mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

        # Check exception has DMCA URL
        assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_without_dmca_url(self):
        """HTTP 451 without DMCA details should still raise exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 451
        mock_response.read.return_value = b'{"message": "Blocked"}'
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

        # Exception raised even without DMCA URL
        assert exc_info.value.dmca_url is None
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_with_malformed_json(self):
        """HTTP 451 with malformed JSON should still raise exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 451
        mock_response.read.return_value = b"invalid json {"
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError):
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

    def test_other_http_errors_unchanged(self):
        """Other HTTP errors should still raise generic Exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 404
        mock_response.read.return_value = b'{"message": "Not Found"}'
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Not Found"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            # Should raise generic Exception, not RepositoryUnavailableError
            with pytest.raises(Exception) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/notfound/issues"))

        assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
        assert "404" in str(exc_info.value)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_all_starred.py:
--------------------------------------------------------------------------------
"""Tests for --all-starred flag behavior (issue #225)."""

import pytest
from unittest.mock import Mock, patch

from github_backup import github_backup


class TestAllStarredCloning:
    """Test suite for --all-starred repository cloning behavior.

    Issue #225: --all-starred should clone starred repos without requiring --repositories.
    """

    def _create_mock_args(self, **overrides):
        """Create a mock args object with sensible defaults."""
        args = Mock()
        args.user = "testuser"
        args.output_directory = "/tmp/backup"
        args.include_repository = False
        args.include_everything = False
        args.include_gists = False
        args.include_starred_gists = False
        args.all_starred = False
        args.skip_existing = False
        args.bare_clone = False
        args.lfs_clone = False
        args.no_prune = False
        args.include_wiki = False
        args.include_issues = False
        args.include_issue_comments = False
        args.include_issue_events = False
        args.include_pulls = False
        args.include_pull_comments = False
        args.include_pull_commits = False
        args.include_pull_details = False
        args.include_labels = False
        args.include_hooks = False
        args.include_milestones = False
        args.include_releases = False
        args.include_assets = False
        args.include_attachments = False
        args.incremental = False
        args.incremental_by_files = False
        args.github_host = None
        args.prefer_ssh = False
        args.token_classic = None
        args.token_fine = None
        args.username = None
        args.password = None
        args.as_app = False
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None

        for key, value in overrides.items():
            setattr(args, key, value)

        return args

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_all_starred_clones_without_repositories_flag(self, mock_get_url, mock_fetch):
        """--all-starred should clone starred repos without --repositories flag.

        This is the core fix for issue #225.
        """
        args = self._create_mock_args(all_starred=True)
        mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git"

        # A starred repository (is_starred flag set by retrieve_repositories)
        starred_repo = {
            "name": "awesome-project",
            "full_name": "otheruser/awesome-project",
            "owner": {"login": "otheruser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            "is_starred": True,  # This flag is set for starred repos
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [starred_repo])

        # fetch_repository should be called for the starred repo
        assert mock_fetch.called, "--all-starred should trigger repository cloning"
        mock_fetch.assert_called_once()
        call_args = mock_fetch.call_args
        assert call_args[0][0] == "awesome-project"  # repo name

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_starred_repo_not_cloned_without_all_starred_flag(self, mock_get_url, mock_fetch):
        """Starred repos should NOT be cloned if --all-starred is not set."""
        args = self._create_mock_args(all_starred=False)
        mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git"

        starred_repo = {
            "name": "awesome-project",
            "full_name": "otheruser/awesome-project",
            "owner": {"login": "otheruser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            "is_starred": True,
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [starred_repo])

        # fetch_repository should NOT be called
        assert not mock_fetch.called, "Starred repos should not be cloned without --all-starred"

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_non_starred_repo_not_cloned_with_only_all_starred(self, mock_get_url, mock_fetch):
        """Non-starred repos should NOT be cloned when only --all-starred is set."""
        args = self._create_mock_args(all_starred=True)
        mock_get_url.return_value = "https://github.com/testuser/my-project.git"

        # A regular (non-starred) repository
        regular_repo = {
            "name": "my-project",
            "full_name": "testuser/my-project",
            "owner": {"login": "testuser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            # No is_starred flag
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [regular_repo])

        # fetch_repository should NOT be called for non-starred repos
        assert not mock_fetch.called, "Non-starred repos should not be cloned with only --all-starred"

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_repositories_flag_still_works(self, mock_get_url, mock_fetch):
        """--repositories flag should still clone repos as before."""
        args = self._create_mock_args(include_repository=True)
        mock_get_url.return_value = "https://github.com/testuser/my-project.git"

        regular_repo = {
            "name": "my-project",
            "full_name": "testuser/my-project",
            "owner": {"login": "testuser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [regular_repo])

        # fetch_repository should be called
        assert mock_fetch.called, "--repositories should trigger repository cloning"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_json_dump_if_changed.py:
--------------------------------------------------------------------------------
"""Tests for json_dump_if_changed functionality."""

import codecs
import json
import os
import tempfile

import pytest

from github_backup import github_backup


class TestJsonDumpIfChanged:
    """Test suite for json_dump_if_changed function."""

    def test_writes_new_file(self):
        """Should write file when it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Verify content matches expected format
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()
            loaded = json.loads(content)
            assert loaded == test_data

    def test_skips_unchanged_file(self):
        """Should skip write when content is identical."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result1 is True

            # Get the initial mtime
            mtime1 = os.path.getmtime(output_file)

            # Second write with same data
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

            # File should not have been modified
            mtime2 = os.path.getmtime(output_file)
            assert mtime1 == mtime2

    def test_writes_when_content_changed(self):
        """Should write file when content has changed."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data1 = {"key": "value1"}
            test_data2 = {"key": "value2"}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
            assert result1 is True

            # Second write with different data
            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
            assert result2 is True

            # Verify new content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data2

    def test_uses_consistent_formatting(self):
        """Should use same JSON formatting as json_dump."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"z": "last", "a": "first", "m": "middle"}

            github_backup.json_dump_if_changed(test_data, output_file)

            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()

            # Check for consistent formatting:
            # - sorted keys
            # - 4-space indent
            # - comma-colon-space separator
            expected = json.dumps(
                test_data,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(",", ": "),
            )
            assert content == expected

    def test_atomic_write_always_used(self):
        """Should always use temp file and rename for atomic writes."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Temp file should not exist after atomic write
            temp_file = output_file + ".temp"
            assert not os.path.exists(temp_file)

            # Verify content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_handles_unicode_content(self):
        """Should correctly handle Unicode content."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "emoji": "🚀",
                "chinese": "你好",
                "arabic": "مرحبا",
                "cyrillic": "Привет",
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify Unicode is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

            # Second write should skip
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

    def test_handles_complex_nested_data(self):
        """Should handle complex nested data structures."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "users": [
                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
                    {"id": 2, "name": "Bob", "tags": ["user"]},
                ],
                "metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify structure is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_overwrites_on_unicode_decode_error(self):
        """Should overwrite if existing file has invalid UTF-8."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            # Write invalid UTF-8 bytes
            with open(output_file, "wb") as f:
                f.write(b"\xff\xfe invalid utf-8")

            # Should catch UnicodeDecodeError and overwrite
            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify new content was written
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_key_order_independence(self):
        """Should treat differently-ordered dicts as same if keys/values match."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")

            # Write first dict
            data1 = {"z": 1, "a": 2, "m": 3}
            github_backup.json_dump_if_changed(data1, output_file)

            # Try to write same data but different order
            data2 = {"a": 2, "m": 3, "z": 1}
            result = github_backup.json_dump_if_changed(data2, output_file)

            # Should skip because content is the same (keys are sorted)
            assert result is False


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_skip_assets_on.py:
--------------------------------------------------------------------------------
"""Tests for --skip-assets-on flag behavior (issue #135)."""

import pytest
from unittest.mock import Mock, patch

from github_backup import github_backup


class TestSkipAssetsOn:
    """Test suite for --skip-assets-on flag.

    Issue #135: Allow skipping asset downloads for specific repositories
    while still backing up release metadata.
    """

    def _create_mock_args(self, **overrides):
        """Create a mock args object with sensible defaults."""
        args = Mock()
        args.user = "testuser"
        args.output_directory = "/tmp/backup"
        args.include_repository = False
        args.include_everything = False
        args.include_gists = False
        args.include_starred_gists = False
        args.all_starred = False
        args.skip_existing = False
        args.bare_clone = False
        args.lfs_clone = False
        args.no_prune = False
        args.include_wiki = False
        args.include_issues = False
        args.include_issue_comments = False
        args.include_issue_events = False
        args.include_pulls = False
        args.include_pull_comments = False
        args.include_pull_commits = False
        args.include_pull_details = False
        args.include_labels = False
        args.include_hooks = False
        args.include_milestones = False
        args.include_releases = True
        args.include_assets = True
        args.skip_assets_on = []
        args.include_attachments = False
        args.incremental = False
        args.incremental_by_files = False
        args.github_host = None
        args.prefer_ssh = False
        args.token_classic = "test-token"
        args.token_fine = None
        args.username = None
        args.password = None
        args.as_app = False
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.skip_prerelease = False
        args.number_of_latest_releases = None

        for key, value in overrides.items():
            setattr(args, key, value)

        return args

    def _create_mock_repository(self, name="test-repo", owner="testuser"):
        """Create a mock repository object."""
        return {
            "name": name,
            "full_name": f"{owner}/{name}",
            "owner": {"login": owner},
            "private": False,
            "fork": False,
            "has_wiki": False,
        }

    def _create_mock_release(self, tag="v1.0.0"):
        """Create a mock release object."""
        return {
            "tag_name": tag,
            "name": tag,
            "prerelease": False,
            "draft": False,
            "assets_url": f"https://api.github.com/repos/testuser/test-repo/releases/{tag}/assets",
        }

    def _create_mock_asset(self, name="asset.zip"):
        """Create a mock asset object."""
        return {
            "name": name,
            "url": f"https://api.github.com/repos/testuser/test-repo/releases/assets/{name}",
        }


class TestSkipAssetsOnArgumentParsing(TestSkipAssetsOn):
    """Tests for --skip-assets-on argument parsing."""

    def test_skip_assets_on_not_set_defaults_to_none(self):
        """When --skip-assets-on is not specified, it should default to None."""
        args = github_backup.parse_args(["testuser"])
        assert args.skip_assets_on is None

    def test_skip_assets_on_single_repo(self):
        """Single --skip-assets-on should create list with one item."""
        args = github_backup.parse_args(["testuser", "--skip-assets-on", "big-repo"])
        assert args.skip_assets_on == ["big-repo"]

    def test_skip_assets_on_multiple_repos(self):
        """Multiple repos can be specified space-separated (like --exclude)."""
        args = github_backup.parse_args(
            [
                "testuser",
                "--skip-assets-on",
                "big-repo",
                "another-repo",
                "owner/third-repo",
            ]
        )
        assert args.skip_assets_on == ["big-repo", "another-repo", "owner/third-repo"]


class TestSkipAssetsOnBehavior(TestSkipAssetsOn):
    """Tests for --skip-assets-on behavior in backup_releases."""

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_downloaded_when_not_skipped(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be downloaded when repo is not in skip list."""
        args = self._create_mock_args(skip_assets_on=[])
        repository = self._create_mock_repository(name="normal-repo")
        release = self._create_mock_release()
        asset = self._create_mock_asset()

        mock_json_dump.return_value = True
        mock_retrieve.side_effect = [
            [release],  # First call: get releases
            [asset],  # Second call: get assets
        ]

        with patch("os.path.join", side_effect=lambda *args: "/".join(args)):
            github_backup.backup_releases(
                args,
                "/tmp/backup/repositories/normal-repo",
                repository,
                "https://api.github.com/repos/{owner}/{repo}",
                include_assets=True,
            )

        # download_file should have been called for the asset
        mock_download.assert_called_once()

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_skipped_when_repo_name_matches(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be skipped when repo name is in skip list."""
        args = self._create_mock_args(skip_assets_on=["big-repo"])
        repository = self._create_mock_repository(name="big-repo")
        release = self._create_mock_release()

        mock_json_dump.return_value = True
        mock_retrieve.return_value = [release]

        github_backup.backup_releases(
            args,
            "/tmp/backup/repositories/big-repo",
            repository,
            "https://api.github.com/repos/{owner}/{repo}",
            include_assets=True,
        )

        # download_file should NOT have been called
        mock_download.assert_not_called()

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_skipped_when_full_name_matches(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be skipped when owner/repo format matches."""
        args = self._create_mock_args(skip_assets_on=["otheruser/big-repo"])
        repository =
self._create_mock_repository(name="big-repo", owner="otheruser") 190 | release = self._create_mock_release() 191 | 192 | mock_json_dump.return_value = True 193 | mock_retrieve.return_value = [release] 194 | 195 | github_backup.backup_releases( 196 | args, 197 | "/tmp/backup/repositories/big-repo", 198 | repository, 199 | "https://api.github.com/repos/{owner}/{repo}", 200 | include_assets=True, 201 | ) 202 | 203 | # download_file should NOT have been called 204 | mock_download.assert_not_called() 205 | 206 | @patch("github_backup.github_backup.download_file") 207 | @patch("github_backup.github_backup.retrieve_data") 208 | @patch("github_backup.github_backup.mkdir_p") 209 | @patch("github_backup.github_backup.json_dump_if_changed") 210 | def test_case_insensitive_matching( 211 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 212 | ): 213 | """Skip matching should be case-insensitive.""" 214 | # User types uppercase, repo name is lowercase 215 | args = self._create_mock_args(skip_assets_on=["BIG-REPO"]) 216 | repository = self._create_mock_repository(name="big-repo") 217 | release = self._create_mock_release() 218 | 219 | mock_json_dump.return_value = True 220 | mock_retrieve.return_value = [release] 221 | 222 | github_backup.backup_releases( 223 | args, 224 | "/tmp/backup/repositories/big-repo", 225 | repository, 226 | "https://api.github.com/repos/{owner}/{repo}", 227 | include_assets=True, 228 | ) 229 | 230 | # download_file should NOT have been called (case-insensitive match) 231 | assert not mock_download.called 232 | 233 | @patch("github_backup.github_backup.download_file") 234 | @patch("github_backup.github_backup.retrieve_data") 235 | @patch("github_backup.github_backup.mkdir_p") 236 | @patch("github_backup.github_backup.json_dump_if_changed") 237 | def test_multiple_skip_repos( 238 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 239 | ): 240 | """Multiple repos in skip list should all be skipped.""" 241 | args = self._create_mock_args(skip_assets_on=["repo1", "repo2", "repo3"]) 242 | repository = self._create_mock_repository(name="repo2") 243 | release = self._create_mock_release() 244 | 245 | mock_json_dump.return_value = True 246 | mock_retrieve.return_value = [release] 247 | 248 | github_backup.backup_releases( 249 | args, 250 | "/tmp/backup/repositories/repo2", 251 | repository, 252 | "https://api.github.com/repos/{owner}/{repo}", 253 | include_assets=True, 254 | ) 255 | 256 | # download_file should NOT have been called 257 | mock_download.assert_not_called() 258 | 259 | @patch("github_backup.github_backup.download_file") 260 | @patch("github_backup.github_backup.retrieve_data") 261 | @patch("github_backup.github_backup.mkdir_p") 262 | @patch("github_backup.github_backup.json_dump_if_changed") 263 | def test_release_metadata_still_saved_when_assets_skipped( 264 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 265 | ): 266 | """Release JSON should still be saved even when assets are skipped.""" 267 | args = self._create_mock_args(skip_assets_on=["big-repo"]) 268 | repository = self._create_mock_repository(name="big-repo") 269 | release = self._create_mock_release() 270 | 271 | mock_json_dump.return_value = True 272 | mock_retrieve.return_value = [release] 273 | 274 | github_backup.backup_releases( 275 | args, 276 | "/tmp/backup/repositories/big-repo", 277 | repository, 278 | "https://api.github.com/repos/{owner}/{repo}", 279 | include_assets=True, 280 | ) 281 | 282 | # json_dump_if_changed should have been called for release 
metadata 283 | mock_json_dump.assert_called_once() 284 | # But download_file should NOT have been called 285 | mock_download.assert_not_called() 286 | 287 | @patch("github_backup.github_backup.download_file") 288 | @patch("github_backup.github_backup.retrieve_data") 289 | @patch("github_backup.github_backup.mkdir_p") 290 | @patch("github_backup.github_backup.json_dump_if_changed") 291 | def test_non_matching_repo_still_downloads_assets( 292 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 293 | ): 294 | """Repos not in skip list should still download assets.""" 295 | args = self._create_mock_args(skip_assets_on=["other-repo"]) 296 | repository = self._create_mock_repository(name="normal-repo") 297 | release = self._create_mock_release() 298 | asset = self._create_mock_asset() 299 | 300 | mock_json_dump.return_value = True 301 | mock_retrieve.side_effect = [ 302 | [release], # First call: get releases 303 | [asset], # Second call: get assets 304 | ] 305 | 306 | with patch("os.path.join", side_effect=lambda *args: "/".join(args)): 307 | github_backup.backup_releases( 308 | args, 309 | "/tmp/backup/repositories/normal-repo", 310 | repository, 311 | "https://api.github.com/repos/{owner}/{repo}", 312 | include_assets=True, 313 | ) 314 | 315 | # download_file SHOULD have been called 316 | mock_download.assert_called_once() 317 | 318 | 319 | if __name__ == "__main__": 320 | pytest.main([__file__, "-v"]) 321 | -------------------------------------------------------------------------------- /tests/test_attachments.py: -------------------------------------------------------------------------------- 1 | """Behavioral tests for attachment functionality.""" 2 | 3 | import json 4 | import os 5 | import tempfile 6 | from pathlib import Path 7 | from unittest.mock import Mock 8 | 9 | import pytest 10 | 11 | from github_backup import github_backup 12 | 13 | 14 | @pytest.fixture 15 | def attachment_test_setup(tmp_path): 16 | """Fixture providing setup and helper for attachment download tests.""" 17 | from unittest.mock import patch 18 | 19 | issue_cwd = tmp_path / "issues" 20 | issue_cwd.mkdir() 21 | 22 | # Mock args 23 | args = Mock() 24 | args.as_app = False 25 | args.token_fine = None 26 | args.token_classic = None 27 | args.username = None 28 | args.password = None 29 | args.osx_keychain_item_name = None 30 | args.osx_keychain_item_account = None 31 | args.user = "testuser" 32 | args.repository = "testrepo" 33 | 34 | repository = {"full_name": "testuser/testrepo"} 35 | 36 | def call_download(issue_data, issue_number=123): 37 | """Call download_attachments with mocked HTTP downloads. 38 | 39 | Returns list of URLs that were actually downloaded. 
40 | """ 41 | downloaded_urls = [] 42 | 43 | def mock_download(url, path, auth, as_app, fine): 44 | downloaded_urls.append(url) 45 | return { 46 | "success": True, 47 | "saved_as": os.path.basename(path), 48 | "url": url, 49 | } 50 | 51 | with patch( 52 | "github_backup.github_backup.download_attachment_file", 53 | side_effect=mock_download, 54 | ): 55 | github_backup.download_attachments( 56 | args, str(issue_cwd), issue_data, issue_number, repository 57 | ) 58 | 59 | return downloaded_urls 60 | 61 | return { 62 | "issue_cwd": str(issue_cwd), 63 | "args": args, 64 | "repository": repository, 65 | "call_download": call_download, 66 | } 67 | 68 | 69 | class TestURLExtraction: 70 | """Test URL extraction with realistic issue content.""" 71 | 72 | def test_mixed_urls(self): 73 | issue_data = { 74 | "body": """ 75 | ## Bug Report 76 | 77 | When uploading files, I see this error. Here's a screenshot: 78 | https://github.com/user-attachments/assets/abc123def456 79 | 80 | The logs show: https://github.com/user-attachments/files/789/error-log.txt 81 | 82 | This is similar to https://github.com/someorg/somerepo/issues/42 but different. 83 | 84 | You can also see the video at https://user-images.githubusercontent.com/12345/video-demo.mov 85 | 86 | Here's how to reproduce: 87 | ```bash 88 | # Don't extract this example URL: 89 | curl https://github.com/user-attachments/assets/example999 90 | ``` 91 | 92 | More info at https://docs.example.com/guide 93 | 94 | Also see this inline code `https://github.com/user-attachments/files/111/inline.pdf` should not extract. 95 | 96 | Final attachment: https://github.com/user-attachments/files/222/report.pdf. 97 | """, 98 | "comment_data": [ 99 | { 100 | "body": "Here's another attachment: https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" 101 | }, 102 | { 103 | "body": """ 104 | Example code: 105 | ```python 106 | url = "https://github.com/user-attachments/assets/code-example" 107 | ``` 108 | But this is real: https://github.com/user-attachments/files/333/actual.zip 109 | """ 110 | }, 111 | ], 112 | } 113 | 114 | # Extract URLs 115 | urls = github_backup.extract_attachment_urls(issue_data) 116 | 117 | expected_urls = [ 118 | "https://github.com/user-attachments/assets/abc123def456", 119 | "https://github.com/user-attachments/files/789/error-log.txt", 120 | "https://user-images.githubusercontent.com/12345/video-demo.mov", 121 | "https://github.com/user-attachments/files/222/report.pdf", 122 | "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123", 123 | "https://github.com/user-attachments/files/333/actual.zip", 124 | ] 125 | 126 | assert set(urls) == set(expected_urls) 127 | 128 | def test_trailing_punctuation_stripped(self): 129 | """URLs with trailing punctuation should have punctuation stripped.""" 130 | issue_data = { 131 | "body": """ 132 | See this file: https://github.com/user-attachments/files/1/doc.pdf. 133 | And this one (https://github.com/user-attachments/files/2/image.png). 134 | Check it out! https://github.com/user-attachments/files/3/data.csv! 
135 | """ 136 | } 137 | 138 | urls = github_backup.extract_attachment_urls(issue_data) 139 | 140 | expected = [ 141 | "https://github.com/user-attachments/files/1/doc.pdf", 142 | "https://github.com/user-attachments/files/2/image.png", 143 | "https://github.com/user-attachments/files/3/data.csv", 144 | ] 145 | assert set(urls) == set(expected) 146 | 147 | def test_deduplication_across_body_and_comments(self): 148 | """Same URL in body and comments should only appear once.""" 149 | duplicate_url = "https://github.com/user-attachments/assets/abc123" 150 | 151 | issue_data = { 152 | "body": f"First mention: {duplicate_url}", 153 | "comment_data": [ 154 | {"body": f"Second mention: {duplicate_url}"}, 155 | {"body": f"Third mention: {duplicate_url}"}, 156 | ], 157 | } 158 | 159 | urls = github_backup.extract_attachment_urls(issue_data) 160 | 161 | assert set(urls) == {duplicate_url} 162 | 163 | 164 | class TestFilenameExtraction: 165 | """Test filename extraction from different URL types.""" 166 | 167 | def test_modern_assets_url(self): 168 | """Modern assets URL returns UUID.""" 169 | url = "https://github.com/user-attachments/assets/abc123def456" 170 | filename = github_backup.get_attachment_filename(url) 171 | assert filename == "abc123def456" 172 | 173 | def test_modern_files_url(self): 174 | """Modern files URL returns filename.""" 175 | url = "https://github.com/user-attachments/files/12345/report.pdf" 176 | filename = github_backup.get_attachment_filename(url) 177 | assert filename == "report.pdf" 178 | 179 | def test_legacy_cdn_url(self): 180 | """Legacy CDN URL returns filename with extension.""" 181 | url = "https://user-images.githubusercontent.com/123456/abc-def.png" 182 | filename = github_backup.get_attachment_filename(url) 183 | assert filename == "abc-def.png" 184 | 185 | def test_private_cdn_url(self): 186 | """Private CDN URL returns filename.""" 187 | url = "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" 188 | filename = github_backup.get_attachment_filename(url) 189 | assert filename == "secret.png" 190 | 191 | def test_repo_files_url(self): 192 | """Repo-scoped files URL returns filename.""" 193 | url = "https://github.com/owner/repo/files/789/document.txt" 194 | filename = github_backup.get_attachment_filename(url) 195 | assert filename == "document.txt" 196 | 197 | 198 | class TestFilenameCollision: 199 | """Test filename collision resolution.""" 200 | 201 | def test_collision_behavior(self): 202 | """Test filename collision resolution with real files.""" 203 | with tempfile.TemporaryDirectory() as tmpdir: 204 | # No collision - file doesn't exist 205 | result = github_backup.resolve_filename_collision( 206 | os.path.join(tmpdir, "report.pdf") 207 | ) 208 | assert result == os.path.join(tmpdir, "report.pdf") 209 | 210 | # Create the file, now collision exists 211 | Path(os.path.join(tmpdir, "report.pdf")).touch() 212 | result = github_backup.resolve_filename_collision( 213 | os.path.join(tmpdir, "report.pdf") 214 | ) 215 | assert result == os.path.join(tmpdir, "report_1.pdf") 216 | 217 | # Create report_1.pdf too 218 | Path(os.path.join(tmpdir, "report_1.pdf")).touch() 219 | result = github_backup.resolve_filename_collision( 220 | os.path.join(tmpdir, "report.pdf") 221 | ) 222 | assert result == os.path.join(tmpdir, "report_2.pdf") 223 | 224 | def test_manifest_reserved(self): 225 | """manifest.json is always treated as reserved.""" 226 | with tempfile.TemporaryDirectory() as tmpdir: 227 | # Even if manifest.json doesn't exist, should get 
manifest_1.json 228 | result = github_backup.resolve_filename_collision( 229 | os.path.join(tmpdir, "manifest.json") 230 | ) 231 | assert result == os.path.join(tmpdir, "manifest_1.json") 232 | 233 | 234 | class TestManifestDuplicatePrevention: 235 | """Test that manifest prevents duplicate downloads (the bug fix).""" 236 | 237 | def test_manifest_filters_existing_urls(self, attachment_test_setup): 238 | """URLs in manifest are not re-downloaded.""" 239 | setup = attachment_test_setup 240 | 241 | # Create manifest with existing URLs 242 | attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") 243 | os.makedirs(attachments_dir) 244 | manifest_path = os.path.join(attachments_dir, "manifest.json") 245 | 246 | manifest = { 247 | "attachments": [ 248 | { 249 | "url": "https://github.com/user-attachments/assets/old1", 250 | "success": True, 251 | "saved_as": "old1.pdf", 252 | }, 253 | { 254 | "url": "https://github.com/user-attachments/assets/old2", 255 | "success": True, 256 | "saved_as": "old2.pdf", 257 | }, 258 | ] 259 | } 260 | with open(manifest_path, "w") as f: 261 | json.dump(manifest, f) 262 | 263 | # Issue data with 2 old URLs and 1 new URL 264 | issue_data = { 265 | "body": """ 266 | Old: https://github.com/user-attachments/assets/old1 267 | Old: https://github.com/user-attachments/assets/old2 268 | New: https://github.com/user-attachments/assets/new1 269 | """ 270 | } 271 | 272 | downloaded_urls = setup["call_download"](issue_data) 273 | 274 | # Should only download the NEW URL (old ones filtered by manifest) 275 | assert len(downloaded_urls) == 1 276 | assert downloaded_urls[0] == "https://github.com/user-attachments/assets/new1" 277 | 278 | def test_no_manifest_downloads_all(self, attachment_test_setup): 279 | """Without manifest, all URLs should be downloaded.""" 280 | setup = attachment_test_setup 281 | 282 | # Issue data with 2 URLs 283 | issue_data = { 284 | "body": """ 285 | https://github.com/user-attachments/assets/url1 286 | https://github.com/user-attachments/assets/url2 287 | """ 288 | } 289 | 290 | downloaded_urls = setup["call_download"](issue_data) 291 | 292 | # Should download ALL URLs (no manifest to filter) 293 | assert len(downloaded_urls) == 2 294 | assert set(downloaded_urls) == { 295 | "https://github.com/user-attachments/assets/url1", 296 | "https://github.com/user-attachments/assets/url2", 297 | } 298 | 299 | def test_manifest_skips_permanent_failures(self, attachment_test_setup): 300 | """Manifest skips permanent failures (404, 410) but retries transient (503).""" 301 | setup = attachment_test_setup 302 | 303 | # Create manifest with different failure types 304 | attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") 305 | os.makedirs(attachments_dir) 306 | manifest_path = os.path.join(attachments_dir, "manifest.json") 307 | 308 | manifest = { 309 | "attachments": [ 310 | { 311 | "url": "https://github.com/user-attachments/assets/success", 312 | "success": True, 313 | "saved_as": "success.pdf", 314 | }, 315 | { 316 | "url": "https://github.com/user-attachments/assets/notfound", 317 | "success": False, 318 | "http_status": 404, 319 | }, 320 | { 321 | "url": "https://github.com/user-attachments/assets/gone", 322 | "success": False, 323 | "http_status": 410, 324 | }, 325 | { 326 | "url": "https://github.com/user-attachments/assets/unavailable", 327 | "success": False, 328 | "http_status": 503, 329 | }, 330 | ] 331 | } 332 | with open(manifest_path, "w") as f: 333 | json.dump(manifest, f) 334 | 335 | # Issue data has all 4 URLs 
336 | issue_data = { 337 | "body": """ 338 | https://github.com/user-attachments/assets/success 339 | https://github.com/user-attachments/assets/notfound 340 | https://github.com/user-attachments/assets/gone 341 | https://github.com/user-attachments/assets/unavailable 342 | """ 343 | } 344 | 345 | downloaded_urls = setup["call_download"](issue_data) 346 | 347 | # Should only retry 503 (transient failure) 348 | # Success, 404, and 410 should be skipped 349 | assert len(downloaded_urls) == 1 350 | assert ( 351 | downloaded_urls[0] 352 | == "https://github.com/user-attachments/assets/unavailable" 353 | ) 354 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | github-backup 3 | ============= 4 | 5 | |PyPI| |Python Versions| 6 | 7 | The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues and wikis, in the most appropriate format (clones for wikis, JSON files for issues). 8 | 9 | Requirements 10 | ============ 11 | 12 | - Python 3.10 or higher 13 | - Git 1.9+ 14 | 15 | Installation 16 | ============ 17 | 18 | Using PIP via PyPI:: 19 | 20 | pip install github-backup 21 | 22 | Using PIP via Github (more likely the latest version):: 23 | 24 | pip install git+https://github.com/josegonzalez/python-github-backup.git#egg=github-backup 25 | 26 | *Install note for python newcomers:* 27 | 28 | Python scripts are unlikely to be included in your ``$PATH`` by default. This means the tool cannot be run directly in a terminal with ``$ github-backup ...``; either add Python's install path to your environment's ``$PATH`` or call the script directly, e.g. using ``$ ~/.local/bin/github-backup``. 29 | 30 | Basic Help 31 | ========== 32 | 33 | Show the CLI help output:: 34 | 35 | github-backup -h 36 | 37 | CLI Help output:: 38 | 39 | github-backup [-h] [-u USERNAME] [-p PASSWORD] [-t TOKEN_CLASSIC] 40 | [-f TOKEN_FINE] [--as-app] [-o OUTPUT_DIRECTORY] 41 | [-l LOG_LEVEL] [-i] [--starred] [--all-starred] 42 | [--watched] [--followers] [--following] [--all] [--issues] 43 | [--issue-comments] [--issue-events] [--pulls] 44 | [--pull-comments] [--pull-commits] [--pull-details] 45 | [--labels] [--hooks] [--milestones] [--repositories] 46 | [--bare] [--lfs] [--wikis] [--gists] [--starred-gists] 47 | [--skip-archived] [--skip-existing] [-L [LANGUAGES ...]] 48 | [-N NAME_REGEX] [-H GITHUB_HOST] [-O] [-R REPOSITORY] 49 | [-P] [-F] [--prefer-ssh] [-v] 50 | [--keychain-name OSX_KEYCHAIN_ITEM_NAME] 51 | [--keychain-account OSX_KEYCHAIN_ITEM_ACCOUNT] 52 | [--releases] [--latest-releases NUMBER_OF_LATEST_RELEASES] 53 | [--skip-prerelease] [--assets] [--skip-assets-on [REPO ...]] 54 | [--attachments] [--exclude [REPOSITORY [REPOSITORY ...]]] 55 | [--throttle-limit THROTTLE_LIMIT] [--throttle-pause THROTTLE_PAUSE] 56 | USER 57 | 58 | Backup a github account 59 | 60 | positional arguments: 61 | USER github username 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -u USERNAME, --username USERNAME 66 | username for basic auth 67 | -p PASSWORD, --password PASSWORD 68 | password for basic auth. If a username is given but 69 | not a password, the password will be prompted for. 70 | -f TOKEN_FINE, --token-fine TOKEN_FINE 71 | fine-grained personal access token or path to token 72 | (file://...)
73 | -t TOKEN_CLASSIC, --token TOKEN_CLASSIC 74 | personal access, OAuth, or JSON Web token, or path to 75 | token (file://...) 76 | --as-app authenticate as github app instead of as a user. 77 | -o OUTPUT_DIRECTORY, --output-directory OUTPUT_DIRECTORY 78 | directory at which to backup the repositories 79 | -l LOG_LEVEL, --log-level LOG_LEVEL 80 | log level to use (default: info, possible levels: 81 | debug, info, warning, error, critical) 82 | -i, --incremental incremental backup 83 | --incremental-by-files incremental backup using modified time of files 84 | --starred include JSON output of starred repositories in backup 85 | --all-starred include starred repositories in backup [*] 86 | --watched include JSON output of watched repositories in backup 87 | --followers include JSON output of followers in backup 88 | --following include JSON output of following users in backup 89 | --all include everything in backup (not including [*]) 90 | --issues include issues in backup 91 | --issue-comments include issue comments in backup 92 | --issue-events include issue events in backup 93 | --pulls include pull requests in backup 94 | --pull-comments include pull request review comments in backup 95 | --pull-commits include pull request commits in backup 96 | --pull-details include more pull request details in backup [*] 97 | --labels include labels in backup 98 | --hooks include hooks in backup (works only when 99 | authenticated) 100 | --milestones include milestones in backup 101 | --repositories include repository clone in backup 102 | --bare clone bare repositories 103 | --lfs clone LFS repositories (requires Git LFS to be 104 | installed, https://git-lfs.github.com) [*] 105 | --wikis include wiki clone in backup 106 | --gists include gists in backup [*] 107 | --starred-gists include starred gists in backup [*] 108 | --skip-existing skip project if a backup directory exists 109 | -L [LANGUAGES [LANGUAGES ...]], --languages [LANGUAGES [LANGUAGES ...]] 110 | only allow these languages 111 | -N NAME_REGEX, --name-regex NAME_REGEX 112 | python regex to match names against 113 | -H GITHUB_HOST, --github-host GITHUB_HOST 114 | GitHub Enterprise hostname 115 | -O, --organization whether or not this is an organization user 116 | -R REPOSITORY, --repository REPOSITORY 117 | name of repository to limit backup to 118 | -P, --private include private repositories [*] 119 | -F, --fork include forked repositories [*] 120 | --prefer-ssh Clone repositories using SSH instead of HTTPS 121 | -v, --version show program's version number and exit 122 | --keychain-name OSX_KEYCHAIN_ITEM_NAME 123 | OSX ONLY: name field of password item in OSX keychain 124 | that holds the personal access or OAuth token 125 | --keychain-account OSX_KEYCHAIN_ITEM_ACCOUNT 126 | OSX ONLY: account field of password item in OSX 127 | keychain that holds the personal access or OAuth token 128 | --releases include release information, not including assets or 129 | binaries 130 | --latest-releases NUMBER_OF_LATEST_RELEASES 131 | include certain number of the latest releases; 132 | only applies if including releases 133 | --skip-prerelease skip prerelease and draft versions; only applies if including releases 134 | --assets include assets alongside release information; only 135 | applies if including releases 136 | --skip-assets-on [REPO ...] 137 | skip asset downloads for these repositories (e.g. 
138 | --skip-assets-on repo1 owner/repo2) 139 | --attachments download user-attachments from issues and pull requests 140 | to issues/attachments/{issue_number}/ and 141 | pulls/attachments/{pull_number}/ directories 142 | --exclude [REPOSITORY [REPOSITORY ...]] 143 | names of repositories to exclude from backup. 144 | --throttle-limit THROTTLE_LIMIT 145 | start throttling of GitHub API requests after this 146 | amount of API requests remain 147 | --throttle-pause THROTTLE_PAUSE 148 | wait this amount of seconds when API request 149 | throttling is active (default: 30.0, requires 150 | --throttle-limit to be set) 151 | 152 | 153 | Usage Details 154 | ============= 155 | 156 | Authentication 157 | -------------- 158 | 159 | **Password-based authentication** will fail if you have two-factor authentication enabled, and will `be deprecated `_ by the end of 2023. 160 | 161 | ``--username`` is used for basic password authentication and is separate from the positional argument ``USER``, which specifies the user account you wish to back up. 162 | 163 | **Classic tokens** are `slightly less secure `_ as they provide very coarse-grained permissions. 164 | 165 | If you need authentication for long-running backups (e.g. for a cron job), it is recommended to use a **fine-grained personal access token** (``-f TOKEN_FINE``). 166 | 167 | 168 | Fine Tokens 169 | ~~~~~~~~~~~ 170 | 171 | You can "Generate new token", choosing the repository scope by selecting specific repos or all repos. On Github this is under *Settings -> Developer Settings -> Personal access tokens -> Fine-grained Tokens*. 172 | 173 | Customise the permissions for your use case, but for a full backup of a personal account you'll need to enable the following permissions: 174 | 175 | **User permissions**: Read access to followers, starring, and watching. 176 | 177 | **Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. 178 | 179 | 180 | GitHub Apps 181 | ~~~~~~~~~~~ 182 | 183 | GitHub Apps are ideal for organization backups in CI/CD. Tokens are scoped to specific repositories and expire after 1 hour. 184 | 185 | **One-time setup:** 186 | 187 | 1. Create a GitHub App at *Settings -> Developer Settings -> GitHub Apps -> New GitHub App* 188 | 2. Set a name and homepage URL (can be any URL) 189 | 3. Uncheck "Webhook > Active" (not needed for backups) 190 | 4. Set permissions (same as fine-grained tokens above) 191 | 5. Click "Create GitHub App", then note the **App ID** shown on the next page 192 | 6. Under "Private keys", click "Generate a private key" and save the downloaded file 193 | 7. Go to *Install App* in your app's settings 194 | 8. Select the account/organization and which repositories to back up 195 | 196 | **CI/CD usage with GitHub Actions:** 197 | 198 | Store the App ID as a repository variable and the private key contents as a secret, then use ``actions/create-github-app-token``:: 199 | 200 | - uses: actions/create-github-app-token@v1 201 | id: app-token 202 | with: 203 | app-id: ${{ vars.APP_ID }} 204 | private-key: ${{ secrets.APP_PRIVATE_KEY }} 205 | 206 | - run: github-backup myorg -t ${{ steps.app-token.outputs.token }} --as-app -o ./backup --all 207 | 208 | Note: Installation tokens expire after 1 hour. For long-running backups, use a fine-grained personal access token instead.
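Outside of GitHub Actions, you can mint an installation token with any helper and hand it to ``github-backup`` on stdin via the ``file://`` token support (see the stdin example later in this README). A sketch, where ``my-app-token-helper`` is a hypothetical command standing in for whatever tool mints your installation token::

    my-app-token-helper | github-backup myorg -t file:///dev/stdin --as-app -o ./backup --all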
209 | 210 | 211 | Prefer SSH 212 | ~~~~~~~~~~ 213 | 214 | If cloning repos is enabled with ``--repositories``, ``--all-starred``, ``--wikis``, ``--gists`` or ``--starred-gists``, using the ``--prefer-ssh`` argument will use SSH for cloning the git repos, but all other connections will still use their own protocol; e.g. API requests for issues use HTTPS. 215 | 216 | To clone with SSH, you'll need SSH authentication set up `as usual with Github `_, e.g. via SSH public and private keys. 217 | 218 | 219 | Using the Keychain on Mac OSX 220 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 221 | Note: On Mac OSX the token can be stored securely in the user's keychain. To do this: 222 | 223 | 1. Open Keychain Access from "Applications -> Utilities" 224 | 2. Add a new password item using "File -> New Password Item" 225 | 3. Enter a name in the "Keychain Item Name" box. You must provide this name to github-backup using the --keychain-name argument. 226 | 4. Enter an account name (your Github username) in the "Account Name" box. You must provide this name to github-backup using the --keychain-account argument. 227 | 5. Enter your Github personal access token in the "Password" box 228 | 229 | Note: When you run github-backup, you will be asked whether you want to allow "security" to use your confidential information stored in your keychain. You have two options: 230 | 231 | 1. **Allow:** In this case you will need to click "Allow" each time you run ``github-backup`` 232 | 2. **Always Allow:** In this case, you will not be asked for permission when you run ``github-backup`` in future. This is less secure, but is required if you want to schedule ``github-backup`` to run automatically 233 | 234 | 235 | Github Rate-limit and Throttling 236 | -------------------------------- 237 | 238 | "github-backup" will automatically throttle itself based on feedback from the Github API. 239 | 240 | The API is usually rate-limited to 5000 calls per hour. The API will ask github-backup to pause until a specific time when the limit is reset again (at the start of the next hour). This continues until the backup is complete. 241 | 242 | During a large backup, such as ``--all-starred``, and on a fast connection, this can result in long (~20 min) pauses, with bursts of API calls periodically maxing out the API limit. If this is not suitable, `it has been observed `_ under real-world conditions that overriding the throttle with ``--throttle-limit 5000 --throttle-pause 0.6`` provides a smooth rate across the hour, although ``--throttle-pause 0.72`` (3600 seconds [1 hour] / 5000 calls) is theoretically safer to prevent large rate-limit pauses. 243 | 244 | 245 | About Git LFS 246 | ------------- 247 | 248 | When you use the ``--lfs`` option, you will need to make sure you have Git LFS installed. 249 | 250 | Instructions on how to do this can be found on https://git-lfs.github.com. 251 | 252 | LFS objects are fetched for all refs, not just the current checkout, ensuring a complete backup of all LFS content across all branches and history. 253 | 254 | 255 | About Attachments 256 | ----------------- 257 | 258 | When you use the ``--attachments`` option with ``--issues`` or ``--pulls``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue and pull request descriptions and comments. Attachments often contain valuable data related to the topic, and without backing them up important information or context might be lost.
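For example, issues and pull requests can be backed up together with their attachments like this (a sketch; the user, token, and output path are placeholders)::

    export ACCESS_TOKEN=SOME-GITHUB-TOKEN
    github-backup myuser -t $ACCESS_TOKEN -o /backup --issues --pulls --attachments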
259 | 260 | Attachments are saved to ``issues/attachments/{issue_number}/`` and ``pulls/attachments/{pull_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains: 261 | 262 | - The downloaded attachment files (named by their GitHub identifier with appropriate file extensions) 263 | - If multiple attachments have the same filename, conflicts are resolved with numeric suffixes (e.g., ``report.pdf``, ``report_1.pdf``, ``report_2.pdf``) 264 | - A ``manifest.json`` file documenting all downloads, including URLs, file metadata, and download status 265 | 266 | The tool automatically extracts file extensions from HTTP headers to ensure files can be more easily opened by your operating system. 267 | 268 | **Supported URL formats:** 269 | 270 | - Modern: ``github.com/user-attachments/{assets,files}/*`` 271 | - Legacy: ``user-images.githubusercontent.com/*`` and ``private-user-images.githubusercontent.com/*`` 272 | - Repo files: ``github.com/{owner}/{repo}/files/*`` (filtered to current repository) 273 | - Repo assets: ``github.com/{owner}/{repo}/assets/*`` (filtered to current repository) 274 | 275 | **Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer). 276 | 277 | 278 | Run in Docker container 279 | ----------------------- 280 | 281 | To run the tool in a Docker container, use the following command:: 282 | 283 | sudo docker run --rm -v /path/to/backup:/data --name github-backup ghcr.io/josegonzalez/python-github-backup -o /data $OPTIONS $USER 284 | 285 | Gotchas / Known-issues 286 | ====================== 287 | 288 | All is not everything 289 | --------------------- 290 | 291 | The ``--all`` argument does not include: cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. 292 | 293 | Cloning all starred size 294 | ------------------------ 295 | 296 | Using the ``--all-starred`` argument to clone all starred repositories may use a large amount of storage space, especially if ``--all`` or further arguments are used; e.g. commonly starred repos can have tens of thousands of issues, many large assets, and a large repository itself. Consider just storing links to starred repos in JSON format with ``--starred``. 297 | 298 | Incremental Backup 299 | ------------------ 300 | 301 | Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**, e.g. only requesting issues created or updated since the last run. 302 | 303 | This means any blocking errors on previous runs can cause a large amount of missing data in backups. 304 | 305 | Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on the filesystem**, so e.g. if you modify a file yourself you may miss something. 306 | 307 | This is still safer than plain ``--incremental``. 308 | 309 | Specifically, issues and pull requests are handled this way. 310 | 311 | Known blocking errors 312 | --------------------- 313 | 314 | Some errors will block the backup run by exiting the script, e.g.
receiving a 403 Forbidden error from the Github API. 315 | 316 | If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run, potentially causing unexpectedly large amounts of missing data. 317 | 318 | It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or to complement it with periodic full non-incremental runs, to avoid unexpected missing data in regular backup runs. 319 | 320 | **Starred public repo hooks blocking** 321 | 322 | Since the ``--all`` argument includes ``--hooks``, if you use ``--all`` and ``--all-starred`` together to clone a user's starred public repositories, the backup will likely error and stop, blocking the backup from continuing. 323 | 324 | This is because ``--hooks`` requires the correct permission on public repos. 325 | 326 | 327 | "bare" is actually "mirror" 328 | --------------------------- 329 | 330 | Using the bare clone argument (``--bare``) will actually call git's ``clone --mirror`` command. There's a subtle difference between `bare `_ and `mirror `_ clones. 331 | 332 | *From the git docs: "Compared to --bare, --mirror not only maps local branches of the source to local branches of the target, it maps all refs (including remote-tracking branches, notes etc.) and sets up a refspec configuration such that all these refs are overwritten by a git remote update in the target repository."* 333 | 334 | 335 | Starred gists vs starred repo behaviour 336 | --------------------------------------- 337 | 338 | The starred repo cloning argument (``--all-starred``) stores starred repos separately from the user's own repositories. However, ``--starred-gists`` will store starred gists within the same directory as the user's own gists (``--gists``). Also, all gist repo directory names are IDs, not the gist's name. 339 | 340 | Note: ``--starred-gists`` only retrieves starred gists for the authenticated user, not the target user, due to a GitHub API limitation. 341 | 342 | 343 | Skip existing on incomplete backups 344 | ----------------------------------- 345 | 346 | The ``--skip-existing`` argument will skip a backup if the directory already exists, even if the backup in that directory failed (perhaps due to a blocking error). This may result in unexpected missing data in a regular backup. 347 | 348 | 349 | Updates use fetch, not pull 350 | --------------------------- 351 | 352 | When updating an existing repository backup, ``github-backup`` uses ``git fetch`` rather than ``git pull``. This is intentional - a backup tool should reliably download data without risk of failure. Using ``git pull`` would require handling merge conflicts, which adds complexity and could cause backups to fail unexpectedly. 353 | 354 | With fetch, **all branches and commits are downloaded** safely into remote-tracking branches. The working directory files won't change, but your backup is complete. 355 | 356 | If you look at files directly (e.g., ``cat README.md``), you'll see the old content. The new data is in the remote-tracking branches (confusingly named "remote" but stored locally). To view or use the latest files:: 357 | 358 | git show origin/main:README.md # view a file 359 | git merge origin/main # update working directory 360 | 361 | All branches are backed up as remote refs (``origin/main``, ``origin/feature-branch``, etc.).
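For example, to check what a fetched backup actually contains without touching the working directory (a sketch; assumes the default branch is ``main``)::

    git branch -r               # list the remote-tracking branches in the backup
    git log -n 3 origin/main    # inspect recent commits on a fetched branch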
362 | 363 | If you want to browse files directly without merging, consider using ``--bare`` which skips the working directory entirely - the backup is just the git data. 364 | 365 | See `#269 `_ for more discussion. 366 | 367 | 368 | Github Backup Examples 369 | ====================== 370 | 371 | Backup all repositories, including private ones, using a classic token:: 372 | 373 | export ACCESS_TOKEN=SOME-GITHUB-TOKEN 374 | github-backup WhiteHouse --token $ACCESS_TOKEN --organization --output-directory /tmp/white-house --repositories --private 375 | 376 | Use a fine-grained access token to backup a single organization repository with everything else (wiki, pull requests, comments, issues, etc.):: 377 | 378 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 379 | ORGANIZATION=docker 380 | REPO=cli 381 | # e.g. git@github.com:docker/cli.git 382 | github-backup $ORGANIZATION -P -f $FINE_ACCESS_TOKEN -o . --all -O -R $REPO 383 | 384 | Quietly and incrementally backup useful Github user data (public and private repos with SSH), including all issues, pulls, and all public starred repos and gists (omitting "hooks" to prevent blocking). *Great for a cron job.* :: 385 | 386 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 387 | GH_USER=YOUR-GITHUB-USER 388 | 389 | github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER 390 | 391 | Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. :: 392 | 393 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 394 | GH_USER=YOUR-GITHUB-USER 395 | 396 | github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER 397 | 398 | Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: 399 | 400 | my-secret-manager get github-token | github-backup user -t file:///dev/stdin -o /backup --repositories 401 | 402 | Restoring from Backup 403 | ===================== 404 | 405 | This tool creates backups only; there is no built-in restore command. 406 | 407 | **Git repositories, wikis, and gists** can be restored by pushing them back to GitHub as you would any git repository. For example, to restore a bare repository backup:: 408 | 409 | cd /tmp/white-house/repositories/petitions/repository 410 | git push --mirror git@github.com:WhiteHouse/petitions.git 411 | 412 | **Issues, pull requests, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully; creating issues via the API has limitations: 413 | 414 | - New issue/PR numbers are assigned (original numbers cannot be set) 415 | - Timestamps reflect creation time (original dates cannot be set) 416 | - The API caller becomes the author (original authors cannot be set) 417 | - Cross-references between issues and PRs will break 418 | 419 | These are GitHub API limitations that affect all backup and migration tools, not just this one. Recreating issues with these limitations via the GitHub API is an exercise for the reader. The JSON backups remain useful for searching, auditing, or manual reference.
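For example, the archived issue JSON can be searched with standard tools; a sketch, assuming the default output layout from the restore example above (issue data stored under each repository's ``issues/`` directory)::

    grep -rl "rate limit" /tmp/white-house/repositories/*/issues/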
420 | 421 | 422 | Development 423 | =========== 424 | 425 | This project is considered feature complete for the primary maintainer @josegonzalez. If you would like a bugfix or enhancement, pull requests are welcome. Feel free to contact the maintainer for consulting estimates if you'd like to sponsor the work instead. 426 | 427 | Contributors 428 | ------------ 429 | 430 | A huge thanks to all the contributors! 431 | 432 | .. image:: https://contrib.rocks/image?repo=josegonzalez/python-github-backup 433 | :target: https://github.com/josegonzalez/python-github-backup/graphs/contributors 434 | :alt: contributors 435 | 436 | Testing 437 | ------- 438 | 439 | To run the test suite:: 440 | 441 | pip install pytest 442 | pytest 443 | 444 | To run linting:: 445 | 446 | pip install flake8 447 | flake8 --ignore=E501 448 | 449 | 450 | .. |PyPI| image:: https://img.shields.io/pypi/v/github-backup.svg 451 | :target: https://pypi.python.org/pypi/github-backup/ 452 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/github-backup.svg 453 | :target: https://github.com/josegonzalez/python-github-backup 454 | -------------------------------------------------------------------------------- /github_backup/github_backup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import base64 7 | import calendar 8 | import codecs 9 | import errno 10 | import getpass 11 | import json 12 | import logging 13 | import os 14 | import platform 15 | import re 16 | import select 17 | import socket 18 | import ssl 19 | import subprocess 20 | import sys 21 | import time 22 | from datetime import datetime 23 | from http.client import IncompleteRead 24 | from urllib.error import HTTPError, URLError 25 | from urllib.parse import quote as urlquote 26 | from urllib.parse import urlencode, urlparse 27 | from urllib.request import HTTPRedirectHandler, Request, build_opener, urlopen 28 | 29 | try: 30 | from . import __version__ 31 | 32 | VERSION = __version__ 33 | except ImportError: 34 | VERSION = "unknown" 35 | 36 | FNULL = open(os.devnull, "w") 37 | FILE_URI_PREFIX = "file://" 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class RepositoryUnavailableError(Exception): 42 | """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).""" 43 | 44 | def __init__(self, message, dmca_url=None): 45 | super().__init__(message) 46 | self.dmca_url = dmca_url 47 | 48 | 49 | # Setup SSL context with fallback chain 50 | https_ctx = ssl.create_default_context() 51 | if https_ctx.get_ca_certs(): 52 | # Layer 1: Certificates pre-loaded from system (file-based) 53 | pass 54 | else: 55 | paths = ssl.get_default_verify_paths() 56 | if (paths.cafile and os.path.exists(paths.cafile)) or ( 57 | paths.capath and os.path.exists(paths.capath) 58 | ): 59 | # Layer 2: Cert paths exist, will be lazy-loaded on first use (directory-based) 60 | pass 61 | else: 62 | # Layer 3: Try certifi package as optional fallback 63 | try: 64 | import certifi 65 | 66 | https_ctx = ssl.create_default_context(cafile=certifi.where()) 67 | except ImportError: 68 | # All layers failed - no certificates available anywhere 69 | sys.exit( 70 | "\nERROR: No CA certificates found.
Cannot connect to GitHub over SSL.\n\n" 71 | "Solutions you can explore:\n" 72 | " 1. pip install certifi\n" 73 | " 2. Alpine: apk add ca-certificates\n" 74 | " 3. Debian/Ubuntu: apt-get install ca-certificates\n\n" 75 | ) 76 | 77 | 78 | def logging_subprocess( 79 | popenargs, stdout_log_level=logging.DEBUG, stderr_log_level=logging.ERROR, **kwargs 80 | ): 81 | """ 82 | Variant of subprocess.call that accepts a logger instead of stdout/stderr, 83 | and logs stdout messages via logger.debug and stderr messages via 84 | logger.error. 85 | """ 86 | child = subprocess.Popen( 87 | popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs 88 | ) 89 | if sys.platform == "win32": 90 | logger.info( 91 | "Windows operating system detected - no subprocess logging will be returned" 92 | ) 93 | 94 | log_level = {child.stdout: stdout_log_level, child.stderr: stderr_log_level} 95 | 96 | def check_io(): 97 | if sys.platform == "win32": 98 | return 99 | ready_to_read = select.select([child.stdout, child.stderr], [], [], 1000)[0] 100 | for io in ready_to_read: 101 | line = io.readline() 102 | if not logger: 103 | continue 104 | if not (io == child.stderr and not line): 105 | logger.log(log_level[io], line[:-1]) 106 | 107 | # keep checking stdout/stderr until the child exits 108 | while child.poll() is None: 109 | check_io() 110 | 111 | check_io() # check again to catch anything after the process exits 112 | 113 | rc = child.wait() 114 | 115 | if rc != 0: 116 | print("{} returned {}:".format(popenargs[0], rc), file=sys.stderr) 117 | print("\t", " ".join(popenargs), file=sys.stderr) 118 | 119 | return rc 120 | 121 | 122 | def mkdir_p(*args): 123 | for path in args: 124 | try: 125 | os.makedirs(path) 126 | except OSError as exc: # Python >2.5 127 | if exc.errno == errno.EEXIST and os.path.isdir(path): 128 | pass 129 | else: 130 | raise 131 | 132 | 133 | def mask_password(url, secret="*****"): 134 | parsed = urlparse(url) 135 | 136 | if not parsed.password: 137 | return url 138 | elif parsed.password == "x-oauth-basic": 139 | return url.replace(parsed.username, secret) 140 | 141 | return url.replace(parsed.password, secret) 142 | 143 | 144 | def parse_args(args=None): 145 | parser = argparse.ArgumentParser(description="Backup a github account") 146 | parser.add_argument("user", metavar="USER", type=str, help="github username") 147 | parser.add_argument( 148 | "-u", "--username", dest="username", help="username for basic auth" 149 | ) 150 | parser.add_argument( 151 | "-p", 152 | "--password", 153 | dest="password", 154 | help="password for basic auth. " 155 | "If a username is given but not a password, the " 156 | "password will be prompted for.", 157 | ) 158 | parser.add_argument( 159 | "-t", 160 | "--token", 161 | dest="token_classic", 162 | help="personal access, OAuth, or JSON Web token, or path to token (file://...)", 163 | ) # noqa 164 | parser.add_argument( 165 | "-f", 166 | "--token-fine", 167 | dest="token_fine", 168 | help="fine-grained personal access token (github_pat_....), or path to token (file://...)", 169 | ) # noqa 170 | parser.add_argument( 171 | "-q", 172 | "--quiet", 173 | action="store_true", 174 | dest="quiet", 175 | help="suppress log messages less severe than warning, e.g.
info", 176 | ) 177 | parser.add_argument( 178 | "--as-app", 179 | action="store_true", 180 | dest="as_app", 181 | help="authenticate as github app instead of as a user.", 182 | ) 183 | parser.add_argument( 184 | "-o", 185 | "--output-directory", 186 | default=".", 187 | dest="output_directory", 188 | help="directory at which to backup the repositories", 189 | ) 190 | parser.add_argument( 191 | "-l", 192 | "--log-level", 193 | default="info", 194 | dest="log_level", 195 | help="log level to use (default: info, possible levels: debug, info, warning, error, critical)", 196 | ) 197 | parser.add_argument( 198 | "-i", 199 | "--incremental", 200 | action="store_true", 201 | dest="incremental", 202 | help="incremental backup", 203 | ) 204 | parser.add_argument( 205 | "--incremental-by-files", 206 | action="store_true", 207 | dest="incremental_by_files", 208 | help="incremental backup based on modification date of files", 209 | ) 210 | parser.add_argument( 211 | "--starred", 212 | action="store_true", 213 | dest="include_starred", 214 | help="include JSON output of starred repositories in backup", 215 | ) 216 | parser.add_argument( 217 | "--all-starred", 218 | action="store_true", 219 | dest="all_starred", 220 | help="include starred repositories in backup [*]", 221 | ) 222 | parser.add_argument( 223 | "--watched", 224 | action="store_true", 225 | dest="include_watched", 226 | help="include JSON output of watched repositories in backup", 227 | ) 228 | parser.add_argument( 229 | "--followers", 230 | action="store_true", 231 | dest="include_followers", 232 | help="include JSON output of followers in backup", 233 | ) 234 | parser.add_argument( 235 | "--following", 236 | action="store_true", 237 | dest="include_following", 238 | help="include JSON output of following users in backup", 239 | ) 240 | parser.add_argument( 241 | "--all", 242 | action="store_true", 243 | dest="include_everything", 244 | help="include everything in backup (not including [*])", 245 | ) 246 | parser.add_argument( 247 | "--issues", 248 | action="store_true", 249 | dest="include_issues", 250 | help="include issues in backup", 251 | ) 252 | parser.add_argument( 253 | "--issue-comments", 254 | action="store_true", 255 | dest="include_issue_comments", 256 | help="include issue comments in backup", 257 | ) 258 | parser.add_argument( 259 | "--issue-events", 260 | action="store_true", 261 | dest="include_issue_events", 262 | help="include issue events in backup", 263 | ) 264 | parser.add_argument( 265 | "--pulls", 266 | action="store_true", 267 | dest="include_pulls", 268 | help="include pull requests in backup", 269 | ) 270 | parser.add_argument( 271 | "--pull-comments", 272 | action="store_true", 273 | dest="include_pull_comments", 274 | help="include pull request review comments in backup", 275 | ) 276 | parser.add_argument( 277 | "--pull-commits", 278 | action="store_true", 279 | dest="include_pull_commits", 280 | help="include pull request commits in backup", 281 | ) 282 | parser.add_argument( 283 | "--pull-details", 284 | action="store_true", 285 | dest="include_pull_details", 286 | help="include more pull request details in backup [*]", 287 | ) 288 | parser.add_argument( 289 | "--labels", 290 | action="store_true", 291 | dest="include_labels", 292 | help="include labels in backup", 293 | ) 294 | parser.add_argument( 295 | "--hooks", 296 | action="store_true", 297 | dest="include_hooks", 298 | help="include hooks in backup (works only when authenticated)", 299 | ) # noqa 300 | parser.add_argument( 301 | "--milestones", 302 | 
action="store_true", 303 | dest="include_milestones", 304 | help="include milestones in backup", 305 | ) 306 | parser.add_argument( 307 | "--repositories", 308 | action="store_true", 309 | dest="include_repository", 310 | help="include repository clone in backup", 311 | ) 312 | parser.add_argument( 313 | "--bare", action="store_true", dest="bare_clone", help="clone bare repositories" 314 | ) 315 | parser.add_argument( 316 | "--no-prune", 317 | action="store_true", 318 | dest="no_prune", 319 | help="disable prune option for git fetch", 320 | ) 321 | parser.add_argument( 322 | "--lfs", 323 | action="store_true", 324 | dest="lfs_clone", 325 | help="clone LFS repositories (requires Git LFS to be installed, https://git-lfs.github.com) [*]", 326 | ) 327 | parser.add_argument( 328 | "--wikis", 329 | action="store_true", 330 | dest="include_wiki", 331 | help="include wiki clone in backup", 332 | ) 333 | parser.add_argument( 334 | "--gists", 335 | action="store_true", 336 | dest="include_gists", 337 | help="include gists in backup [*]", 338 | ) 339 | parser.add_argument( 340 | "--starred-gists", 341 | action="store_true", 342 | dest="include_starred_gists", 343 | help="include starred gists in backup [*]", 344 | ) 345 | parser.add_argument( 346 | "--skip-archived", 347 | action="store_true", 348 | dest="skip_archived", 349 | help="skip project if it is archived", 350 | ) 351 | parser.add_argument( 352 | "--skip-existing", 353 | action="store_true", 354 | dest="skip_existing", 355 | help="skip project if a backup directory exists", 356 | ) 357 | parser.add_argument( 358 | "-L", 359 | "--languages", 360 | dest="languages", 361 | help="only allow these languages", 362 | nargs="*", 363 | ) 364 | parser.add_argument( 365 | "-N", 366 | "--name-regex", 367 | dest="name_regex", 368 | help="python regex to match names against", 369 | ) 370 | parser.add_argument( 371 | "-H", "--github-host", dest="github_host", help="GitHub Enterprise hostname" 372 | ) 373 | parser.add_argument( 374 | "-O", 375 | "--organization", 376 | action="store_true", 377 | dest="organization", 378 | help="whether or not this is an organization user", 379 | ) 380 | parser.add_argument( 381 | "-R", 382 | "--repository", 383 | dest="repository", 384 | help="name of repository to limit backup to", 385 | ) 386 | parser.add_argument( 387 | "-P", 388 | "--private", 389 | action="store_true", 390 | dest="private", 391 | help="include private repositories [*]", 392 | ) 393 | parser.add_argument( 394 | "-F", 395 | "--fork", 396 | action="store_true", 397 | dest="fork", 398 | help="include forked repositories [*]", 399 | ) 400 | parser.add_argument( 401 | "--prefer-ssh", 402 | action="store_true", 403 | help="Clone repositories using SSH instead of HTTPS", 404 | ) 405 | parser.add_argument( 406 | "-v", "--version", action="version", version="%(prog)s " + VERSION 407 | ) 408 | parser.add_argument( 409 | "--keychain-name", 410 | dest="osx_keychain_item_name", 411 | help="OSX ONLY: name field of password item in OSX keychain that holds the personal access or OAuth token", 412 | ) 413 | parser.add_argument( 414 | "--keychain-account", 415 | dest="osx_keychain_item_account", 416 | help="OSX ONLY: account field of password item in OSX keychain that holds the personal access or OAuth token", 417 | ) 418 | parser.add_argument( 419 | "--releases", 420 | action="store_true", 421 | dest="include_releases", 422 | help="include release information, not including assets or binaries", 423 | ) 424 | parser.add_argument( 425 | "--latest-releases", 426 | 
type=int, 427 | default=0, 428 | dest="number_of_latest_releases", 429 | help="include only this number of the latest releases; only applies if including releases", 430 | ) 431 | parser.add_argument( 432 | "--skip-prerelease", 433 | action="store_true", 434 | dest="skip_prerelease", 435 | help="skip prerelease and draft versions; only applies if including releases", 436 | ) 437 | parser.add_argument( 438 | "--assets", 439 | action="store_true", 440 | dest="include_assets", 441 | help="include assets alongside release information; only applies if including releases", 442 | ) 443 | parser.add_argument( 444 | "--skip-assets-on", 445 | dest="skip_assets_on", 446 | nargs="*", 447 | help="skip asset downloads for these repositories", 448 | ) 449 | parser.add_argument( 450 | "--attachments", 451 | action="store_true", 452 | dest="include_attachments", 453 | help="download user-attachments from issues and pull requests", 454 | ) 455 | parser.add_argument( 456 | "--throttle-limit", 457 | dest="throttle_limit", 458 | type=int, 459 | default=0, 460 | help="start throttling GitHub API requests when no more than this number of API requests remain", 461 | ) 462 | parser.add_argument( 463 | "--throttle-pause", 464 | dest="throttle_pause", 465 | type=float, 466 | default=30.0, 467 | help="wait this number of seconds when API request throttling is active (default: 30.0, requires --throttle-limit to be set)", 468 | ) 469 | parser.add_argument( 470 | "--exclude", dest="exclude", help="names of repositories to exclude", nargs="*" 471 | ) 472 | return parser.parse_args(args) 473 | 474 | 475 | def get_auth(args, encode=True, for_git_cli=False): 476 | auth = None 477 | 478 | if args.osx_keychain_item_name: 479 | if not args.osx_keychain_item_account: 480 | raise Exception( 481 | "You must specify both name and account fields for osx keychain password items" 482 | ) 483 | else: 484 | if platform.system() != "Darwin": 485 | raise Exception("Keychain arguments are only supported on Mac OSX") 486 | try: 487 | with open(os.devnull, "w") as devnull: 488 | token = subprocess.check_output( 489 | [ 490 | "security", 491 | "find-generic-password", 492 | "-s", 493 | args.osx_keychain_item_name, 494 | "-a", 495 | args.osx_keychain_item_account, 496 | "-w", 497 | ], 498 | stderr=devnull, 499 | ).strip() 500 | token = token.decode("utf-8") 501 | auth = token + ":" + "x-oauth-basic" 502 | except subprocess.SubprocessError: 503 | raise Exception( 504 | "No password item matching the provided name and account could be found in the osx keychain." 
505 | ) 506 | elif args.osx_keychain_item_account: 507 | raise Exception( 508 | "You must specify both name and account fields for osx keychain password items" 509 | ) 510 | elif args.token_fine: 511 | if args.token_fine.startswith(FILE_URI_PREFIX): 512 | args.token_fine = read_file_contents(args.token_fine) 513 | 514 | if args.token_fine.startswith("github_pat_"): 515 | auth = args.token_fine 516 | else: 517 | raise Exception( 518 | "Fine-grained token supplied does not look like a GitHub PAT" 519 | ) 520 | elif args.token_classic: 521 | if args.token_classic.startswith(FILE_URI_PREFIX): 522 | args.token_classic = read_file_contents(args.token_classic) 523 | 524 | if not args.as_app: 525 | auth = args.token_classic + ":" + "x-oauth-basic" 526 | else: 527 | if not for_git_cli: 528 | auth = args.token_classic 529 | else: 530 | auth = "x-access-token:" + args.token_classic 531 | elif args.username: 532 | if not args.password: 533 | args.password = getpass.getpass() 534 | if encode: 535 | password = args.password 536 | else: 537 | password = urlquote(args.password) 538 | auth = args.username + ":" + password 539 | elif args.password: 540 | raise Exception("You must specify a username for basic auth") 541 | 542 | if not auth: 543 | return None 544 | 545 | if not encode or args.token_fine is not None: 546 | return auth 547 | 548 | return base64.b64encode(auth.encode("ascii")) 549 | 550 | 551 | def get_github_api_host(args): 552 | if args.github_host: 553 | host = args.github_host + "/api/v3" 554 | else: 555 | host = "api.github.com" 556 | 557 | return host 558 | 559 | 560 | def get_github_host(args): 561 | if args.github_host: 562 | host = args.github_host 563 | else: 564 | host = "github.com" 565 | 566 | return host 567 | 568 | 569 | def read_file_contents(file_uri): 570 | return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() 571 | 572 | 573 | def get_github_repo_url(args, repository): 574 | if repository.get("is_gist"): 575 | if args.prefer_ssh: 576 | # The git_pull_url value is always https for gists, so we need to transform it to ssh form 577 | repo_url = re.sub( 578 | r"^https?:\/\/(.+)\/(.+)\.git$", 579 | r"git@\1:\2.git", 580 | repository["git_pull_url"], 581 | ) 582 | repo_url = re.sub( 583 | r"^git@gist\.", "git@", repo_url 584 | ) # strip gist subdomain for better hostkey compatibility 585 | else: 586 | repo_url = repository["git_pull_url"] 587 | return repo_url 588 | 589 | if args.prefer_ssh: 590 | return repository["ssh_url"] 591 | 592 | auth = get_auth(args, encode=False, for_git_cli=True) 593 | if auth: 594 | repo_url = "https://{0}@{1}/{2}/{3}.git".format( 595 | auth if args.token_fine is None else "oauth2:" + auth, 596 | get_github_host(args), 597 | repository["owner"]["login"], 598 | repository["name"], 599 | ) 600 | else: 601 | repo_url = repository["clone_url"] 602 | 603 | return repo_url 604 | 605 | 606 | def retrieve_data_gen(args, template, query_args=None, single_request=False): 607 | auth = get_auth(args, encode=not args.as_app) 608 | query_args = get_query_args(query_args) 609 | per_page = 100 610 | next_url = None 611 | 612 | while True: 613 | if single_request: 614 | request_per_page = None 615 | else: 616 | request_per_page = per_page 617 | 618 | request = _construct_request( 619 | request_per_page, 620 | query_args, 621 | next_url or template, 622 | auth, 623 | as_app=args.as_app, 624 | fine=True if args.token_fine is not None else False, 625 | ) # noqa 626 | r, errors = _get_response(request, auth, next_url or template) 627 | 628 | status_code = 
int(r.getcode()) 629 | 630 | # Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository 631 | if status_code == 451: 632 | dmca_url = None 633 | try: 634 | response_data = json.loads(r.read().decode("utf-8")) 635 | dmca_url = response_data.get("block", {}).get("html_url") 636 | except Exception: 637 | pass 638 | raise RepositoryUnavailableError( 639 | "Repository unavailable due to legal reasons (HTTP 451)", 640 | dmca_url=dmca_url, 641 | ) 642 | 643 | # Check if we got correct data 644 | try: 645 | response = json.loads(r.read().decode("utf-8")) 646 | except IncompleteRead: 647 | logger.warning("Incomplete read error detected") 648 | read_error = True 649 | except json.decoder.JSONDecodeError: 650 | logger.warning("JSON decode error detected") 651 | read_error = True 652 | except TimeoutError: 653 | logger.warning("Timeout error detected") 654 | read_error = True 655 | else: 656 | read_error = False 657 | 658 | # be gentle with the API request limit and throttle requests if remaining requests are getting low 659 | limit_remaining = int(r.headers.get("x-ratelimit-remaining", 0)) 660 | if args.throttle_limit and limit_remaining <= args.throttle_limit: 661 | logger.info( 662 | "API request limit hit: {} requests left, pausing further requests for {}s".format( 663 | limit_remaining, args.throttle_pause 664 | ) 665 | ) 666 | time.sleep(args.throttle_pause) 667 | 668 | retries = 0 669 | while retries < 3 and (status_code == 502 or read_error): 670 | logger.warning("API request failed. Retrying in 5 seconds") 671 | retries += 1 672 | time.sleep(5) 673 | request = _construct_request( 674 | request_per_page, 675 | query_args, 676 | next_url or template, 677 | auth, 678 | as_app=args.as_app, 679 | fine=True if args.token_fine is not None else False, 680 | ) # noqa 681 | r, errors = _get_response(request, auth, next_url or template) 682 | 683 | status_code = int(r.getcode()) 684 | try: 685 | response = json.loads(r.read().decode("utf-8")) 686 | read_error = False 687 | except IncompleteRead: 688 | logger.warning("Incomplete read error detected") 689 | read_error = True 690 | except json.decoder.JSONDecodeError: 691 | logger.warning("JSON decode error detected") 692 | read_error = True 693 | except TimeoutError: 694 | logger.warning("Timeout error detected") 695 | read_error = True 696 | 697 | if status_code != 200: 698 | template = "API request returned HTTP {0}: {1}" 699 | errors.append(template.format(status_code, r.reason)) 700 | raise Exception(", ".join(errors)) 701 | 702 | if read_error: 703 | template = "API request problem reading response for {0}" 704 | errors.append(template.format(request)) 705 | raise Exception(", ".join(errors)) 706 | 707 | if len(errors) == 0: 708 | if type(response) is list: 709 | for resp in response: 710 | yield resp 711 | # Parse Link header for next page URL (cursor-based pagination) 712 | link_header = r.headers.get("Link", "") 713 | next_url = None 714 | if link_header: 715 | # Parse Link header: <url>; rel="next" 716 | for link in link_header.split(","): 717 | if 'rel="next"' in link: 718 | next_url = link[link.find("<") + 1 : link.find(">")] 719 | break 720 | if not next_url: 721 | break 722 | elif type(response) is dict and single_request: 723 | yield response 724 | 725 | if len(errors) > 0: 726 | raise Exception(", ".join(errors)) 727 | 728 | if single_request: 729 | break 730 | 731 | 732 | def retrieve_data(args, template, query_args=None, single_request=False): 733 | return list(retrieve_data_gen(args, template, query_args, single_request)) 734 | 735 
| 736 | def get_query_args(query_args=None): 737 | if not query_args: 738 | query_args = {} 739 | return query_args 740 | 741 | 742 | def _get_response(request, auth, template): 743 | retry_timeout = 3 744 | errors = [] 745 | # We'll make requests in a loop so we can 746 | # delay and retry in the case of rate-limiting 747 | while True: 748 | should_continue = False 749 | try: 750 | r = urlopen(request, context=https_ctx) 751 | except HTTPError as exc: 752 | errors, should_continue = _request_http_error(exc, auth, errors) # noqa 753 | r = exc 754 | except URLError as e: 755 | logger.warning(e.reason) 756 | should_continue, retry_timeout = _request_url_error(template, retry_timeout) 757 | if not should_continue: 758 | raise 759 | except socket.error as e: 760 | logger.warning(e.strerror) 761 | should_continue, retry_timeout = _request_url_error(template, retry_timeout) 762 | if not should_continue: 763 | raise 764 | 765 | if should_continue: 766 | continue 767 | 768 | break 769 | return r, errors 770 | 771 | 772 | def _construct_request(per_page, query_args, template, auth, as_app=None, fine=False): 773 | # If template is already a full URL with query params (from Link header), use it directly 774 | if "?" in template and template.startswith("http"): 775 | request_url = template 776 | # Extract query string for logging 777 | querystring = template.split("?", 1)[1] 778 | else: 779 | # Build URL with query parameters 780 | all_query_args = {} 781 | if per_page: 782 | all_query_args["per_page"] = per_page 783 | if query_args: 784 | all_query_args.update(query_args) 785 | 786 | request_url = template 787 | if all_query_args: 788 | querystring = urlencode(all_query_args) 789 | request_url = template + "?" + querystring 790 | else: 791 | querystring = "" 792 | 793 | request = Request(request_url) 794 | if auth is not None: 795 | if not as_app: 796 | if fine: 797 | request.add_header("Authorization", "token " + auth) 798 | else: 799 | request.add_header("Authorization", "Basic ".encode("ascii") + auth) 800 | else: 801 | auth = auth.encode("ascii") 802 | request.add_header("Authorization", "token ".encode("ascii") + auth) 803 | 804 | log_url = template if "?" not in template else template.split("?")[0] 805 | if querystring: 806 | log_url += "?" + querystring 807 | logger.info("Requesting {}".format(log_url)) 808 | return request 809 | 810 | 811 | def _request_http_error(exc, auth, errors): 812 | # HTTPError behaves like a Response so we can 813 | # check the status code and headers to see exactly 814 | # what failed. 
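# For example, with x-ratelimit-reset=1700000600 and a current time of
# 1700000000, delta below is max(10, 600) = 600 seconds; a reset timestamp
# in the past (or a missing header) degenerates to the 10-second floor.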
815 | 816 | should_continue = False 817 | headers = exc.headers 818 | limit_remaining = int(headers.get("x-ratelimit-remaining", 0)) 819 | 820 | if exc.code == 403 and limit_remaining < 1: 821 | # The X-RateLimit-Reset header includes a 822 | # timestamp telling us when the limit will reset 823 | # so we can calculate how long to wait rather 824 | # than inefficiently polling: 825 | gm_now = calendar.timegm(time.gmtime()) 826 | reset = int(headers.get("x-ratelimit-reset", 0)) or gm_now 827 | # We'll never sleep for less than 10 seconds: 828 | delta = max(10, reset - gm_now) 829 | 830 | limit = headers.get("x-ratelimit-limit") 831 | logger.warning( 832 | "Exceeded rate limit of {} requests; waiting {} seconds to reset".format( 833 | limit, delta 834 | ) 835 | ) # noqa 836 | 837 | if auth is None: 838 | logger.info("Hint: Authenticate to raise your GitHub rate limit") 839 | 840 | time.sleep(delta) 841 | should_continue = True 842 | return errors, should_continue 843 | 844 | 845 | def _request_url_error(template, retry_timeout): 846 | # In case of a connection timing out, we can retry a few times, 847 | # but we won't crash; the rest of the backup can still proceed 848 | logger.info("'{}' timed out".format(template)) 849 | retry_timeout -= 1 850 | 851 | if retry_timeout >= 0: 852 | return True, retry_timeout 853 | 854 | raise Exception("'{}' timed out too many times, skipping!".format(template)) 855 | 856 | 857 | class S3HTTPRedirectHandler(HTTPRedirectHandler): 858 | """ 859 | A subclassed redirect handler for downloading GitHub assets from S3. 860 | 861 | urllib will add the Authorization header to the redirected request to S3, which will result in a 400, 862 | so we should remove said header on redirect. 863 | """ 864 | 865 | def redirect_request(self, req, fp, code, msg, headers, newurl): 866 | request = super(S3HTTPRedirectHandler, self).redirect_request( 867 | req, fp, code, msg, headers, newurl 868 | ) 869 | # Only delete Authorization header if it exists (attachments may not have it) 870 | if "Authorization" in request.headers: 871 | del request.headers["Authorization"] 872 | return request 873 | 874 | 875 | def download_file(url, path, auth, as_app=False, fine=False): 876 | # Skip downloading release assets if they already exist on disk so we don't redownload on every sync 877 | if os.path.exists(path): 878 | return 879 | 880 | request = _construct_request( 881 | per_page=None, 882 | query_args={}, 883 | template=url, 884 | auth=auth, 885 | as_app=as_app, 886 | fine=fine, 887 | ) 888 | request.add_header("Accept", "application/octet-stream") 889 | opener = build_opener(S3HTTPRedirectHandler) 890 | 891 | try: 892 | response = opener.open(request) 893 | 894 | chunk_size = 16 * 1024 895 | with open(path, "wb") as f: 896 | while True: 897 | chunk = response.read(chunk_size) 898 | if not chunk: 899 | break 900 | f.write(chunk) 901 | except HTTPError as exc: 902 | # Gracefully handle 404 responses (and others) when downloading from S3 903 | logger.warning( 904 | "Skipping download of asset {0} due to HTTPError: {1}".format( 905 | url, exc.reason 906 | ) 907 | ) 908 | except URLError as e: 909 | # Gracefully handle other URL errors 910 | logger.warning( 911 | "Skipping download of asset {0} due to URLError: {1}".format(url, e.reason) 912 | ) 913 | except socket.error as e: 914 | # Gracefully handle socket errors 915 | # TODO: Implement retry logic 916 | logger.warning( 917 | "Skipping download of asset {0} due to socket error: {1}".format( 918 | url, e.strerror 919 | ) 920 | ) 921 | 922 | 923 | def 
download_attachment_file(url, path, auth, as_app=False, fine=False): 924 | """Download attachment file directly (not via GitHub API). 925 | 926 | Similar to download_file() but for direct file URLs, not API endpoints. 927 | Attachment URLs (user-images, user-attachments) are direct downloads, 928 | not API endpoints, so we skip _construct_request() which adds API params. 929 | 930 | URL Format Support & Authentication Requirements: 931 | 932 | | URL Format | Auth Required | Notes | 933 | |----------------------------------------------|---------------|--------------------------| 934 | | github.com/user-attachments/assets/* | Private only | Modern format (2024+) | 935 | | github.com/user-attachments/files/* | Private only | Modern format (2024+) | 936 | | user-images.githubusercontent.com/* | No (public) | Legacy CDN, all eras | 937 | | private-user-images.githubusercontent.com/* | JWT in URL | Legacy private (5min) | 938 | | github.com/{owner}/{repo}/files/* | Repo filter | Old repo files | 939 | 940 | - Modern user-attachments: Requires GitHub token auth for private repos 941 | - Legacy public CDN: No auth needed/accepted (returns 400 with auth header) 942 | - Legacy private CDN: Uses JWT token embedded in URL, no GitHub token needed 943 | - Repo files: Filtered to current repository only during extraction 944 | 945 | Returns dict with metadata: 946 | - success: bool 947 | - http_status: int (200, 404, etc.) 948 | - content_type: str or None 949 | - original_filename: str or None (from Content-Disposition) 950 | - size_bytes: int or None 951 | - error: str or None 952 | """ 953 | import re 954 | from datetime import datetime, timezone 955 | 956 | metadata = { 957 | "url": url, 958 | "success": False, 959 | "http_status": None, 960 | "content_type": None, 961 | "original_filename": None, 962 | "size_bytes": None, 963 | "downloaded_at": datetime.now(timezone.utc).isoformat(), 964 | "error": None, 965 | } 966 | 967 | # Create simple request (no API query params) 968 | request = Request(url) 969 | request.add_header("Accept", "application/octet-stream") 970 | 971 | # Add authentication header only for modern github.com/user-attachments URLs 972 | # Legacy CDN URLs (user-images.githubusercontent.com) are public and don't need/accept auth 973 | # Private CDN URLs (private-user-images) use JWT tokens embedded in the URL 974 | if auth is not None and "github.com/user-attachments/" in url: 975 | if not as_app: 976 | if fine: 977 | # Fine-grained token: plain token with "token " prefix 978 | request.add_header("Authorization", "token " + auth) 979 | else: 980 | # Classic token: base64-encoded with "Basic " prefix 981 | request.add_header("Authorization", "Basic ".encode("ascii") + auth) 982 | else: 983 | # App authentication 984 | auth = auth.encode("ascii") 985 | request.add_header("Authorization", "token ".encode("ascii") + auth) 986 | 987 | # Reuse S3HTTPRedirectHandler from download_file() 988 | opener = build_opener(S3HTTPRedirectHandler) 989 | 990 | temp_path = path + ".temp" 991 | 992 | try: 993 | response = opener.open(request) 994 | metadata["http_status"] = response.getcode() 995 | 996 | # Extract Content-Type 997 | content_type = response.headers.get("Content-Type", "").split(";")[0].strip() 998 | if content_type: 999 | metadata["content_type"] = content_type 1000 | 1001 | # Extract original filename from Content-Disposition header 1002 | # Format: attachment; filename=example.mov or attachment;filename="example.mov" 1003 | content_disposition = response.headers.get("Content-Disposition", 
"") 1004 | if content_disposition: 1005 | # Match: filename=something or filename="something" or filename*=UTF-8''something 1006 | match = re.search(r'filename\*?=["\']?([^"\';\r\n]+)', content_disposition) 1007 | if match: 1008 | original_filename = match.group(1).strip() 1009 | # Handle RFC 5987 encoding: filename*=UTF-8''example.mov 1010 | if "UTF-8''" in original_filename: 1011 | original_filename = original_filename.split("UTF-8''")[1] 1012 | metadata["original_filename"] = original_filename 1013 | 1014 | # Fallback: Extract filename from final URL after redirects 1015 | # This handles user-attachments/assets URLs which redirect to S3 with filename.ext 1016 | if not metadata["original_filename"]: 1017 | from urllib.parse import urlparse, unquote 1018 | 1019 | final_url = response.geturl() 1020 | parsed = urlparse(final_url) 1021 | # Get filename from path (last component before query string) 1022 | path_parts = parsed.path.split("/") 1023 | if path_parts: 1024 | # URL might be encoded, decode it 1025 | filename_from_url = unquote(path_parts[-1]) 1026 | # Only use if it has an extension 1027 | if "." in filename_from_url: 1028 | metadata["original_filename"] = filename_from_url 1029 | 1030 | # Download file to temporary location 1031 | chunk_size = 16 * 1024 1032 | bytes_downloaded = 0 1033 | with open(temp_path, "wb") as f: 1034 | while True: 1035 | chunk = response.read(chunk_size) 1036 | if not chunk: 1037 | break 1038 | f.write(chunk) 1039 | bytes_downloaded += len(chunk) 1040 | 1041 | # Atomic rename to final location 1042 | os.replace(temp_path, path) 1043 | 1044 | metadata["size_bytes"] = bytes_downloaded 1045 | metadata["success"] = True 1046 | 1047 | except HTTPError as exc: 1048 | metadata["http_status"] = exc.code 1049 | metadata["error"] = str(exc.reason) 1050 | logger.warning( 1051 | "Skipping download of attachment {0} due to HTTPError: {1}".format( 1052 | url, exc.reason 1053 | ) 1054 | ) 1055 | except URLError as e: 1056 | metadata["error"] = str(e.reason) 1057 | logger.warning( 1058 | "Skipping download of attachment {0} due to URLError: {1}".format( 1059 | url, e.reason 1060 | ) 1061 | ) 1062 | except socket.error as e: 1063 | metadata["error"] = str(e.strerror) if hasattr(e, "strerror") else str(e) 1064 | logger.warning( 1065 | "Skipping download of attachment {0} due to socket error: {1}".format( 1066 | url, e.strerror if hasattr(e, "strerror") else str(e) 1067 | ) 1068 | ) 1069 | except Exception as e: 1070 | metadata["error"] = str(e) 1071 | logger.warning( 1072 | "Skipping download of attachment {0} due to error: {1}".format(url, str(e)) 1073 | ) 1074 | # Clean up temp file if it was partially created 1075 | if os.path.exists(temp_path): 1076 | try: 1077 | os.remove(temp_path) 1078 | except Exception: 1079 | pass 1080 | 1081 | return metadata 1082 | 1083 | 1084 | def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None): 1085 | """Extract GitHub-hosted attachment URLs from issue/PR body and comments. 1086 | 1087 | What qualifies as an attachment? 1088 | There is no "attachment" concept in the GitHub API - it's a user behavior pattern 1089 | we've identified through analysis of real-world repositories. We define attachments as: 1090 | 1091 | - User-uploaded files hosted on GitHub's CDN domains 1092 | - Found outside of code blocks (not examples/documentation) 1093 | - Matches known GitHub attachment URL patterns 1094 | 1095 | This intentionally captures bare URLs pasted by users, not just markdown/HTML syntax. 
1096 | Some false positives (example URLs in documentation) may occur - these fail gracefully 1097 | with HTTP 404 and are logged in the manifest. 1098 | 1099 | Supported URL formats: 1100 | - Modern: github.com/user-attachments/{assets,files}/* 1101 | - Legacy: user-images.githubusercontent.com/* (including private-user-images) 1102 | - Repo files: github.com/{owner}/{repo}/files/* (filtered to current repo) 1103 | - Repo assets: github.com/{owner}/{repo}/assets/* (filtered to current repo) 1104 | 1105 | Repository filtering (repo files/assets only): 1106 | - Direct match: URL is for current repository → included 1107 | - Redirect match: URL redirects to current repository → included (handles renames/transfers) 1108 | - Different repo: URL is for different repository → excluded 1109 | 1110 | Code block filtering: 1111 | - Removes fenced code blocks (```) and inline code (`) before extraction 1112 | - Prevents extracting URLs from code examples and documentation snippets 1113 | 1114 | Args: 1115 | item_data: Issue or PR data dict 1116 | issue_number: Issue/PR number for logging 1117 | repository_full_name: Full repository name (owner/repo) for filtering repo-scoped URLs 1118 | """ 1119 | import re 1120 | 1121 | urls = [] 1122 | 1123 | # Define all GitHub attachment patterns 1124 | # Stop at markdown punctuation: whitespace, ), `, ", >, < 1125 | # Trailing sentence punctuation (. ! ? , ; : ' ") is stripped in post-processing 1126 | patterns = [ 1127 | r'https://github\.com/user-attachments/(?:assets|files)/[^\s\)`"<>]+', # Modern 1128 | r'https://(?:private-)?user-images\.githubusercontent\.com/[^\s\)`"<>]+', # Legacy CDN 1129 | ] 1130 | 1131 | # Add repo-scoped patterns (will be filtered by repository later) 1132 | # These patterns match ANY repo, then we filter to current repo with redirect checking 1133 | repo_files_pattern = r'https://github\.com/[^/]+/[^/]+/files/\d+/[^\s\)`"<>]+' 1134 | repo_assets_pattern = r'https://github\.com/[^/]+/[^/]+/assets/\d+/[^\s\)`"<>]+' 1135 | patterns.append(repo_files_pattern) 1136 | patterns.append(repo_assets_pattern) 1137 | 1138 | def clean_url(url): 1139 | """Remove trailing sentence and markdown punctuation that's not part of the URL.""" 1140 | return url.rstrip(".!?,;:'\")") 1141 | 1142 | def remove_code_blocks(text): 1143 | """Remove markdown code blocks (fenced and inline) from text. 1144 | 1145 | This prevents extracting URLs from code examples like: 1146 | - Fenced code blocks: ```code``` 1147 | - Inline code: `code` 1148 | """ 1149 | # Remove fenced code blocks first (```...```) 1150 | # DOTALL flag makes . match newlines 1151 | text = re.sub(r"```.*?```", "", text, flags=re.DOTALL) 1152 | 1153 | # Remove inline code (`...`) 1154 | # Non-greedy match between backticks 1155 | text = re.sub(r"`[^`]*`", "", text) 1156 | 1157 | return text 1158 | 1159 | def is_repo_scoped_url(url): 1160 | """Check if URL is a repo-scoped attachment (files or assets).""" 1161 | return bool( 1162 | re.match(r"https://github\.com/[^/]+/[^/]+/(?:files|assets)/\d+/", url) 1163 | ) 1164 | 1165 | def check_redirect_to_current_repo(url, current_repo): 1166 | """Check if URL redirects to current repository. 1167 | 1168 | Returns True if: 1169 | - URL is already for current repo 1170 | - URL redirects (301/302) to current repo (handles renames/transfers) 1171 | 1172 | Returns False otherwise (URL is for a different repo). 
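Example: after a repository transfer, an old github.com/{old-owner}/{repo}/files/... link redirects to the new owner; if that redirect target matches current_repo, the URL is kept.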
1173 | """ 1174 | # Extract owner/repo from URL 1175 | match = re.match(r"https://github\.com/([^/]+)/([^/]+)/", url) 1176 | if not match: 1177 | return False 1178 | 1179 | url_owner, url_repo = match.groups() 1180 | url_repo_full = f"{url_owner}/{url_repo}" 1181 | 1182 | # Direct match - no need to check redirect 1183 | if url_repo_full.lower() == current_repo.lower(): 1184 | return True 1185 | 1186 | # Different repo - check if it redirects to current repo 1187 | # This handles repository transfers and renames 1188 | try: 1189 | import urllib.request 1190 | import urllib.error 1191 | 1192 | # Make HEAD request with redirect following disabled 1193 | # We need to manually handle redirects to see the Location header 1194 | request = urllib.request.Request(url, method="HEAD") 1195 | request.add_header("User-Agent", "python-github-backup") 1196 | 1197 | # Create opener that does NOT follow redirects 1198 | class NoRedirectHandler(urllib.request.HTTPRedirectHandler): 1199 | def redirect_request(self, req, fp, code, msg, headers, newurl): 1200 | return None # Don't follow redirects 1201 | 1202 | opener = urllib.request.build_opener(NoRedirectHandler) 1203 | 1204 | try: 1205 | _ = opener.open(request, timeout=10) 1206 | # Got 200 - URL works as-is but for different repo 1207 | return False 1208 | except urllib.error.HTTPError as e: 1209 | # Check if it's a redirect (301, 302, 307, 308) 1210 | if e.code in (301, 302, 307, 308): 1211 | location = e.headers.get("Location", "") 1212 | # Check if redirect points to current repo 1213 | if location: 1214 | redirect_match = re.match( 1215 | r"https://github\.com/([^/]+)/([^/]+)/", location 1216 | ) 1217 | if redirect_match: 1218 | redirect_owner, redirect_repo = redirect_match.groups() 1219 | redirect_repo_full = f"{redirect_owner}/{redirect_repo}" 1220 | return redirect_repo_full.lower() == current_repo.lower() 1221 | return False 1222 | except Exception: 1223 | # On any error (timeout, network issue, etc.), be conservative 1224 | # and exclude the URL to avoid downloading from wrong repos 1225 | return False 1226 | 1227 | # Extract from body 1228 | body = item_data.get("body") or "" 1229 | # Remove code blocks before searching for URLs 1230 | body_cleaned = remove_code_blocks(body) 1231 | for pattern in patterns: 1232 | found_urls = re.findall(pattern, body_cleaned) 1233 | urls.extend([clean_url(url) for url in found_urls]) 1234 | 1235 | # Extract from issue comments 1236 | if "comment_data" in item_data: 1237 | for comment in item_data["comment_data"]: 1238 | comment_body = comment.get("body") or "" 1239 | # Remove code blocks before searching for URLs 1240 | comment_cleaned = remove_code_blocks(comment_body) 1241 | for pattern in patterns: 1242 | found_urls = re.findall(pattern, comment_cleaned) 1243 | urls.extend([clean_url(url) for url in found_urls]) 1244 | 1245 | # Extract from PR regular comments 1246 | if "comment_regular_data" in item_data: 1247 | for comment in item_data["comment_regular_data"]: 1248 | comment_body = comment.get("body") or "" 1249 | # Remove code blocks before searching for URLs 1250 | comment_cleaned = remove_code_blocks(comment_body) 1251 | for pattern in patterns: 1252 | found_urls = re.findall(pattern, comment_cleaned) 1253 | urls.extend([clean_url(url) for url in found_urls]) 1254 | 1255 | regex_urls = list(set(urls)) # dedupe 1256 | 1257 | # Filter repo-scoped URLs to current repository only 1258 | # This handles repository transfers/renames via redirect checking 1259 | if repository_full_name: 1260 | filtered_urls = 
[] 1261 | for url in regex_urls: 1262 | if is_repo_scoped_url(url): 1263 | # Check if URL belongs to current repo (or redirects to it) 1264 | if check_redirect_to_current_repo(url, repository_full_name): 1265 | filtered_urls.append(url) 1266 | # else: skip URLs from other repositories 1267 | else: 1268 | # Non-repo-scoped URLs (user-attachments, CDN) - always include 1269 | filtered_urls.append(url) 1270 | regex_urls = filtered_urls 1271 | 1272 | return regex_urls 1273 | 1274 | 1275 | def get_attachment_filename(url): 1276 | """Get filename from attachment URL, handling all GitHub formats. 1277 | 1278 | Formats: 1279 | - github.com/user-attachments/assets/{uuid} → uuid (add extension later) 1280 | - github.com/user-attachments/files/{id}/{filename} → filename 1281 | - github.com/{owner}/{repo}/files/{id}/{filename} → filename 1282 | - user-images.githubusercontent.com/{user}/{hash}.{ext} → hash.ext 1283 | - private-user-images.githubusercontent.com/...?jwt=... → extract from path 1284 | """ 1285 | from urllib.parse import urlparse 1286 | 1287 | parsed = urlparse(url) 1288 | path_parts = parsed.path.split("/") 1289 | 1290 | # Modern: /user-attachments/files/{id}/{filename} 1291 | if "user-attachments/files" in parsed.path: 1292 | return path_parts[-1] 1293 | 1294 | # Modern: /user-attachments/assets/{uuid} 1295 | elif "user-attachments/assets" in parsed.path: 1296 | return path_parts[-1] # extension added later via detect_and_add_extension 1297 | 1298 | # Repo files: /{owner}/{repo}/files/{id}/{filename} 1299 | elif "/files/" in parsed.path and len(path_parts) >= 2: 1300 | return path_parts[-1] 1301 | 1302 | # Legacy: user-images.githubusercontent.com/{user}/{hash-with-ext} 1303 | elif "githubusercontent.com" in parsed.netloc: 1304 | return path_parts[-1] # Already has extension usually 1305 | 1306 | # Fallback: use last path component 1307 | return path_parts[-1] if path_parts[-1] else "unknown_attachment" 1308 | 1309 | 1310 | def resolve_filename_collision(filepath): 1311 | """Resolve filename collisions using counter suffix pattern. 1312 | 1313 | If filepath exists, returns a new filepath with counter suffix. 1314 | Pattern: report.pdf → report_1.pdf → report_2.pdf 1315 | 1316 | Also protects against manifest.json collisions by treating it as reserved. 1317 | 1318 | Args: 1319 | filepath: Full path to file that might exist 1320 | 1321 | Returns: 1322 | filepath that doesn't collide (may be same as input if no collision) 1323 | """ 1324 | directory = os.path.dirname(filepath) 1325 | filename = os.path.basename(filepath) 1326 | 1327 | # Protect manifest.json - it's a reserved filename 1328 | if filename == "manifest.json": 1329 | name, ext = os.path.splitext(filename) 1330 | counter = 1 1331 | while True: 1332 | new_filename = f"{name}_{counter}{ext}" 1333 | new_filepath = os.path.join(directory, new_filename) 1334 | if not os.path.exists(new_filepath): 1335 | return new_filepath 1336 | counter += 1 1337 | 1338 | if not os.path.exists(filepath): 1339 | return filepath 1340 | 1341 | name, ext = os.path.splitext(filename) 1342 | 1343 | counter = 1 1344 | while True: 1345 | new_filename = f"{name}_{counter}{ext}" 1346 | new_filepath = os.path.join(directory, new_filename) 1347 | if not os.path.exists(new_filepath): 1348 | return new_filepath 1349 | counter += 1 1350 | 1351 | 1352 | def download_attachments( 1353 | args, item_cwd, item_data, number, repository, item_type="issue" 1354 | ): 1355 | """Download user-attachments from issue/PR body and comments with manifest. 
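A manifest.json inside attachments/{number}/ records every download attempt; URLs that already succeeded, or that failed permanently (HTTP 404/410/451), are skipped on later runs, while transient failures (5xx, timeouts) are retried.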
1356 | 1357 | Args: 1358 | args: Command line arguments 1359 | item_cwd: Working directory (issue_cwd or pulls_cwd) 1360 | item_data: Issue or PR data dict 1361 | number: Issue or PR number 1362 | repository: Repository dict 1363 | item_type: "issue" or "pull" for logging/manifest 1364 | """ 1365 | import json 1366 | from datetime import datetime, timezone 1367 | 1368 | item_type_display = "issue" if item_type == "issue" else "pull request" 1369 | 1370 | urls = extract_attachment_urls( 1371 | item_data, issue_number=number, repository_full_name=repository["full_name"] 1372 | ) 1373 | if not urls: 1374 | return 1375 | 1376 | attachments_dir = os.path.join(item_cwd, "attachments", str(number)) 1377 | manifest_path = os.path.join(attachments_dir, "manifest.json") 1378 | 1379 | # Load existing manifest to prevent duplicate downloads 1380 | existing_urls = set() 1381 | existing_metadata = [] 1382 | if os.path.exists(manifest_path): 1383 | try: 1384 | with open(manifest_path, "r") as f: 1385 | existing_manifest = json.load(f) 1386 | all_metadata = existing_manifest.get("attachments", []) 1387 | # Only skip URLs that were successfully downloaded OR failed with permanent errors 1388 | # Retry transient failures (5xx, timeouts, network errors) 1389 | for item in all_metadata: 1390 | if item.get("success"): 1391 | existing_urls.add(item["url"]) 1392 | else: 1393 | # Check if this is a permanent failure (don't retry) or transient (retry) 1394 | http_status = item.get("http_status") 1395 | if http_status in [404, 410, 451]: 1396 | # Permanent failures - don't retry 1397 | existing_urls.add(item["url"]) 1398 | # Transient failures (5xx, auth errors, timeouts) will be retried 1399 | existing_metadata = all_metadata 1400 | except (json.JSONDecodeError, IOError): 1401 | # If manifest is corrupted, re-download everything 1402 | logger.warning( 1403 | "Corrupted manifest for {0} #{1}, will re-download".format( 1404 | item_type_display, number 1405 | ) 1406 | ) 1407 | existing_urls = set() 1408 | existing_metadata = [] 1409 | 1410 | # Filter to only new URLs 1411 | new_urls = [url for url in urls if url not in existing_urls] 1412 | 1413 | if not new_urls and existing_urls: 1414 | logger.debug( 1415 | "Skipping attachments for {0} #{1} (all {2} already downloaded)".format( 1416 | item_type_display, number, len(urls) 1417 | ) 1418 | ) 1419 | return 1420 | 1421 | if new_urls: 1422 | logger.info( 1423 | "Downloading {0} new attachment(s) for {1} #{2}".format( 1424 | len(new_urls), item_type_display, number 1425 | ) 1426 | ) 1427 | 1428 | mkdir_p(item_cwd, attachments_dir) 1429 | 1430 | # Collect metadata for manifest (start with existing) 1431 | attachment_metadata_list = existing_metadata[:] 1432 | 1433 | for url in new_urls: 1434 | filename = get_attachment_filename(url) 1435 | filepath = os.path.join(attachments_dir, filename) 1436 | 1437 | # Download and get metadata 1438 | metadata = download_attachment_file( 1439 | url, 1440 | filepath, 1441 | get_auth(args, encode=not args.as_app), 1442 | as_app=args.as_app, 1443 | fine=args.token_fine is not None, 1444 | ) 1445 | 1446 | # If download succeeded but we got an extension from Content-Disposition, 1447 | # we may need to rename the file to add the extension 1448 | if metadata["success"] and metadata.get("original_filename"): 1449 | original_ext = os.path.splitext(metadata["original_filename"])[1] 1450 | current_ext = os.path.splitext(filepath)[1] 1451 | 1452 | # Add extension if not present 1453 | if original_ext and current_ext != original_ext: 1454 | 
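# user-attachments/assets URLs save under a bare UUID with no extension,
# so the extension learned from Content-Disposition (or the redirect
# target URL) is appended here to give the saved file a usable suffix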
final_filepath = filepath + original_ext 1455 | # Check for collision again with new extension 1456 | final_filepath = resolve_filename_collision(final_filepath) 1457 | logger.debug( 1458 | "Adding extension {0} to {1}".format(original_ext, filepath) 1459 | ) 1460 | 1461 | # Rename to add extension (already atomic from download) 1462 | try: 1463 | os.replace(filepath, final_filepath) 1464 | metadata["saved_as"] = os.path.basename(final_filepath) 1465 | except Exception as e: 1466 | logger.warning( 1467 | "Could not add extension to {0}: {1}".format(filepath, str(e)) 1468 | ) 1469 | metadata["saved_as"] = os.path.basename(filepath) 1470 | else: 1471 | metadata["saved_as"] = os.path.basename(filepath) 1472 | elif metadata["success"]: 1473 | metadata["saved_as"] = os.path.basename(filepath) 1474 | else: 1475 | metadata["saved_as"] = None 1476 | 1477 | attachment_metadata_list.append(metadata) 1478 | 1479 | # Write manifest 1480 | if attachment_metadata_list: 1481 | manifest = { 1482 | "issue_number": number, 1483 | "issue_type": item_type, 1484 | "repository": ( 1485 | f"{args.user}/{args.repository}" 1486 | if hasattr(args, "repository") and args.repository 1487 | else args.user 1488 | ), 1489 | "manifest_updated_at": datetime.now(timezone.utc).isoformat(), 1490 | "attachments": attachment_metadata_list, 1491 | } 1492 | 1493 | manifest_path = os.path.join(attachments_dir, "manifest.json") 1494 | with open(manifest_path + ".temp", "w") as f: 1495 | json.dump(manifest, f, indent=2) 1496 | os.replace(manifest_path + ".temp", manifest_path) # Atomic write 1497 | logger.debug( 1498 | "Wrote manifest for {0} #{1}: {2} attachments".format( 1499 | item_type_display, number, len(attachment_metadata_list) 1500 | ) 1501 | ) 1502 | 1503 | 1504 | def get_authenticated_user(args): 1505 | template = "https://{0}/user".format(get_github_api_host(args)) 1506 | data = retrieve_data(args, template, single_request=True) 1507 | return data[0] 1508 | 1509 | 1510 | def check_git_lfs_install(): 1511 | exit_code = subprocess.call(["git", "lfs", "version"]) 1512 | if exit_code != 0: 1513 | raise Exception( 1514 | "The argument --lfs requires you to have Git LFS installed.\nYou can get it from https://git-lfs.github.com." 
1515 | ) 1516 | 1517 | 1518 | def retrieve_repositories(args, authenticated_user): 1519 | logger.info("Retrieving repositories") 1520 | single_request = False 1521 | if args.user == authenticated_user["login"]: 1522 | # we must use the /user/repos API to be able to access private repos 1523 | template = "https://{0}/user/repos".format(get_github_api_host(args)) 1524 | else: 1525 | if args.private and not args.organization: 1526 | logger.warning( 1527 | "Authenticated user is different from user being backed up, thus private repositories cannot be accessed" 1528 | ) 1529 | template = "https://{0}/users/{1}/repos".format( 1530 | get_github_api_host(args), args.user 1531 | ) 1532 | 1533 | if args.organization: 1534 | template = "https://{0}/orgs/{1}/repos".format( 1535 | get_github_api_host(args), args.user 1536 | ) 1537 | 1538 | if args.repository: 1539 | if "/" in args.repository: 1540 | repo_path = args.repository 1541 | else: 1542 | repo_path = "{0}/{1}".format(args.user, args.repository) 1543 | single_request = True 1544 | template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path) 1545 | 1546 | repos = retrieve_data(args, template, single_request=single_request) 1547 | 1548 | if args.all_starred: 1549 | starred_template = "https://{0}/users/{1}/starred".format( 1550 | get_github_api_host(args), args.user 1551 | ) 1552 | starred_repos = retrieve_data(args, starred_template, single_request=False) 1553 | # flag each repo as starred for downstream processing 1554 | for item in starred_repos: 1555 | item.update({"is_starred": True}) 1556 | repos.extend(starred_repos) 1557 | 1558 | if args.include_gists: 1559 | gists_template = "https://{0}/users/{1}/gists".format( 1560 | get_github_api_host(args), args.user 1561 | ) 1562 | gists = retrieve_data(args, gists_template, single_request=False) 1563 | # flag each repo as a gist for downstream processing 1564 | for item in gists: 1565 | item.update({"is_gist": True}) 1566 | repos.extend(gists) 1567 | 1568 | if args.include_starred_gists: 1569 | if ( 1570 | not authenticated_user.get("login") 1571 | or args.user.lower() != authenticated_user["login"].lower() 1572 | ): 1573 | logger.warning( 1574 | "Cannot retrieve starred gists for '%s'. 
GitHub only allows access to the authenticated user's starred gists.", 1575 | args.user, 1576 | ) 1577 | else: 1578 | starred_gists_template = "https://{0}/gists/starred".format( 1579 | get_github_api_host(args) 1580 | ) 1581 | starred_gists = retrieve_data( 1582 | args, starred_gists_template, single_request=False 1583 | ) 1584 | # flag each repo as a starred gist for downstream processing 1585 | for item in starred_gists: 1586 | item.update({"is_gist": True, "is_starred": True}) 1587 | repos.extend(starred_gists) 1588 | 1589 | return repos 1590 | 1591 | 1592 | def filter_repositories(args, unfiltered_repositories): 1593 | if args.repository: 1594 | return unfiltered_repositories 1595 | logger.info("Filtering repositories") 1596 | 1597 | repositories = [] 1598 | for r in unfiltered_repositories: 1599 | # gists can be anonymous, so need to safely check owner 1600 | # Use case-insensitive comparison to match GitHub's case-insensitive username behavior 1601 | owner_login = r.get("owner", {}).get("login", "") 1602 | if owner_login.lower() == args.user.lower() or r.get("is_starred"): 1603 | repositories.append(r) 1604 | 1605 | name_regex = None 1606 | if args.name_regex: 1607 | name_regex = re.compile(args.name_regex) 1608 | 1609 | languages = None 1610 | if args.languages: 1611 | languages = [x.lower() for x in args.languages] 1612 | 1613 | if not args.fork: 1614 | repositories = [r for r in repositories if not r.get("fork")] 1615 | if not args.private: 1616 | repositories = [ 1617 | r for r in repositories if not r.get("private") or r.get("public") 1618 | ] 1619 | if languages: 1620 | repositories = [ 1621 | r 1622 | for r in repositories 1623 | if r.get("language") and r.get("language").lower() in languages 1624 | ] # noqa 1625 | if name_regex: 1626 | repositories = [ 1627 | r for r in repositories if "name" not in r or name_regex.match(r["name"]) 1628 | ] 1629 | if args.skip_archived: 1630 | repositories = [r for r in repositories if not r.get("archived")] 1631 | if args.exclude: 1632 | repositories = [ 1633 | r for r in repositories if "name" not in r or r["name"] not in args.exclude 1634 | ] 1635 | 1636 | return repositories 1637 | 1638 | 1639 | def backup_repositories(args, output_directory, repositories): 1640 | logger.info("Backing up repositories") 1641 | repos_template = "https://{0}/repos".format(get_github_api_host(args)) 1642 | 1643 | if args.incremental: 1644 | last_update_path = os.path.join(output_directory, "last_update") 1645 | if os.path.exists(last_update_path): 1646 | args.since = open(last_update_path).read().strip() 1647 | else: 1648 | args.since = None 1649 | else: 1650 | args.since = None 1651 | 1652 | last_update = "0000-00-00T00:00:00Z" 1653 | for repository in repositories: 1654 | if "updated_at" in repository and repository["updated_at"] > last_update: 1655 | last_update = repository["updated_at"] 1656 | elif "pushed_at" in repository and repository["pushed_at"] > last_update: 1657 | last_update = repository["pushed_at"] 1658 | 1659 | if repository.get("is_gist"): 1660 | repo_cwd = os.path.join(output_directory, "gists", repository["id"]) 1661 | elif repository.get("is_starred"): 1662 | # put starred repos in -o/starred/${owner}/${repo} to prevent collision of 1663 | # any repositories with the same name 1664 | repo_cwd = os.path.join( 1665 | output_directory, 1666 | "starred", 1667 | repository["owner"]["login"], 1668 | repository["name"], 1669 | ) 1670 | else: 1671 | repo_cwd = os.path.join( 1672 | output_directory, "repositories", repository["name"] 1673 | ) 
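# resulting layout under the output directory:
#   gists/{gist id}/, starred/{owner}/{repo}/, repositories/{repo name}/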
1674 | 1675 | repo_dir = os.path.join(repo_cwd, "repository") 1676 | repo_url = get_github_repo_url(args, repository) 1677 | 1678 | include_gists = args.include_gists or args.include_starred_gists 1679 | include_starred = args.all_starred and repository.get("is_starred") 1680 | if ( 1681 | (args.include_repository or args.include_everything) 1682 | or (include_gists and repository.get("is_gist")) 1683 | or include_starred 1684 | ): 1685 | repo_name = ( 1686 | repository.get("name") 1687 | if not repository.get("is_gist") 1688 | else repository.get("id") 1689 | ) 1690 | fetch_repository( 1691 | repo_name, 1692 | repo_url, 1693 | repo_dir, 1694 | skip_existing=args.skip_existing, 1695 | bare_clone=args.bare_clone, 1696 | lfs_clone=args.lfs_clone, 1697 | no_prune=args.no_prune, 1698 | ) 1699 | 1700 | if repository.get("is_gist"): 1701 | # dump gist information to a file as well 1702 | output_file = "{0}/gist.json".format(repo_cwd) 1703 | with codecs.open(output_file, "w", encoding="utf-8") as f: 1704 | json_dump(repository, f) 1705 | 1706 | continue # don't try to back up anything else for a gist; it doesn't exist 1707 | 1708 | try: 1709 | download_wiki = args.include_wiki or args.include_everything 1710 | if repository["has_wiki"] and download_wiki: 1711 | fetch_repository( 1712 | repository["name"], 1713 | repo_url.replace(".git", ".wiki.git"), 1714 | os.path.join(repo_cwd, "wiki"), 1715 | skip_existing=args.skip_existing, 1716 | bare_clone=args.bare_clone, 1717 | lfs_clone=args.lfs_clone, 1718 | no_prune=args.no_prune, 1719 | ) 1720 | if args.include_issues or args.include_everything: 1721 | backup_issues(args, repo_cwd, repository, repos_template) 1722 | 1723 | if args.include_pulls or args.include_everything: 1724 | backup_pulls(args, repo_cwd, repository, repos_template) 1725 | 1726 | if args.include_milestones or args.include_everything: 1727 | backup_milestones(args, repo_cwd, repository, repos_template) 1728 | 1729 | if args.include_labels or args.include_everything: 1730 | backup_labels(args, repo_cwd, repository, repos_template) 1731 | 1732 | if args.include_hooks or args.include_everything: 1733 | backup_hooks(args, repo_cwd, repository, repos_template) 1734 | 1735 | if args.include_releases or args.include_everything: 1736 | backup_releases( 1737 | args, 1738 | repo_cwd, 1739 | repository, 1740 | repos_template, 1741 | include_assets=args.include_assets or args.include_everything, 1742 | ) 1743 | except RepositoryUnavailableError as e: 1744 | logger.warning( 1745 | f"Repository {repository['full_name']} is unavailable (HTTP 451)" 1746 | ) 1747 | if e.dmca_url: 1748 | logger.warning(f"DMCA notice: {e.dmca_url}") 1749 | logger.info(f"Skipping remaining resources for {repository['full_name']}") 1750 | continue 1751 | 1752 | if args.incremental: 1753 | if last_update == "0000-00-00T00:00:00Z": 1754 | last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())  # UTC, matching the trailing "Z" 1755 | 1756 | open(last_update_path, "w").write(last_update) 1757 | 1758 | 1759 | def backup_issues(args, repo_cwd, repository, repos_template): 1760 | has_issues_dir = os.path.isdir("{0}/issues/.git".format(repo_cwd)) 1761 | if args.skip_existing and has_issues_dir: 1762 | return 1763 | 1764 | logger.info("Retrieving {0} issues".format(repository["full_name"])) 1765 | issue_cwd = os.path.join(repo_cwd, "issues") 1766 | mkdir_p(repo_cwd, issue_cwd) 1767 | 1768 | issues = {} 1769 | issues_skipped = 0 1770 | issues_skipped_message = "" 1771 | _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) 
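# the issues API also returns pull requests; when PRs are backed up
# separately (see should_include_pulls below) they are skipped here so
# each item is written only once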
1772 | 1773 | should_include_pulls = args.include_pulls or args.include_everything 1774 | issue_states = ["open", "closed"] 1775 | for issue_state in issue_states: 1776 | query_args = {"filter": "all", "state": issue_state} 1777 | if args.since: 1778 | query_args["since"] = args.since 1779 | 1780 | _issues = retrieve_data(args, _issue_template, query_args=query_args) 1781 | for issue in _issues: 1782 | # skip pull requests which are also returned as issues 1783 | # if retrieving pull requests is requested as well 1784 | if "pull_request" in issue and should_include_pulls: 1785 | issues_skipped += 1 1786 | continue 1787 | 1788 | issues[issue["number"]] = issue 1789 | 1790 | if issues_skipped: 1791 | issues_skipped_message = " (skipped {0} pull requests)".format(issues_skipped) 1792 | 1793 | logger.info( 1794 | "Saving {0} issues to disk{1}".format( 1795 | len(list(issues.keys())), issues_skipped_message 1796 | ) 1797 | ) 1798 | comments_template = _issue_template + "/{0}/comments" 1799 | events_template = _issue_template + "/{0}/events" 1800 | for number, issue in list(issues.items()): 1801 | issue_file = "{0}/{1}.json".format(issue_cwd, number) 1802 | if args.incremental_by_files and os.path.isfile(issue_file): 1803 | modified = os.path.getmtime(issue_file) 1804 | modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") 1805 | if modified > issue["updated_at"]: 1806 | logger.info( 1807 | "Skipping issue {0} because it wasn't modified since last backup".format( 1808 | number 1809 | ) 1810 | ) 1811 | continue 1812 | 1813 | if args.include_issue_comments or args.include_everything: 1814 | template = comments_template.format(number) 1815 | issues[number]["comment_data"] = retrieve_data(args, template) 1816 | if args.include_issue_events or args.include_everything: 1817 | template = events_template.format(number) 1818 | issues[number]["event_data"] = retrieve_data(args, template) 1819 | if args.include_attachments: 1820 | download_attachments( 1821 | args, issue_cwd, issues[number], number, repository, item_type="issue" 1822 | ) 1823 | 1824 | with codecs.open(issue_file + ".temp", "w", encoding="utf-8") as f: 1825 | json_dump(issue, f) 1826 | os.replace(issue_file + ".temp", issue_file) # Atomic write 1827 | 1828 | 1829 | def backup_pulls(args, repo_cwd, repository, repos_template): 1830 | has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd)) 1831 | if args.skip_existing and has_pulls_dir: 1832 | return 1833 | 1834 | logger.info("Retrieving {0} pull requests".format(repository["full_name"])) # noqa 1835 | pulls_cwd = os.path.join(repo_cwd, "pulls") 1836 | mkdir_p(repo_cwd, pulls_cwd) 1837 | 1838 | pulls = {} 1839 | _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"]) 1840 | _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) 1841 | query_args = { 1842 | "filter": "all", 1843 | "state": "all", 1844 | "sort": "updated", 1845 | "direction": "desc", 1846 | } 1847 | 1848 | if not args.include_pull_details: 1849 | pull_states = ["open", "closed"] 1850 | for pull_state in pull_states: 1851 | query_args["state"] = pull_state 1852 | _pulls = retrieve_data_gen(args, _pulls_template, query_args=query_args) 1853 | for pull in _pulls: 1854 | if args.since and pull["updated_at"] < args.since: 1855 | break 1856 | if not args.since or pull["updated_at"] >= args.since: 1857 | pulls[pull["number"]] = pull 1858 | else: 1859 | _pulls = retrieve_data_gen(args, _pulls_template, query_args=query_args) 1860 | for pull in _pulls: 
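# results are sorted by updated_at in descending order, so as soon as
# one pull request is older than --since, every later one is too and
# pagination can stop early via the break below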
1861 | if args.since and pull["updated_at"] < args.since: 1862 | break 1863 | if not args.since or pull["updated_at"] >= args.since: 1864 | pulls[pull["number"]] = retrieve_data( 1865 | args, 1866 | _pulls_template + "/{}".format(pull["number"]), 1867 | single_request=True, 1868 | )[0] 1869 | 1870 | logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys())))) 1871 | # Comments from pulls API are only _review_ comments 1872 | # regular comments need to be fetched via issue API. 1873 | # For backwards compatibility with versions <= 0.41.0 1874 | # keep name "comment_data" for review comments 1875 | comments_regular_template = _issue_template + "/{0}/comments" 1876 | comments_template = _pulls_template + "/{0}/comments" 1877 | commits_template = _pulls_template + "/{0}/commits" 1878 | for number, pull in list(pulls.items()): 1879 | pull_file = "{0}/{1}.json".format(pulls_cwd, number) 1880 | if args.incremental_by_files and os.path.isfile(pull_file): 1881 | modified = os.path.getmtime(pull_file) 1882 | modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") 1883 | if modified > pull["updated_at"]: 1884 | logger.info( 1885 | "Skipping pull request {0} because it wasn't modified since last backup".format( 1886 | number 1887 | ) 1888 | ) 1889 | continue 1890 | if args.include_pull_comments or args.include_everything: 1891 | template = comments_regular_template.format(number) 1892 | pulls[number]["comment_regular_data"] = retrieve_data(args, template) 1893 | template = comments_template.format(number) 1894 | pulls[number]["comment_data"] = retrieve_data(args, template) 1895 | if args.include_pull_commits or args.include_everything: 1896 | template = commits_template.format(number) 1897 | pulls[number]["commit_data"] = retrieve_data(args, template) 1898 | if args.include_attachments: 1899 | download_attachments( 1900 | args, pulls_cwd, pulls[number], number, repository, item_type="pull" 1901 | ) 1902 | 1903 | with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f: 1904 | json_dump(pull, f) 1905 | os.replace(pull_file + ".temp", pull_file) # Atomic write 1906 | 1907 | 1908 | def backup_milestones(args, repo_cwd, repository, repos_template): 1909 | milestone_cwd = os.path.join(repo_cwd, "milestones") 1910 | if args.skip_existing and os.path.isdir(milestone_cwd): 1911 | return 1912 | 1913 | logger.info("Retrieving {0} milestones".format(repository["full_name"])) 1914 | mkdir_p(repo_cwd, milestone_cwd) 1915 | 1916 | template = "{0}/{1}/milestones".format(repos_template, repository["full_name"]) 1917 | 1918 | query_args = {"state": "all"} 1919 | 1920 | _milestones = retrieve_data(args, template, query_args=query_args) 1921 | 1922 | milestones = {} 1923 | for milestone in _milestones: 1924 | milestones[milestone["number"]] = milestone 1925 | 1926 | written_count = 0 1927 | for number, milestone in list(milestones.items()): 1928 | milestone_file = "{0}/{1}.json".format(milestone_cwd, number) 1929 | if json_dump_if_changed(milestone, milestone_file): 1930 | written_count += 1 1931 | 1932 | total = len(milestones) 1933 | if written_count == total: 1934 | logger.info("Saved {0} milestones to disk".format(total)) 1935 | elif written_count == 0: 1936 | logger.info("{0} milestones unchanged, skipped write".format(total)) 1937 | else: 1938 | logger.info( 1939 | "Saved {0} of {1} milestones to disk ({2} unchanged)".format( 1940 | written_count, total, total - written_count 1941 | ) 1942 | ) 1943 | 1944 | 1945 | def backup_labels(args, repo_cwd, repository, 

def backup_labels(args, repo_cwd, repository, repos_template):
    """Back up a repository's labels to a single labels.json file."""
    label_cwd = os.path.join(repo_cwd, "labels")
    output_file = "{0}/labels.json".format(label_cwd)
    template = "{0}/{1}/labels".format(repos_template, repository["full_name"])
    _backup_data(args, "labels", template, output_file, label_cwd)


def backup_hooks(args, repo_cwd, repository, repos_template):
    """Back up a repository's webhooks to a single hooks.json file (requires auth)."""
    auth = get_auth(args)
    if not auth:
        logger.info("Skipping hooks since no authentication provided")
        return
    hook_cwd = os.path.join(repo_cwd, "hooks")
    output_file = "{0}/hooks.json".format(hook_cwd)
    template = "{0}/{1}/hooks".format(repos_template, repository["full_name"])
    try:
        _backup_data(args, "hooks", template, output_file, hook_cwd)
    except Exception as e:
        if "404" in str(e):
            logger.info("Unable to read hooks, skipping")
        else:
            raise
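
# backup_labels and backup_hooks store each collection as one aggregate JSON
# file via _backup_data (defined further down), unlike issues, pulls, and
# milestones, which write one file per number. The resulting on-disk layout
# looks roughly like this (illustrative paths):
#
#     <repo_cwd>/
#         labels/labels.json        # whole collection in one file
#         hooks/hooks.json
#         issues/1.json, 2.json ... # one file per issue number
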

def backup_releases(args, repo_cwd, repository, repos_template, include_assets=False):
    """Back up a repository's releases, optionally including their assets."""
    repository_fullname = repository["full_name"]

    # give release files somewhere to live & log intent
    release_cwd = os.path.join(repo_cwd, "releases")
    logger.info("Retrieving {0} releases".format(repository_fullname))
    mkdir_p(repo_cwd, release_cwd)

    query_args = {}

    release_template = "{0}/{1}/releases".format(repos_template, repository_fullname)
    releases = retrieve_data(args, release_template, query_args=query_args)

    if args.skip_prerelease:
        releases = [r for r in releases if not r["prerelease"] and not r["draft"]]

    if args.number_of_latest_releases and args.number_of_latest_releases < len(
        releases
    ):
        releases.sort(
            key=lambda item: datetime.strptime(
                item["created_at"], "%Y-%m-%dT%H:%M:%SZ"
            ),
            reverse=True,
        )
        releases = releases[: args.number_of_latest_releases]

    # Check if this repo should skip asset downloads (case-insensitive)
    skip_assets = False
    if include_assets:
        repo_name = repository.get("name", "").lower()
        repo_full_name = repository.get("full_name", "").lower()
        skip_repos = [r.lower() for r in (args.skip_assets_on or [])]
        skip_assets = repo_name in skip_repos or repo_full_name in skip_repos
        if skip_assets:
            logger.info(
                "Skipping assets for {0} ({1} releases) due to --skip-assets-on".format(
                    repository.get("name"), len(releases)
                )
            )

    # for each release, store it
    written_count = 0
    for release in releases:
        release_name = release["tag_name"]
        release_name_safe = release_name.replace("/", "__")
        output_filepath = os.path.join(
            release_cwd, "{0}.json".format(release_name_safe)
        )
        if json_dump_if_changed(release, output_filepath):
            written_count += 1

        if include_assets and not skip_assets:
            assets = retrieve_data(args, release["assets_url"])
            if len(assets) > 0:
                # give release asset files somewhere to live & download them
                # (not including source archives)
                release_assets_cwd = os.path.join(release_cwd, release_name_safe)
                mkdir_p(release_assets_cwd)
                for asset in assets:
                    download_file(
                        asset["url"],
                        os.path.join(release_assets_cwd, asset["name"]),
                        get_auth(args, encode=not args.as_app),
                        as_app=args.as_app,
                        fine=args.token_fine is not None,
                    )

    # Log the results
    total = len(releases)
    if written_count == total:
        logger.info("Saved {0} releases to disk".format(total))
    elif written_count == 0:
        logger.info("{0} releases unchanged, skipped write".format(total))
    else:
        logger.info(
            "Saved {0} of {1} releases to disk ({2} unchanged)".format(
                written_count, total, total - written_count
            )
        )


def fetch_repository(
    name,
    remote_url,
    local_dir,
    skip_existing=False,
    bare_clone=False,
    lfs_clone=False,
    no_prune=False,
):
    """Clone a repository from remote_url into local_dir, or update an existing clone."""
    if bare_clone:
        if os.path.exists(local_dir):
            clone_exists = (
                subprocess.check_output(
                    ["git", "rev-parse", "--is-bare-repository"], cwd=local_dir
                )
                == b"true\n"
            )
        else:
            clone_exists = False
    else:
        clone_exists = os.path.exists(os.path.join(local_dir, ".git"))

    if clone_exists and skip_existing:
        return

    masked_remote_url = mask_password(remote_url)

    initialized = subprocess.call(
        "git ls-remote " + remote_url, stdout=FNULL, stderr=FNULL, shell=True
    )
    if initialized == 128:
        if ".wiki.git" in remote_url:
            logger.info(
                "Skipping {0} wiki (wiki is enabled but has no content)".format(name)
            )
        else:
            logger.info(
                "Skipping {0} (repository not accessible - may be empty, private, or credentials invalid)".format(
                    name
                )
            )
        return

    if clone_exists:
        logger.info("Updating {0} in {1}".format(name, local_dir))

        remotes = subprocess.check_output(["git", "remote", "show"], cwd=local_dir)
        remotes = [i.strip() for i in remotes.decode("utf-8").splitlines()]

        if "origin" not in remotes:
            git_command = ["git", "remote", "rm", "origin"]
            logging_subprocess(git_command, cwd=local_dir)
            git_command = ["git", "remote", "add", "origin", remote_url]
            logging_subprocess(git_command, cwd=local_dir)
        else:
            git_command = ["git", "remote", "set-url", "origin", remote_url]
            logging_subprocess(git_command, cwd=local_dir)

        git_command = ["git", "fetch", "--all", "--force", "--tags", "--prune"]
        if no_prune:
            git_command.pop()  # drop the trailing --prune
        logging_subprocess(git_command, cwd=local_dir)
        if lfs_clone:
            git_command = ["git", "lfs", "fetch", "--all", "--prune"]
            if no_prune:
                git_command.pop()
            logging_subprocess(git_command, cwd=local_dir)
    else:
        logger.info(
            "Cloning {0} repository from {1} to {2}".format(
                name, masked_remote_url, local_dir
            )
        )
        if bare_clone:
            git_command = ["git", "clone", "--mirror", remote_url, local_dir]
            logging_subprocess(git_command)
            if lfs_clone:
                git_command = ["git", "lfs", "fetch", "--all", "--prune"]
                if no_prune:
                    git_command.pop()
                logging_subprocess(git_command, cwd=local_dir)
        else:
            git_command = ["git", "clone", remote_url, local_dir]
            logging_subprocess(git_command)
            if lfs_clone:
                git_command = ["git", "lfs", "fetch", "--all", "--prune"]
                if no_prune:
                    git_command.pop()
                logging_subprocess(git_command, cwd=local_dir)
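
# fetch_repository recognizes an existing mirror by asking git directly,
# since a bare clone has no ".git" subdirectory to test for. A minimal
# sketch of the check (assuming /backups/repo.git is a previous --mirror
# clone; git prints "true" followed by a newline):
#
#     >>> subprocess.check_output(
#     ...     ["git", "rev-parse", "--is-bare-repository"], cwd="/backups/repo.git"
#     ... )
#     b'true\n'
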

def backup_account(args, output_directory):
    """Back up account-level data (starred, watched, followers, following)."""
    account_cwd = os.path.join(output_directory, "account")

    if args.include_starred or args.include_everything:
        output_file = "{0}/starred.json".format(account_cwd)
        template = "https://{0}/users/{1}/starred".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "starred repositories", template, output_file, account_cwd)

    if args.include_watched or args.include_everything:
        output_file = "{0}/watched.json".format(account_cwd)
        template = "https://{0}/users/{1}/subscriptions".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "watched repositories", template, output_file, account_cwd)

    if args.include_followers or args.include_everything:
        output_file = "{0}/followers.json".format(account_cwd)
        template = "https://{0}/users/{1}/followers".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "followers", template, output_file, account_cwd)

    if args.include_following or args.include_everything:
        output_file = "{0}/following.json".format(account_cwd)
        template = "https://{0}/users/{1}/following".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "following", template, output_file, account_cwd)


def _backup_data(args, name, template, output_file, output_directory):
    """Retrieve data from `template` and save it to `output_file` if it changed."""
    skip_existing = args.skip_existing
    if not skip_existing or not os.path.exists(output_file):
        logger.info("Retrieving {0} {1}".format(args.user, name))
        mkdir_p(output_directory)
        data = retrieve_data(args, template)

        if json_dump_if_changed(data, output_file):
            logger.info("Saved {0} {1} to disk".format(len(data), name))
        else:
            logger.info("{0} {1} unchanged, skipped write".format(len(data), name))


def json_dump(data, output_file):
    """Serialize data as pretty-printed, key-sorted JSON to an open file handle."""
    json.dump(
        data,
        output_file,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(",", ": "),
    )
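
# json_dump and json_dump_if_changed (below) use identical serializer
# settings, so comparing the freshly rendered string with the text already
# on disk is a reliable change test; the settings make output deterministic:
#
#     >>> json.dumps({"b": 1, "a": 2}, sort_keys=True, indent=4,
#     ...            separators=(",", ": "))
#     '{\n    "a": 2,\n    "b": 1\n}'
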

def json_dump_if_changed(data, output_file_path):
    """
    Write JSON data to file only if content has changed.

    Compares the serialized JSON data with the existing file content
    and only writes if different. This prevents unnecessary file
    modification timestamp updates and disk writes.

    Uses atomic writes (temp file + rename) to prevent corruption
    if the process is interrupted during the write.

    Args:
        data: The data to serialize as JSON
        output_file_path: The path to the output file

    Returns:
        True if file was written (content changed or new file)
        False if write was skipped (content unchanged)
    """
    # Serialize new data with consistent formatting matching json_dump()
    new_content = json.dumps(
        data,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(",", ": "),
    )

    # Check if file exists and compare content
    if os.path.exists(output_file_path):
        try:
            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
                existing_content = f.read()
            if existing_content == new_content:
                logger.debug(
                    "Content unchanged, skipping write: {0}".format(output_file_path)
                )
                return False
        except (OSError, UnicodeDecodeError) as e:
            # If we can't read the existing file, write the new one
            logger.debug(
                "Error reading existing file {0}, will overwrite: {1}".format(
                    output_file_path, e
                )
            )

    # Write the file atomically using temp file + rename
    temp_file = output_file_path + ".temp"
    with codecs.open(temp_file, "w", encoding="utf-8") as f:
        f.write(new_content)
    os.replace(temp_file, output_file_path)  # Atomic write
    return True

--------------------------------------------------------------------------------