├── requirements.txt
├── MANIFEST.in
├── github_backup
│   ├── __init__.py
│   ├── __main__.py
│   ├── cli.py
│   └── github_backup.py
├── tests
│   ├── __init__.py
│   ├── test_case_sensitivity.py
│   ├── test_pagination.py
│   ├── test_http_451.py
│   ├── test_all_starred.py
│   ├── test_json_dump_if_changed.py
│   ├── test_skip_assets_on.py
│   └── test_attachments.py
├── python-github-backup.code-workspace
├── pytest.ini
├── release-requirements.txt
├── .github
│   ├── dependabot.yml
│   ├── workflows
│   │   ├── tagged-release.yml
│   │   ├── test.yml
│   │   ├── lint.yml
│   │   ├── automatic-release.yml
│   │   └── docker.yml
│   ├── PULL_REQUEST.md
│   └── ISSUE_TEMPLATE
│       ├── bug.yaml
│       └── feature.yaml
├── .gitignore
├── bin
│   └── github-backup
├── Dockerfile
├── .dockerignore
├── LICENSE.txt
├── setup.py
├── .gitchangelog.rc
├── release
└── README.rst

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
include *.rst

--------------------------------------------------------------------------------
/github_backup/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.57.0"

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
"""Tests for python-github-backup."""

--------------------------------------------------------------------------------
/python-github-backup.code-workspace:
--------------------------------------------------------------------------------
{
    "folders": [
        {
            "path": "."
        }
    ]
}

--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts = -v

--------------------------------------------------------------------------------
/release-requirements.txt:
--------------------------------------------------------------------------------
# Linting & Formatting
autopep8==2.3.2
black==25.12.0
flake8==7.3.0

# Testing
pytest==9.0.2

# Release & Publishing
twine==6.2.0
gitchangelog==3.0.4
setuptools==80.9.0

# Documentation
restructuredtext-lint==2.0.2

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: daily
      time: "13:00"
    groups:
      python-packages:
        patterns:
          - "*"
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "weekly"

--------------------------------------------------------------------------------
/github_backup/__main__.py:
--------------------------------------------------------------------------------
"""Allow running as: python -m github_backup"""

import sys

from github_backup.cli import main
from github_backup.github_backup import logger

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.github/workflows/tagged-release.yml:
--------------------------------------------------------------------------------
---
name: "tagged-release"

# yamllint disable-line rule:truthy
on:
  push:
    tags:
      - '*'

jobs:
  tagged-release:
    name: tagged-release
    runs-on: ubuntu-24.04

    steps:
      - uses: "marvinpinto/action-automatic-releases@v1.2.1"
        with:
          repo_token: "${{ secrets.GITHUB_TOKEN }}"
          prerelease: false

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# Temp files
*~
~*
.*~
\#*
.#*
*#
dist

# Build files
build
dist
pkg
*.egg
*.egg-info

# Debian Files
debian/files
debian/python-github-backup*

# Sphinx build
doc/_build

# Generated man page
doc/github_backup.1

# Annoying macOS files
.DS_Store
._*

# IDE configuration files
.vscode
.atom
.idea

README

# RSA
id_rsa
id_rsa.pub

# Virtual env
venv
.venv

--------------------------------------------------------------------------------
/bin/github-backup:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""
Backwards-compatible wrapper script.

The recommended way to run github-backup is via the installed command
(pip install github-backup) or python -m github_backup.
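A typical invocation looks something like this (illustrative only; apart
from -t/--token, the flag spellings here are assumptions, so check
github-backup --help for the authoritative list):

    github-backup <username> -t <token> -o /path/to/backup --all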

This script is kept for backwards compatibility with existing installations
that may reference this path directly.
"""

import sys

from github_backup.cli import main
from github_backup.github_backup import logger

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.github/PULL_REQUEST.md:
--------------------------------------------------------------------------------
# Important notice regarding filed pull requests

This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

I will attempt to review pull requests at _my_ earliest convenience. If I am unable to get to your pull request in a timely fashion, it is what it is. This repository does not pay any bills, and I am not required to merge any pull request from any individual.

If you wish to jump my personal priority queue, you may pay me for my time to review. My rate is $200 an hour - minimum 1 hour - feel free to contact me via my github email address if you want to go this route.

--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
---
name: "test"

# yamllint disable-line rule:truthy
on:
  pull_request:
    branches:
      - "*"
  push:
    branches:
      - "main"
      - "master"

jobs:
  test:
    name: test
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      - run: pip install -r release-requirements.txt
      - run: pytest tests/ -v

--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
---
name: "lint"

# yamllint disable-line rule:truthy
on:
  pull_request:
    branches:
      - "*"
  push:
    branches:
      - "main"
      - "master"

jobs:
  lint:
    name: lint
    runs-on: ubuntu-24.04
    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      - run: pip install -r release-requirements.txt && pip install wheel
      - run: flake8 --ignore=E501,E203,W503
      - run: black .
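      # Note: "black ." rewrites files and exits 0 even when it reformats;
      # if this step is meant to fail the job on formatting drift,
      # "black --check ." (an assumption about the intent) is the usual spelling.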
      - run: rst-lint README.rst
      - run: python setup.py sdist bdist_wheel && twine check dist/*

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-alpine3.22 AS builder

RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir uv

WORKDIR /app

RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=requirements.txt,target=requirements.txt \
    --mount=type=bind,source=release-requirements.txt,target=release-requirements.txt \
    uv venv \
    && uv pip install -r release-requirements.txt

COPY . .

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install .


FROM python:3.12-alpine3.22
ENV PYTHONUNBUFFERED=1

RUN apk add --no-cache \
    ca-certificates \
    git \
    git-lfs \
    && addgroup -g 1000 appuser \
    && adduser -D -u 1000 -G appuser appuser

COPY --from=builder --chown=appuser:appuser /app /app

WORKDIR /app

USER appuser

ENV PATH="/app/.venv/bin:$PATH"

ENTRYPOINT ["github-backup"]

--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
# Docker ignore file to reduce build context size

# Temp files
*~
~*
.*~
\#*
.#*
*#
dist

# Build files
build
dist
pkg
*.egg
*.egg-info

# Debian Files
debian/files
debian/python-github-backup*

# Sphinx build
doc/_build

# Generated man page
doc/github_backup.1

# Annoying macOS files
.DS_Store
._*

# IDE configuration files
.vscode
.atom
.idea
*.code-workspace

# RSA
id_rsa
id_rsa.pub

# Virtual env
venv
.venv

# Git
.git
.gitignore
.gitchangelog.rc
.github

# Documentation
*.md
!README.md

# Environment variables files
.env
.env.*
!.env.example
*.log

# Cache files
**/__pycache__/
*.py[cod]

# Docker files
docker-compose.yml
Dockerfile*

# Other files
release
*.tar
*.zip
*.gzip

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2013 Jose Diaz-Gonzalez

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.yaml:
--------------------------------------------------------------------------------
---
name: Bug Report
description: File a bug report.
body:
  - type: markdown
    attributes:
      value: |
        # Important notice regarding filed issues

        This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

        If pull requests implementing bug fixes or enhancements are pushed, I am happy to review and merge them (time permitting).

        If you wish to have a bug fixed, you have a few options:

        - Fix it yourself and file a pull request.
        - File a bug and hope someone else fixes it for you.
        - Pay me to fix it (my rate is $200 an hour, minimum 1 hour, contact me via my [github email address](https://github.com/josegonzalez) if you want to go this route).

        In all cases, feel free to file an issue; they may be of help to others in the future.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature.yaml:
--------------------------------------------------------------------------------
---
name: Feature Request
description: File a feature request.
body:
  - type: markdown
    attributes:
      value: |
        # Important notice regarding filed issues

        This project already fills my needs, and as such I have no real reason to continue its development. This project is otherwise provided as is, and no support is given.

        If pull requests implementing bug fixes or enhancements are pushed, I am happy to review and merge them (time permitting).

        If you wish to have a feature implemented, you have a few options:

        - Implement it yourself and file a pull request.
        - File an issue and hope someone else implements it for you.
        - Pay me to implement it (my rate is $200 an hour, minimum 1 hour, contact me via my [github email address](https://github.com/josegonzalez) if you want to go this route).

        In all cases, feel free to file an issue; they may be of help to others in the future.
  - type: textarea
    id: what-would-you-like-to-happen
    attributes:
      label: What would you like to happen?
      description: Please describe in detail how the new functionality should work as well as any issues with existing functionality.
    validations:
      required: true

--------------------------------------------------------------------------------
/.github/workflows/automatic-release.yml:
--------------------------------------------------------------------------------
name: automatic-release

on:
  workflow_dispatch:
    inputs:
      release_type:
        description: Release type
        required: true
        type: choice
        options:
          - patch
          - minor
          - major

jobs:
  release:
    name: Release
    runs-on: ubuntu-24.04
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ssh-key: ${{ secrets.DEPLOY_PRIVATE_KEY }}
      - name: Setup Git
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install prerequisites
        run: pip install -r release-requirements.txt
      - name: Execute release
        env:
          SEMVER_BUMP: ${{ github.event.inputs.release_type }}
          TWINE_REPOSITORY: ${{ vars.TWINE_REPOSITORY }}
          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
        run: ./release $SEMVER_BUMP

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from github_backup import __version__

try:
    from setuptools import setup

    setup  # workaround for pyflakes issue #13
except ImportError:
    from distutils.core import setup

# Hack to prevent stupid TypeError: 'NoneType' object is not callable error on
# exit of python setup.py test # in multiprocessing/util.py _exit_function when
# running python setup.py test (see
# http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html)
try:
    import multiprocessing

    multiprocessing
except ImportError:
    pass


def open_file(fname):
    return open(os.path.join(os.path.dirname(__file__), fname))


setup(
    name="github-backup",
    version=__version__,
    author="Jose Diaz-Gonzalez",
    author_email="github-backup@josediazgonzalez.com",
    packages=["github_backup"],
    entry_points={
        "console_scripts": [
            "github-backup=github_backup.cli:main",
        ],
    },
    url="http://github.com/josegonzalez/python-github-backup",
    license="MIT",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Topic :: System :: Archiving :: Backup",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
    ],
    description="backup a github user or organization",
    long_description=open_file("README.rst").read(),
    long_description_content_type="text/x-rst",
    install_requires=open_file("requirements.txt").readlines(),
    python_requires=">=3.10",
    zip_safe=True,
)

--------------------------------------------------------------------------------
/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Create and publish a Docker image

on:
  push:
    branches:
      - 'master'
      - 'main'
      - 'dev'

    tags:
      - 'v*'
      - 'v*.*'
      - 'v*.*.*'
      - '*'
      - '*.*'
      - '*.*.*'
  pull_request:
    branches:
      - 'main'
      - 'dev'


env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  build-and-push-image:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
        with:
          persist-credentials: false

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to the Container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=semver,pattern={{major}}
            type=sha
            type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: .
          push: true
          platforms: linux/amd64,linux/arm64
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

--------------------------------------------------------------------------------
/github_backup/cli.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Command-line interface for github-backup."""

import logging
import os
import sys

from github_backup.github_backup import (
    backup_account,
    backup_repositories,
    check_git_lfs_install,
    filter_repositories,
    get_auth,
    get_authenticated_user,
    logger,
    mkdir_p,
    parse_args,
    retrieve_repositories,
)

# INFO and DEBUG go to stdout, WARNING and above go to stderr
log_format = logging.Formatter(
    fmt="%(asctime)s.%(msecs)03d: %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.DEBUG)
stdout_handler.addFilter(lambda r: r.levelno < logging.WARNING)
stdout_handler.setFormatter(log_format)

stderr_handler = logging.StreamHandler(sys.stderr)
stderr_handler.setLevel(logging.WARNING)
stderr_handler.setFormatter(log_format)

logging.basicConfig(level=logging.INFO, handlers=[stdout_handler, stderr_handler])


def main():
    """Main entry point for github-backup CLI."""
    args = parse_args()

    if args.private and not get_auth(args):
        logger.warning(
            "The --private flag has no effect without authentication. "
            "Use -t/--token, -f/--token-fine, or -u/--username to authenticate."
        )

    if args.quiet:
        logger.setLevel(logging.WARNING)

    output_directory = os.path.realpath(args.output_directory)
    if not os.path.isdir(output_directory):
        logger.info("Create output directory {0}".format(output_directory))
        mkdir_p(output_directory)

    if args.lfs_clone:
        check_git_lfs_install()

    if args.log_level:
        log_level = logging.getLevelName(args.log_level.upper())
        if isinstance(log_level, int):
            logger.root.setLevel(log_level)

    if not args.as_app:
        logger.info("Backing up user {0} to {1}".format(args.user, output_directory))
        authenticated_user = get_authenticated_user(args)
    else:
        authenticated_user = {"login": None}

    repositories = retrieve_repositories(args, authenticated_user)
    repositories = filter_repositories(args, repositories)
    backup_repositories(args, output_directory, repositories)
    backup_account(args, output_directory)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logger.error(str(e))
        sys.exit(1)

--------------------------------------------------------------------------------
/.gitchangelog.rc:
--------------------------------------------------------------------------------
#
# Format
#
#   ACTION: [AUDIENCE:] COMMIT_MSG [@TAG ...]
#
# Description
#
#   ACTION is one of 'chg', 'fix', 'new'
#
#   This is WHAT the change is about.
#
#   'chg' is for refactor, small improvement, cosmetic changes...
#   'fix' is for bug fixes
#   'new' is for new features, big improvement
#
#   AUDIENCE is optional and one of 'dev', 'usr', 'pkg', 'test', 'doc'
#
#   This is WHO is concerned by the change.
#
#   'dev'  is for developers (API changes, refactors...)
#   'usr'  is for final users (UI changes)
#   'pkg'  is for packagers (packaging changes)
#   'test' is for testers (test only related changes)
#   'doc'  is for doc guys (doc only changes)
#
#   COMMIT_MSG is ... well ... the commit message itself.
#
#   TAGs are additional adjectives such as 'refactor', 'minor', 'cosmetic'
#
#   'refactor' is obviously for refactoring code only
#   'minor' is for a very meaningless change (a typo, adding a comment)
#   'cosmetic' is for cosmetic driven change (re-indentation, 80-col...)
#
# Example:
#
#   new: usr: support of bazaar implemented
#   chg: re-indented some lines @cosmetic
#   new: dev: updated code to be compatible with last version of killer lib.
#   fix: pkg: updated year of licence coverage.
#   new: test: added a bunch of test around user usability of feature X.
#   fix: typo in spelling my name in comment. @minor
#
#   Please note that multi-line commit messages are supported, and only the
#   first line will be considered as the "summary" of the commit message. So
#   tags, and other rules only apply to the summary. The body of the commit
#   message will be displayed in the changelog with minor reformatting.

#
# ``ignore_regexps`` is a list of regexps
#
# Any commit having its full commit message matching any regexp listed here
# will be ignored and won't be reported in the changelog.
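#
# For example, with the pattern below, commits whose summaries start with
# "Merge pull request", "Merge branch", "Release", or "Update" are dropped
# from the generated changelog.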
#
ignore_regexps = [
    r'(?i)^(Merge pull request|Merge branch|Release|Update)',
]


#
# ``replace_regexps`` is a dict associating a regexp pattern and its replacement
#
# It will be applied to get the summary line from the full commit message.
#
# Note that you can provide multiple replacement patterns, they will all be
# tried. If none matches, the summary line will be the full commit message.
#
replace_regexps = {
    # current format (ie: 'chg: dev: my commit msg @tag1 @tag2')

    r'^([cC]hg|[fF]ix|[nN]ew)\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n@]*)(@[a-z]+\s+)*$':
        r'\4',
}


# ``section_regexps`` is a list of 2-tuples associating a string label and a
# list of regexp
#
# Commit messages will be classified in sections thanks to this. Section
# titles are the label, and a commit is classified under this section if any
# of the regexps associated is matching.
#
section_regexps = [
    ('New', [
        r'^[nN]ew\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Changes', [
        r'^[cC]hg\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Fix', [
        r'^[fF]ix\s*:\s*((dev|use?r|pkg|test|doc)\s*:\s*)?([^\n]*)$',
    ]),
    ('Other', None  # Match all lines
     ),

]

# ``body_split_regexp`` is a regexp
#
# Commit message body (not the summary) if existing will be split
# (new line) on this regexp
#
body_split_regexp = r'[\n-]'


# ``tag_filter_regexp`` is a regexp
#
# Tags that will be used for the changelog must match this regexp.
#
# tag_filter_regexp = r'^[0-9]+$'
tag_filter_regexp = r'^(?:[vV])?[0-9\.]+$'


# ``unreleased_version_label`` is a string
#
# This label will be used as the changelog Title of the last set of changes
# between last valid tag and HEAD if any.
unreleased_version_label = "%%version%% (unreleased)"

--------------------------------------------------------------------------------
/tests/test_case_sensitivity.py:
--------------------------------------------------------------------------------
"""Tests for case-insensitive username/organization filtering."""

import pytest
from unittest.mock import Mock

from github_backup import github_backup


class TestCaseSensitivity:
    """Test suite for case-insensitive username matching in filter_repositories."""

    def test_filter_repositories_case_insensitive_user(self):
        """Should filter repositories case-insensitively for usernames.

        Reproduces issue #198 where typing 'iamrodos' fails to match
        repositories with owner.login='Iamrodos' (the canonical case from GitHub API).
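
        The fix under test is assumed to lowercase both sides of the
        comparison, e.g. repo["owner"]["login"].lower() == args.user.lower().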
        """
        # Simulate user typing lowercase username
        args = Mock()
        args.user = "iamrodos"  # lowercase (what user typed)
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        # Simulate GitHub API returning canonical case
        repos = [
            {
                "name": "repo1",
                "owner": {"login": "Iamrodos"},  # Capital I (canonical from API)
                "private": False,
                "fork": False,
            },
            {
                "name": "repo2",
                "owner": {"login": "Iamrodos"},
                "private": False,
                "fork": False,
            },
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match despite case difference
        assert len(filtered) == 2
        assert filtered[0]["name"] == "repo1"
        assert filtered[1]["name"] == "repo2"

    def test_filter_repositories_case_insensitive_org(self):
        """Should filter repositories case-insensitively for organizations.

        Tests the example from issue #198 where 'prai-org' doesn't match 'PRAI-Org'.
        """
        args = Mock()
        args.user = "prai-org"  # lowercase (what user typed)
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        repos = [
            {
                "name": "repo1",
                "owner": {"login": "PRAI-Org"},  # Different case (canonical from API)
                "private": False,
                "fork": False,
            },
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match despite case difference
        assert len(filtered) == 1
        assert filtered[0]["name"] == "repo1"

    def test_filter_repositories_case_variations(self):
        """Should handle various case combinations correctly."""
        args = Mock()
        args.user = "TeSt-UsEr"  # Mixed case
        args.repository = None
        args.name_regex = None
        args.languages = None
        args.exclude = None
        args.fork = False
        args.private = False
        args.public = False
        args.all = True

        repos = [
            {"name": "repo1", "owner": {"login": "test-user"}, "private": False, "fork": False},
            {"name": "repo2", "owner": {"login": "TEST-USER"}, "private": False, "fork": False},
            {"name": "repo3", "owner": {"login": "TeSt-UsEr"}, "private": False, "fork": False},
            {"name": "repo4", "owner": {"login": "other-user"}, "private": False, "fork": False},
        ]

        filtered = github_backup.filter_repositories(args, repos)

        # Should match first 3 (all case variations of same user)
        assert len(filtered) == 3
        assert set(r["name"] for r in filtered) == {"repo1", "repo2", "repo3"}


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/release:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
set -eo pipefail
[[ $RELEASE_TRACE ]] && set -x

if [[ ! -f setup.py ]]; then
    echo -e "${RED}WARNING: Missing setup.py${COLOR_OFF}\n"
    exit 1
fi

PACKAGE_NAME="$(cat setup.py | grep 'name="' | head -n1 | cut -d '"' -f2)"
INIT_PACKAGE_NAME="$(echo "${PACKAGE_NAME//-/_}")"
PUBLIC="true"

# Colors
COLOR_OFF="\033[0m"  # unsets color to term fg color
RED="\033[0;31m"     # red
GREEN="\033[0;32m"   # green
YELLOW="\033[0;33m"  # yellow
MAGENTA="\033[0;35m" # magenta
CYAN="\033[0;36m"    # cyan

# ensure wheel is available
pip install wheel >/dev/null

command -v gitchangelog >/dev/null 2>&1 || {
    echo -e "${RED}WARNING: Missing gitchangelog binary, please run: pip install gitchangelog==3.0.4${COLOR_OFF}\n"
    exit 1
}

command -v rst-lint >/dev/null || {
    echo -e "${RED}WARNING: Missing rst-lint binary, please run: pip install restructuredtext_lint${COLOR_OFF}\n"
    exit 1
}

command -v twine >/dev/null || {
    echo -e "${RED}WARNING: Missing twine binary, please run: pip install twine==3.2.0${COLOR_OFF}\n"
    exit 1
}

if [[ "$@" != "major" ]] && [[ "$@" != "minor" ]] && [[ "$@" != "patch" ]]; then
    echo -e "${RED}WARNING: Invalid release type, must specify 'major', 'minor', or 'patch'${COLOR_OFF}\n"
    exit 1
fi

echo -e "\n${GREEN}STARTING RELEASE PROCESS${COLOR_OFF}\n"

set +e
git status | grep -Eo "working (directory|tree) clean" &>/dev/null
if [ ! $? -eq 0 ]; then # working directory is NOT clean
    echo -e "${RED}WARNING: You have uncommitted changes, you may have forgotten something${COLOR_OFF}\n"
    exit 1
fi
set -e

echo -e "${YELLOW}--->${COLOR_OFF} Updating local copy"
git pull -q origin master

echo -e "${YELLOW}--->${COLOR_OFF} Retrieving release versions"

current_version=$(cat ${INIT_PACKAGE_NAME}/__init__.py | grep '__version__ =' | sed 's/[^0-9.]//g')
major=$(echo $current_version | awk '{split($0,a,"."); print a[1]}')
minor=$(echo $current_version | awk '{split($0,a,"."); print a[2]}')
patch=$(echo $current_version | awk '{split($0,a,"."); print a[3]}')

if [[ "$@" == "major" ]]; then
    major=$(($major + 1))
    minor="0"
    patch="0"
elif [[ "$@" == "minor" ]]; then
    minor=$(($minor + 1))
    patch="0"
elif [[ "$@" == "patch" ]]; then
    patch=$(($patch + 1))
fi

next_version="${major}.${minor}.${patch}"

echo -e "${YELLOW}  >${COLOR_OFF} ${MAGENTA}${current_version}${COLOR_OFF} -> ${MAGENTA}${next_version}${COLOR_OFF}"

echo -e "${YELLOW}--->${COLOR_OFF} Ensuring readme passes lint checks (if this fails, run rst-lint)"
rst-lint README.rst || exit 1

echo -e "${YELLOW}--->${COLOR_OFF} Creating necessary temp file"
tempfoo=$(basename $0)
TMPFILE=$(mktemp /tmp/${tempfoo}.XXXXXX) || {
    echo -e "${RED}WARNING: Cannot create temp file using mktemp in /tmp dir ${COLOR_OFF}\n"
    exit 1
}

find_this="__version__ = \"$current_version\""
replace_with="__version__ = \"$next_version\""

echo -e "${YELLOW}--->${COLOR_OFF} Updating ${INIT_PACKAGE_NAME}/__init__.py"
sed "s/$find_this/$replace_with/" ${INIT_PACKAGE_NAME}/__init__.py >$TMPFILE && mv $TMPFILE ${INIT_PACKAGE_NAME}/__init__.py

if [ -f docs/conf.py ]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Updating docs"
    find_this="version = '${current_version}'"
    replace_with="version = '${next_version}'"
    sed "s/$find_this/$replace_with/" docs/conf.py >$TMPFILE && mv $TMPFILE docs/conf.py

    find_this="release = '${current_version}'"
    replace_with="release = '${next_version}'"
    sed "s/$find_this/$replace_with/" docs/conf.py >$TMPFILE && mv $TMPFILE docs/conf.py
fi

echo -e "${YELLOW}--->${COLOR_OFF} Updating CHANGES.rst for new release"
version_header="$next_version ($(date +%F))"
set +e
dashes=$(yes '-' | head -n ${#version_header} | tr -d '\n')
set -e
gitchangelog | sed "4s/.*/$version_header/" | sed "5s/.*/$dashes/" >$TMPFILE && mv $TMPFILE CHANGES.rst

echo -e "${YELLOW}--->${COLOR_OFF} Adding changed files to git"
git add CHANGES.rst README.rst ${INIT_PACKAGE_NAME}/__init__.py
if [ -f docs/conf.py ]; then git add docs/conf.py; fi

echo -e "${YELLOW}--->${COLOR_OFF} Creating release"
git commit -q -m "Release version $next_version"

if [[ "$PUBLIC" == "true" ]]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Creating python release files"
    cp README.rst README
    python setup.py sdist bdist_wheel >/dev/null

    echo -e "${YELLOW}--->${COLOR_OFF} Validating long_description"
    twine check dist/*
fi

echo -e "${YELLOW}--->${COLOR_OFF} Tagging release"
git tag -a $next_version -m "Release version $next_version"

echo -e "${YELLOW}--->${COLOR_OFF} Pushing release and tags to github"
git push -q origin master && git push -q --tags

if [[ "$PUBLIC" == "true" ]]; then
    echo -e "${YELLOW}--->${COLOR_OFF} Uploading python release"
    twine upload dist/*
    rm README
fi

echo -e "\n${CYAN}RELEASED VERSION ${next_version}!${COLOR_OFF}\n"

--------------------------------------------------------------------------------
/tests/test_pagination.py:
--------------------------------------------------------------------------------
"""Tests for Link header pagination handling."""

import json
from unittest.mock import Mock, patch

import pytest

from github_backup import github_backup


class MockHTTPResponse:
    """Mock HTTP response for paginated API calls."""

    def __init__(self, data, link_header=None):
        self._content = json.dumps(data).encode("utf-8")
        self._link_header = link_header
        self._read = False
        self.reason = "OK"

    def getcode(self):
        return 200

    def read(self):
        if self._read:
            return b""
        self._read = True
        return self._content

    def get_header(self, name, default=None):
        """Mock method for headers.get()."""
        return self.headers.get(name, default)

    @property
    def headers(self):
        headers = {"x-ratelimit-remaining": "5000"}
        if self._link_header:
            headers["Link"] = self._link_header
        return headers


@pytest.fixture
def mock_args():
    """Mock args for retrieve_data_gen."""
    args = Mock()
    args.as_app = False
    args.token_fine = None
    args.token_classic = "fake_token"
    args.username = None
    args.password = None
    args.osx_keychain_item_name = None
    args.osx_keychain_item_account = None
    args.throttle_limit = None
    args.throttle_pause = 0
    return args


def test_cursor_based_pagination(mock_args):
    """Link header with 'after' cursor parameter works correctly."""

    # Simulate issues endpoint behavior: returns cursor in Link header
    responses = [
        # Issues endpoint returns 'after' cursor parameter (not 'page')
        MockHTTPResponse(
            data=[{"issue": i} for i in range(1, 101)],  # Page 1 contents
            link_header='<https://api.github.com/repos/owner/repo/issues?after=ABC123>; rel="next"',
        ),
        MockHTTPResponse(
            data=[{"issue": i} for i in range(101, 151)],  # Page 2 contents
            link_header=None,  # No Link header - signals end of pagination
        ),
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        url = request.get_full_url()
        requests_made.append(url)
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/issues"
            )
        )

    # Verify all items retrieved and cursor was used in second request
    assert len(results) == 150
    assert len(requests_made) == 2
    assert "after=ABC123" in requests_made[1]


def test_page_based_pagination(mock_args):
    """Link header with 'page' parameter works correctly."""

    # Simulate pulls/repos endpoint behavior: returns page numbers in Link header
    responses = [
        # Pulls endpoint uses traditional 'page' parameter (not cursor)
        MockHTTPResponse(
            data=[{"pull": i} for i in range(1, 101)],  # Page 1 contents
            link_header='<https://api.github.com/repos/owner/repo/pulls?page=2>; rel="next"',
        ),
        MockHTTPResponse(
            data=[{"pull": i} for i in range(101, 181)],  # Page 2 contents
            link_header=None,  # No Link header - signals end of pagination
        ),
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        url = request.get_full_url()
        requests_made.append(url)
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/pulls"
            )
        )

    # Verify all items retrieved and page parameter was used (not cursor)
    assert len(results) == 180
    assert len(requests_made) == 2
    assert "page=2" in requests_made[1]
    assert "after" not in requests_made[1]


def test_no_link_header_stops_pagination(mock_args):
    """Pagination stops when Link header is absent."""

    # Simulate endpoint with results that fit in a single page
    responses = [
        MockHTTPResponse(
            data=[{"label": i} for i in range(1, 51)],  # Page contents
            link_header=None,  # No Link header - signals end of pagination
        )
    ]
    requests_made = []

    def mock_urlopen(request, *args, **kwargs):
        requests_made.append(request.get_full_url())
        return responses[len(requests_made) - 1]

    with patch("github_backup.github_backup.urlopen", side_effect=mock_urlopen):
        results = list(
            github_backup.retrieve_data_gen(
                mock_args, "https://api.github.com/repos/owner/repo/labels"
            )
        )

    # Verify pagination stopped after first request
    assert len(results) == 50
    assert len(requests_made) == 1

--------------------------------------------------------------------------------
/tests/test_http_451.py:
--------------------------------------------------------------------------------
"""Tests for HTTP 451 (DMCA takedown) handling."""

import json
from unittest.mock import Mock, patch

import pytest

from github_backup import github_backup


class TestHTTP451Exception:
    """Test suite for HTTP 451 DMCA takedown exception handling."""

    def test_repository_unavailable_error_raised(self):
        """HTTP 451 should raise RepositoryUnavailableError with DMCA URL."""
        # Create mock args
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        # Mock HTTPError 451 response
        mock_response = Mock()
        mock_response.getcode.return_value = 451

        dmca_data = {
            "message": "Repository access blocked",
            "block": {
                "reason": "dmca",
                "created_at": "2024-11-12T14:38:04Z",
                "html_url": "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
            }
        }
        mock_response.read.return_value = json.dumps(dmca_data).encode("utf-8")
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

        # Check exception has DMCA URL
        assert exc_info.value.dmca_url == "https://github.com/github/dmca/blob/master/2024/11/2024-11-04-source-code.md"
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_without_dmca_url(self):
        """HTTP 451 without DMCA details should still raise exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 451
        mock_response.read.return_value = b'{"message": "Blocked"}'
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

        # Exception raised even without DMCA URL
        assert exc_info.value.dmca_url is None
        assert "451" in str(exc_info.value)

    def test_repository_unavailable_error_with_malformed_json(self):
        """HTTP 451 with malformed JSON should still raise exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 451
        mock_response.read.return_value = b"invalid json {"
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Unavailable For Legal Reasons"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            with pytest.raises(github_backup.RepositoryUnavailableError):
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/dmca/issues"))

    def test_other_http_errors_unchanged(self):
        """Other HTTP errors should still raise generic Exception."""
        args = Mock()
        args.as_app = False
        args.token_fine = None
        args.token_classic = None
        args.username = None
        args.password = None
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.throttle_limit = None
        args.throttle_pause = 0

        mock_response = Mock()
        mock_response.getcode.return_value = 404
        mock_response.read.return_value = b'{"message": "Not Found"}'
        mock_response.headers = {"x-ratelimit-remaining": "5000"}
        mock_response.reason = "Not Found"

        def mock_get_response(request, auth, template):
            return mock_response, []

        with patch("github_backup.github_backup._get_response", side_effect=mock_get_response):
            # Should raise generic Exception, not RepositoryUnavailableError
            with pytest.raises(Exception) as exc_info:
                list(github_backup.retrieve_data_gen(args, "https://api.github.com/repos/test/notfound/issues"))

        assert not isinstance(exc_info.value, github_backup.RepositoryUnavailableError)
        assert "404" in str(exc_info.value)


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_all_starred.py:
--------------------------------------------------------------------------------
"""Tests for --all-starred flag behavior (issue #225)."""

import pytest
from unittest.mock import Mock, patch

from github_backup import github_backup


class TestAllStarredCloning:
    """Test suite for --all-starred repository cloning behavior.

    Issue #225: --all-starred should clone starred repos without requiring --repositories.
    """

    def _create_mock_args(self, **overrides):
        """Create a mock args object with sensible defaults."""
        args = Mock()
        args.user = "testuser"
        args.output_directory = "/tmp/backup"
        args.include_repository = False
        args.include_everything = False
        args.include_gists = False
        args.include_starred_gists = False
        args.all_starred = False
        args.skip_existing = False
        args.bare_clone = False
        args.lfs_clone = False
        args.no_prune = False
        args.include_wiki = False
        args.include_issues = False
        args.include_issue_comments = False
        args.include_issue_events = False
        args.include_pulls = False
        args.include_pull_comments = False
        args.include_pull_commits = False
        args.include_pull_details = False
        args.include_labels = False
        args.include_hooks = False
        args.include_milestones = False
        args.include_releases = False
        args.include_assets = False
        args.include_attachments = False
        args.incremental = False
        args.incremental_by_files = False
        args.github_host = None
        args.prefer_ssh = False
        args.token_classic = None
        args.token_fine = None
        args.username = None
        args.password = None
        args.as_app = False
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None

        for key, value in overrides.items():
            setattr(args, key, value)

        return args

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_all_starred_clones_without_repositories_flag(self, mock_get_url, mock_fetch):
        """--all-starred should clone starred repos without --repositories flag.

        This is the core fix for issue #225.
        """
        args = self._create_mock_args(all_starred=True)
        mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git"

        # A starred repository (is_starred flag set by retrieve_repositories)
        starred_repo = {
            "name": "awesome-project",
            "full_name": "otheruser/awesome-project",
            "owner": {"login": "otheruser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            "is_starred": True,  # This flag is set for starred repos
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [starred_repo])

        # fetch_repository should be called for the starred repo
        assert mock_fetch.called, "--all-starred should trigger repository cloning"
        mock_fetch.assert_called_once()
        call_args = mock_fetch.call_args
        assert call_args[0][0] == "awesome-project"  # repo name

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_starred_repo_not_cloned_without_all_starred_flag(self, mock_get_url, mock_fetch):
        """Starred repos should NOT be cloned if --all-starred is not set."""
        args = self._create_mock_args(all_starred=False)
        mock_get_url.return_value = "https://github.com/otheruser/awesome-project.git"

        starred_repo = {
            "name": "awesome-project",
            "full_name": "otheruser/awesome-project",
            "owner": {"login": "otheruser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            "is_starred": True,
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [starred_repo])

        # fetch_repository should NOT be called
        assert not mock_fetch.called, "Starred repos should not be cloned without --all-starred"

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_non_starred_repo_not_cloned_with_only_all_starred(self, mock_get_url, mock_fetch):
        """Non-starred repos should NOT be cloned when only --all-starred is set."""
        args = self._create_mock_args(all_starred=True)
        mock_get_url.return_value = "https://github.com/testuser/my-project.git"

        # A regular (non-starred) repository
        regular_repo = {
            "name": "my-project",
            "full_name": "testuser/my-project",
            "owner": {"login": "testuser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
            # No is_starred flag
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [regular_repo])

        # fetch_repository should NOT be called for non-starred repos
        assert not mock_fetch.called, "Non-starred repos should not be cloned with only --all-starred"

    @patch('github_backup.github_backup.fetch_repository')
    @patch('github_backup.github_backup.get_github_repo_url')
    def test_repositories_flag_still_works(self, mock_get_url, mock_fetch):
        """--repositories flag should still clone repos as before."""
        args = self._create_mock_args(include_repository=True)
        mock_get_url.return_value = "https://github.com/testuser/my-project.git"

        regular_repo = {
            "name": "my-project",
            "full_name": "testuser/my-project",
            "owner": {"login": "testuser"},
            "private": False,
            "fork": False,
            "has_wiki": False,
        }

        with patch('github_backup.github_backup.mkdir_p'):
            github_backup.backup_repositories(args, "/tmp/backup", [regular_repo])

        # fetch_repository should be called
        assert mock_fetch.called, "--repositories should trigger repository cloning"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_json_dump_if_changed.py:
--------------------------------------------------------------------------------
"""Tests for json_dump_if_changed functionality."""

import codecs
import json
import os
import tempfile

import pytest

from github_backup import github_backup


class TestJsonDumpIfChanged:
    """Test suite for json_dump_if_changed function."""

    def test_writes_new_file(self):
        """Should write file when it doesn't exist."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Verify content matches expected format
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()
            loaded = json.loads(content)
            assert loaded == test_data

    def test_skips_unchanged_file(self):
        """Should skip write when content is identical."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value", "number": 42}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result1 is True

            # Get the initial mtime
            mtime1 = os.path.getmtime(output_file)

            # Second write with same data
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

            # File should not have been modified
            mtime2 = os.path.getmtime(output_file)
            assert mtime1 == mtime2

    def test_writes_when_content_changed(self):
        """Should write file when content has changed."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data1 = {"key": "value1"}
            test_data2 = {"key": "value2"}

            # First write
            result1 = github_backup.json_dump_if_changed(test_data1, output_file)
            assert result1 is True

            # Second write with different data
            result2 = github_backup.json_dump_if_changed(test_data2, output_file)
            assert result2 is True

            # Verify new content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data2

    def test_uses_consistent_formatting(self):
        """Should use same JSON formatting as json_dump."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"z": "last", "a": "first", "m": "middle"}

            github_backup.json_dump_if_changed(test_data, output_file)

            with codecs.open(output_file, "r", encoding="utf-8") as f:
                content = f.read()

            # Check for consistent formatting:
            # - sorted keys
            # - 4-space indent
            # - comma-colon-space separator
            expected = json.dumps(
                test_data,
                ensure_ascii=False,
                sort_keys=True,
                indent=4,
                separators=(",", ": "),
            )
            assert content == expected

    def test_atomic_write_always_used(self):
        """Should always use temp file and rename for atomic writes."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            result = github_backup.json_dump_if_changed(test_data, output_file)

            assert result is True
            assert os.path.exists(output_file)

            # Temp file should not exist after atomic write
            temp_file = output_file + ".temp"
            assert not os.path.exists(temp_file)

            # Verify content
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_handles_unicode_content(self):
        """Should correctly handle Unicode content."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "emoji": "🚀",
                "chinese": "你好",
                "arabic": "مرحبا",
                "cyrillic": "Привет",
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify Unicode is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

            # Second write should skip
            result2 = github_backup.json_dump_if_changed(test_data, output_file)
            assert result2 is False

    def test_handles_complex_nested_data(self):
        """Should handle complex nested data structures."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {
                "users": [
                    {"id": 1, "name": "Alice", "tags": ["admin", "user"]},
                    {"id": 2, "name": "Bob", "tags": ["user"]},
                ],
                "metadata": {"version": "1.0", "nested": {"deep": {"value": 42}}},
            }

            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify structure is preserved
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_overwrites_on_unicode_decode_error(self):
        """Should overwrite if existing file has invalid UTF-8."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")
            test_data = {"key": "value"}

            # Write invalid UTF-8 bytes
            with open(output_file, "wb") as f:
                f.write(b"\xff\xfe invalid utf-8")

            # Should catch UnicodeDecodeError and overwrite
            result = github_backup.json_dump_if_changed(test_data, output_file)
            assert result is True

            # Verify new content was written
            with codecs.open(output_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            assert loaded == test_data

    def test_key_order_independence(self):
        """Should treat differently-ordered dicts as same if keys/values match."""
        with tempfile.TemporaryDirectory() as tmpdir:
            output_file = os.path.join(tmpdir, "test.json")

            # Write first dict
            data1 = {"z": 1, "a": 2, "m": 3}
            github_backup.json_dump_if_changed(data1, output_file)

            # Try to write same data but different order
            data2 = {"a": 2, "m": 3, "z": 1}
            result = github_backup.json_dump_if_changed(data2, output_file)

            # Should skip because content is the same (keys are sorted)
            assert result is False


if __name__ == "__main__":
    pytest.main([__file__, "-v"])

--------------------------------------------------------------------------------
/tests/test_skip_assets_on.py:
--------------------------------------------------------------------------------
"""Tests for --skip-assets-on flag behavior (issue #135)."""

import pytest
from unittest.mock import Mock, patch

from github_backup import github_backup


class TestSkipAssetsOn:
    """Test suite for --skip-assets-on flag.

    Issue #135: Allow skipping asset downloads for specific repositories
    while still backing up release metadata.
    """

    def _create_mock_args(self, **overrides):
        """Create a mock args object with sensible defaults."""
        args = Mock()
        args.user = "testuser"
        args.output_directory = "/tmp/backup"
        args.include_repository = False
        args.include_everything = False
        args.include_gists = False
        args.include_starred_gists = False
        args.all_starred = False
        args.skip_existing = False
        args.bare_clone = False
        args.lfs_clone = False
        args.no_prune = False
        args.include_wiki = False
        args.include_issues = False
        args.include_issue_comments = False
        args.include_issue_events = False
        args.include_pulls = False
        args.include_pull_comments = False
        args.include_pull_commits = False
        args.include_pull_details = False
        args.include_labels = False
        args.include_hooks = False
        args.include_milestones = False
        args.include_releases = True
        args.include_assets = True
        args.skip_assets_on = []
        args.include_attachments = False
        args.incremental = False
        args.incremental_by_files = False
        args.github_host = None
        args.prefer_ssh = False
        args.token_classic = "test-token"
        args.token_fine = None
        args.username = None
        args.password = None
        args.as_app = False
        args.osx_keychain_item_name = None
        args.osx_keychain_item_account = None
        args.skip_prerelease = False
        args.number_of_latest_releases = None

        for key, value in overrides.items():
            setattr(args, key, value)

        return args

    def _create_mock_repository(self, name="test-repo", owner="testuser"):
        """Create a mock repository object."""
        return {
            "name": name,
            "full_name": f"{owner}/{name}",
            "owner": {"login": owner},
            "private": False,
            "fork": False,
            "has_wiki": False,
        }

    def _create_mock_release(self, tag="v1.0.0"):
        """Create a mock release object."""
        return {
            "tag_name": tag,
            "name": tag,
            "prerelease": False,
            "draft": False,
            "assets_url": f"https://api.github.com/repos/testuser/test-repo/releases/{tag}/assets",
        }

    def _create_mock_asset(self, name="asset.zip"):
        """Create a mock asset object."""
        return {
            "name": name,
            "url": f"https://api.github.com/repos/testuser/test-repo/releases/assets/{name}",
        }


class TestSkipAssetsOnArgumentParsing(TestSkipAssetsOn):
    """Tests for --skip-assets-on argument parsing."""

    def test_skip_assets_on_not_set_defaults_to_none(self):
        """When --skip-assets-on is not specified, it should default to None."""
        args = github_backup.parse_args(["testuser"])
        assert args.skip_assets_on is None

    def test_skip_assets_on_single_repo(self):
        """Single --skip-assets-on should create list with one item."""
        args = github_backup.parse_args(["testuser", "--skip-assets-on", "big-repo"])
        assert args.skip_assets_on == ["big-repo"]

    def test_skip_assets_on_multiple_repos(self):
        """Multiple repos can be specified space-separated (like --exclude)."""
        args = github_backup.parse_args(
            [
                "testuser",
                "--skip-assets-on",
                "big-repo",
                "another-repo",
                "owner/third-repo",
            ]
        )
        assert args.skip_assets_on == ["big-repo", "another-repo", "owner/third-repo"]


class TestSkipAssetsOnBehavior(TestSkipAssetsOn):
    """Tests for --skip-assets-on behavior in backup_releases."""

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_downloaded_when_not_skipped(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be downloaded when repo is not in skip list."""
        args = self._create_mock_args(skip_assets_on=[])
        repository = self._create_mock_repository(name="normal-repo")
        release = self._create_mock_release()
        asset = self._create_mock_asset()

        mock_json_dump.return_value = True
        mock_retrieve.side_effect = [
            [release],  # First call: get releases
            [asset],  # Second call: get assets
        ]

        with patch("os.path.join", side_effect=lambda *args: "/".join(args)):
            github_backup.backup_releases(
                args,
                "/tmp/backup/repositories/normal-repo",
                repository,
                "https://api.github.com/repos/{owner}/{repo}",
                include_assets=True,
            )

        # download_file should have been called for the asset
        mock_download.assert_called_once()

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_skipped_when_repo_name_matches(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be skipped when repo name is in skip list."""
        args = self._create_mock_args(skip_assets_on=["big-repo"])
        repository = self._create_mock_repository(name="big-repo")
        release = self._create_mock_release()

        mock_json_dump.return_value = True
        mock_retrieve.return_value = [release]

        github_backup.backup_releases(
            args,
            "/tmp/backup/repositories/big-repo",
            repository,
            "https://api.github.com/repos/{owner}/{repo}",
            include_assets=True,
        )

        # download_file should NOT have been called
        mock_download.assert_not_called()

    @patch("github_backup.github_backup.download_file")
    @patch("github_backup.github_backup.retrieve_data")
    @patch("github_backup.github_backup.mkdir_p")
    @patch("github_backup.github_backup.json_dump_if_changed")
    def test_assets_skipped_when_full_name_matches(
        self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download
    ):
        """Assets should be skipped when owner/repo format matches."""
        args = self._create_mock_args(skip_assets_on=["otheruser/big-repo"])
        repository =
self._create_mock_repository(name="big-repo", owner="otheruser") 190 | release = self._create_mock_release() 191 | 192 | mock_json_dump.return_value = True 193 | mock_retrieve.return_value = [release] 194 | 195 | github_backup.backup_releases( 196 | args, 197 | "/tmp/backup/repositories/big-repo", 198 | repository, 199 | "https://api.github.com/repos/{owner}/{repo}", 200 | include_assets=True, 201 | ) 202 | 203 | # download_file should NOT have been called 204 | mock_download.assert_not_called() 205 | 206 | @patch("github_backup.github_backup.download_file") 207 | @patch("github_backup.github_backup.retrieve_data") 208 | @patch("github_backup.github_backup.mkdir_p") 209 | @patch("github_backup.github_backup.json_dump_if_changed") 210 | def test_case_insensitive_matching( 211 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 212 | ): 213 | """Skip matching should be case-insensitive.""" 214 | # User types uppercase, repo name is lowercase 215 | args = self._create_mock_args(skip_assets_on=["BIG-REPO"]) 216 | repository = self._create_mock_repository(name="big-repo") 217 | release = self._create_mock_release() 218 | 219 | mock_json_dump.return_value = True 220 | mock_retrieve.return_value = [release] 221 | 222 | github_backup.backup_releases( 223 | args, 224 | "/tmp/backup/repositories/big-repo", 225 | repository, 226 | "https://api.github.com/repos/{owner}/{repo}", 227 | include_assets=True, 228 | ) 229 | 230 | # download_file should NOT have been called (case-insensitive match) 231 | assert not mock_download.called 232 | 233 | @patch("github_backup.github_backup.download_file") 234 | @patch("github_backup.github_backup.retrieve_data") 235 | @patch("github_backup.github_backup.mkdir_p") 236 | @patch("github_backup.github_backup.json_dump_if_changed") 237 | def test_multiple_skip_repos( 238 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 239 | ): 240 | """Multiple repos in skip list should all be skipped.""" 241 | args = self._create_mock_args(skip_assets_on=["repo1", "repo2", "repo3"]) 242 | repository = self._create_mock_repository(name="repo2") 243 | release = self._create_mock_release() 244 | 245 | mock_json_dump.return_value = True 246 | mock_retrieve.return_value = [release] 247 | 248 | github_backup.backup_releases( 249 | args, 250 | "/tmp/backup/repositories/repo2", 251 | repository, 252 | "https://api.github.com/repos/{owner}/{repo}", 253 | include_assets=True, 254 | ) 255 | 256 | # download_file should NOT have been called 257 | mock_download.assert_not_called() 258 | 259 | @patch("github_backup.github_backup.download_file") 260 | @patch("github_backup.github_backup.retrieve_data") 261 | @patch("github_backup.github_backup.mkdir_p") 262 | @patch("github_backup.github_backup.json_dump_if_changed") 263 | def test_release_metadata_still_saved_when_assets_skipped( 264 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 265 | ): 266 | """Release JSON should still be saved even when assets are skipped.""" 267 | args = self._create_mock_args(skip_assets_on=["big-repo"]) 268 | repository = self._create_mock_repository(name="big-repo") 269 | release = self._create_mock_release() 270 | 271 | mock_json_dump.return_value = True 272 | mock_retrieve.return_value = [release] 273 | 274 | github_backup.backup_releases( 275 | args, 276 | "/tmp/backup/repositories/big-repo", 277 | repository, 278 | "https://api.github.com/repos/{owner}/{repo}", 279 | include_assets=True, 280 | ) 281 | 282 | # json_dump_if_changed should have been called for release 
metadata 283 | mock_json_dump.assert_called_once() 284 | # But download_file should NOT have been called 285 | mock_download.assert_not_called() 286 | 287 | @patch("github_backup.github_backup.download_file") 288 | @patch("github_backup.github_backup.retrieve_data") 289 | @patch("github_backup.github_backup.mkdir_p") 290 | @patch("github_backup.github_backup.json_dump_if_changed") 291 | def test_non_matching_repo_still_downloads_assets( 292 | self, mock_json_dump, mock_mkdir, mock_retrieve, mock_download 293 | ): 294 | """Repos not in skip list should still download assets.""" 295 | args = self._create_mock_args(skip_assets_on=["other-repo"]) 296 | repository = self._create_mock_repository(name="normal-repo") 297 | release = self._create_mock_release() 298 | asset = self._create_mock_asset() 299 | 300 | mock_json_dump.return_value = True 301 | mock_retrieve.side_effect = [ 302 | [release], # First call: get releases 303 | [asset], # Second call: get assets 304 | ] 305 | 306 | with patch("os.path.join", side_effect=lambda *args: "/".join(args)): 307 | github_backup.backup_releases( 308 | args, 309 | "/tmp/backup/repositories/normal-repo", 310 | repository, 311 | "https://api.github.com/repos/{owner}/{repo}", 312 | include_assets=True, 313 | ) 314 | 315 | # download_file SHOULD have been called 316 | mock_download.assert_called_once() 317 | 318 | 319 | if __name__ == "__main__": 320 | pytest.main([__file__, "-v"]) 321 | -------------------------------------------------------------------------------- /tests/test_attachments.py: -------------------------------------------------------------------------------- 1 | """Behavioral tests for attachment functionality.""" 2 | 3 | import json 4 | import os 5 | import tempfile 6 | from pathlib import Path 7 | from unittest.mock import Mock 8 | 9 | import pytest 10 | 11 | from github_backup import github_backup 12 | 13 | 14 | @pytest.fixture 15 | def attachment_test_setup(tmp_path): 16 | """Fixture providing setup and helper for attachment download tests.""" 17 | from unittest.mock import patch 18 | 19 | issue_cwd = tmp_path / "issues" 20 | issue_cwd.mkdir() 21 | 22 | # Mock args 23 | args = Mock() 24 | args.as_app = False 25 | args.token_fine = None 26 | args.token_classic = None 27 | args.username = None 28 | args.password = None 29 | args.osx_keychain_item_name = None 30 | args.osx_keychain_item_account = None 31 | args.user = "testuser" 32 | args.repository = "testrepo" 33 | 34 | repository = {"full_name": "testuser/testrepo"} 35 | 36 | def call_download(issue_data, issue_number=123): 37 | """Call download_attachments with mocked HTTP downloads. 38 | 39 | Returns list of URLs that were actually downloaded. 
40 | """ 41 | downloaded_urls = [] 42 | 43 | def mock_download(url, path, auth, as_app, fine): 44 | downloaded_urls.append(url) 45 | return { 46 | "success": True, 47 | "saved_as": os.path.basename(path), 48 | "url": url, 49 | } 50 | 51 | with patch( 52 | "github_backup.github_backup.download_attachment_file", 53 | side_effect=mock_download, 54 | ): 55 | github_backup.download_attachments( 56 | args, str(issue_cwd), issue_data, issue_number, repository 57 | ) 58 | 59 | return downloaded_urls 60 | 61 | return { 62 | "issue_cwd": str(issue_cwd), 63 | "args": args, 64 | "repository": repository, 65 | "call_download": call_download, 66 | } 67 | 68 | 69 | class TestURLExtraction: 70 | """Test URL extraction with realistic issue content.""" 71 | 72 | def test_mixed_urls(self): 73 | issue_data = { 74 | "body": """ 75 | ## Bug Report 76 | 77 | When uploading files, I see this error. Here's a screenshot: 78 | https://github.com/user-attachments/assets/abc123def456 79 | 80 | The logs show: https://github.com/user-attachments/files/789/error-log.txt 81 | 82 | This is similar to https://github.com/someorg/somerepo/issues/42 but different. 83 | 84 | You can also see the video at https://user-images.githubusercontent.com/12345/video-demo.mov 85 | 86 | Here's how to reproduce: 87 | ```bash 88 | # Don't extract this example URL: 89 | curl https://github.com/user-attachments/assets/example999 90 | ``` 91 | 92 | More info at https://docs.example.com/guide 93 | 94 | Also see this inline code `https://github.com/user-attachments/files/111/inline.pdf` should not extract. 95 | 96 | Final attachment: https://github.com/user-attachments/files/222/report.pdf. 97 | """, 98 | "comment_data": [ 99 | { 100 | "body": "Here's another attachment: https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" 101 | }, 102 | { 103 | "body": """ 104 | Example code: 105 | ```python 106 | url = "https://github.com/user-attachments/assets/code-example" 107 | ``` 108 | But this is real: https://github.com/user-attachments/files/333/actual.zip 109 | """ 110 | }, 111 | ], 112 | } 113 | 114 | # Extract URLs 115 | urls = github_backup.extract_attachment_urls(issue_data) 116 | 117 | expected_urls = [ 118 | "https://github.com/user-attachments/assets/abc123def456", 119 | "https://github.com/user-attachments/files/789/error-log.txt", 120 | "https://user-images.githubusercontent.com/12345/video-demo.mov", 121 | "https://github.com/user-attachments/files/222/report.pdf", 122 | "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123", 123 | "https://github.com/user-attachments/files/333/actual.zip", 124 | ] 125 | 126 | assert set(urls) == set(expected_urls) 127 | 128 | def test_trailing_punctuation_stripped(self): 129 | """URLs with trailing punctuation should have punctuation stripped.""" 130 | issue_data = { 131 | "body": """ 132 | See this file: https://github.com/user-attachments/files/1/doc.pdf. 133 | And this one (https://github.com/user-attachments/files/2/image.png). 134 | Check it out! https://github.com/user-attachments/files/3/data.csv! 
135 | """ 136 | } 137 | 138 | urls = github_backup.extract_attachment_urls(issue_data) 139 | 140 | expected = [ 141 | "https://github.com/user-attachments/files/1/doc.pdf", 142 | "https://github.com/user-attachments/files/2/image.png", 143 | "https://github.com/user-attachments/files/3/data.csv", 144 | ] 145 | assert set(urls) == set(expected) 146 | 147 | def test_deduplication_across_body_and_comments(self): 148 | """Same URL in body and comments should only appear once.""" 149 | duplicate_url = "https://github.com/user-attachments/assets/abc123" 150 | 151 | issue_data = { 152 | "body": f"First mention: {duplicate_url}", 153 | "comment_data": [ 154 | {"body": f"Second mention: {duplicate_url}"}, 155 | {"body": f"Third mention: {duplicate_url}"}, 156 | ], 157 | } 158 | 159 | urls = github_backup.extract_attachment_urls(issue_data) 160 | 161 | assert set(urls) == {duplicate_url} 162 | 163 | 164 | class TestFilenameExtraction: 165 | """Test filename extraction from different URL types.""" 166 | 167 | def test_modern_assets_url(self): 168 | """Modern assets URL returns UUID.""" 169 | url = "https://github.com/user-attachments/assets/abc123def456" 170 | filename = github_backup.get_attachment_filename(url) 171 | assert filename == "abc123def456" 172 | 173 | def test_modern_files_url(self): 174 | """Modern files URL returns filename.""" 175 | url = "https://github.com/user-attachments/files/12345/report.pdf" 176 | filename = github_backup.get_attachment_filename(url) 177 | assert filename == "report.pdf" 178 | 179 | def test_legacy_cdn_url(self): 180 | """Legacy CDN URL returns filename with extension.""" 181 | url = "https://user-images.githubusercontent.com/123456/abc-def.png" 182 | filename = github_backup.get_attachment_filename(url) 183 | assert filename == "abc-def.png" 184 | 185 | def test_private_cdn_url(self): 186 | """Private CDN URL returns filename.""" 187 | url = "https://private-user-images.githubusercontent.com/98765/secret.png?jwt=token123" 188 | filename = github_backup.get_attachment_filename(url) 189 | assert filename == "secret.png" 190 | 191 | def test_repo_files_url(self): 192 | """Repo-scoped files URL returns filename.""" 193 | url = "https://github.com/owner/repo/files/789/document.txt" 194 | filename = github_backup.get_attachment_filename(url) 195 | assert filename == "document.txt" 196 | 197 | 198 | class TestFilenameCollision: 199 | """Test filename collision resolution.""" 200 | 201 | def test_collision_behavior(self): 202 | """Test filename collision resolution with real files.""" 203 | with tempfile.TemporaryDirectory() as tmpdir: 204 | # No collision - file doesn't exist 205 | result = github_backup.resolve_filename_collision( 206 | os.path.join(tmpdir, "report.pdf") 207 | ) 208 | assert result == os.path.join(tmpdir, "report.pdf") 209 | 210 | # Create the file, now collision exists 211 | Path(os.path.join(tmpdir, "report.pdf")).touch() 212 | result = github_backup.resolve_filename_collision( 213 | os.path.join(tmpdir, "report.pdf") 214 | ) 215 | assert result == os.path.join(tmpdir, "report_1.pdf") 216 | 217 | # Create report_1.pdf too 218 | Path(os.path.join(tmpdir, "report_1.pdf")).touch() 219 | result = github_backup.resolve_filename_collision( 220 | os.path.join(tmpdir, "report.pdf") 221 | ) 222 | assert result == os.path.join(tmpdir, "report_2.pdf") 223 | 224 | def test_manifest_reserved(self): 225 | """manifest.json is always treated as reserved.""" 226 | with tempfile.TemporaryDirectory() as tmpdir: 227 | # Even if manifest.json doesn't exist, should get 
manifest_1.json 228 | result = github_backup.resolve_filename_collision( 229 | os.path.join(tmpdir, "manifest.json") 230 | ) 231 | assert result == os.path.join(tmpdir, "manifest_1.json") 232 | 233 | 234 | class TestManifestDuplicatePrevention: 235 | """Test that manifest prevents duplicate downloads (the bug fix).""" 236 | 237 | def test_manifest_filters_existing_urls(self, attachment_test_setup): 238 | """URLs in manifest are not re-downloaded.""" 239 | setup = attachment_test_setup 240 | 241 | # Create manifest with existing URLs 242 | attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") 243 | os.makedirs(attachments_dir) 244 | manifest_path = os.path.join(attachments_dir, "manifest.json") 245 | 246 | manifest = { 247 | "attachments": [ 248 | { 249 | "url": "https://github.com/user-attachments/assets/old1", 250 | "success": True, 251 | "saved_as": "old1.pdf", 252 | }, 253 | { 254 | "url": "https://github.com/user-attachments/assets/old2", 255 | "success": True, 256 | "saved_as": "old2.pdf", 257 | }, 258 | ] 259 | } 260 | with open(manifest_path, "w") as f: 261 | json.dump(manifest, f) 262 | 263 | # Issue data with 2 old URLs and 1 new URL 264 | issue_data = { 265 | "body": """ 266 | Old: https://github.com/user-attachments/assets/old1 267 | Old: https://github.com/user-attachments/assets/old2 268 | New: https://github.com/user-attachments/assets/new1 269 | """ 270 | } 271 | 272 | downloaded_urls = setup["call_download"](issue_data) 273 | 274 | # Should only download the NEW URL (old ones filtered by manifest) 275 | assert len(downloaded_urls) == 1 276 | assert downloaded_urls[0] == "https://github.com/user-attachments/assets/new1" 277 | 278 | def test_no_manifest_downloads_all(self, attachment_test_setup): 279 | """Without manifest, all URLs should be downloaded.""" 280 | setup = attachment_test_setup 281 | 282 | # Issue data with 2 URLs 283 | issue_data = { 284 | "body": """ 285 | https://github.com/user-attachments/assets/url1 286 | https://github.com/user-attachments/assets/url2 287 | """ 288 | } 289 | 290 | downloaded_urls = setup["call_download"](issue_data) 291 | 292 | # Should download ALL URLs (no manifest to filter) 293 | assert len(downloaded_urls) == 2 294 | assert set(downloaded_urls) == { 295 | "https://github.com/user-attachments/assets/url1", 296 | "https://github.com/user-attachments/assets/url2", 297 | } 298 | 299 | def test_manifest_skips_permanent_failures(self, attachment_test_setup): 300 | """Manifest skips permanent failures (404, 410) but retries transient (503).""" 301 | setup = attachment_test_setup 302 | 303 | # Create manifest with different failure types 304 | attachments_dir = os.path.join(setup["issue_cwd"], "attachments", "123") 305 | os.makedirs(attachments_dir) 306 | manifest_path = os.path.join(attachments_dir, "manifest.json") 307 | 308 | manifest = { 309 | "attachments": [ 310 | { 311 | "url": "https://github.com/user-attachments/assets/success", 312 | "success": True, 313 | "saved_as": "success.pdf", 314 | }, 315 | { 316 | "url": "https://github.com/user-attachments/assets/notfound", 317 | "success": False, 318 | "http_status": 404, 319 | }, 320 | { 321 | "url": "https://github.com/user-attachments/assets/gone", 322 | "success": False, 323 | "http_status": 410, 324 | }, 325 | { 326 | "url": "https://github.com/user-attachments/assets/unavailable", 327 | "success": False, 328 | "http_status": 503, 329 | }, 330 | ] 331 | } 332 | with open(manifest_path, "w") as f: 333 | json.dump(manifest, f) 334 | 335 | # Issue data has all 4 URLs 
336 | issue_data = { 337 | "body": """ 338 | https://github.com/user-attachments/assets/success 339 | https://github.com/user-attachments/assets/notfound 340 | https://github.com/user-attachments/assets/gone 341 | https://github.com/user-attachments/assets/unavailable 342 | """ 343 | } 344 | 345 | downloaded_urls = setup["call_download"](issue_data) 346 | 347 | # Should only retry 503 (transient failure) 348 | # Success, 404, and 410 should be skipped 349 | assert len(downloaded_urls) == 1 350 | assert ( 351 | downloaded_urls[0] 352 | == "https://github.com/user-attachments/assets/unavailable" 353 | ) 354 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | github-backup 3 | ============= 4 | 5 | |PyPI| |Python Versions| 6 | 7 | The package can be used to backup an *entire* `Github `_ organization, repository or user account, including starred repos, issues and wikis, in the most appropriate format (clones for wikis, JSON files for issues). 8 | 9 | Requirements 10 | ============ 11 | 12 | - Python 3.10 or higher 13 | - Git 1.9+ 14 | 15 | Installation 16 | ============ 17 | 18 | Using PIP via PyPI:: 19 | 20 | pip install github-backup 21 | 22 | Using PIP via Github (more likely the latest version):: 23 | 24 | pip install git+https://github.com/josegonzalez/python-github-backup.git#egg=github-backup 25 | 26 | *Install note for python newcomers:* 27 | 28 | Python scripts are unlikely to be included in your ``$PATH`` by default. This means the tool cannot be run directly in a terminal with ``$ github-backup ...``; either add Python's install path to your environment's ``$PATH`` or call the script directly, e.g. using ``$ ~/.local/bin/github-backup``. 29 | 30 | Basic Help 31 | ========== 32 | 33 | Show the CLI help output:: 34 | 35 | github-backup -h 36 | 37 | CLI Help output:: 38 | 39 | github-backup [-h] [-u USERNAME] [-p PASSWORD] [-t TOKEN_CLASSIC] 40 | [-f TOKEN_FINE] [--as-app] [-o OUTPUT_DIRECTORY] 41 | [-l LOG_LEVEL] [-i] [--starred] [--all-starred] 42 | [--watched] [--followers] [--following] [--all] [--issues] 43 | [--issue-comments] [--issue-events] [--pulls] 44 | [--pull-comments] [--pull-commits] [--pull-details] 45 | [--labels] [--hooks] [--milestones] [--repositories] 46 | [--bare] [--lfs] [--wikis] [--gists] [--starred-gists] 47 | [--skip-archived] [--skip-existing] [-L [LANGUAGES ...]] 48 | [-N NAME_REGEX] [-H GITHUB_HOST] [-O] [-R REPOSITORY] 49 | [-P] [-F] [--prefer-ssh] [-v] 50 | [--keychain-name OSX_KEYCHAIN_ITEM_NAME] 51 | [--keychain-account OSX_KEYCHAIN_ITEM_ACCOUNT] 52 | [--releases] [--latest-releases NUMBER_OF_LATEST_RELEASES] 53 | [--skip-prerelease] [--assets] [--skip-assets-on [REPO ...]] 54 | [--attachments] [--exclude [REPOSITORY [REPOSITORY ...]]] 55 | [--throttle-limit THROTTLE_LIMIT] [--throttle-pause THROTTLE_PAUSE] 56 | USER 57 | 58 | Backup a github account 59 | 60 | positional arguments: 61 | USER github username 62 | 63 | optional arguments: 64 | -h, --help show this help message and exit 65 | -u USERNAME, --username USERNAME 66 | username for basic auth 67 | -p PASSWORD, --password PASSWORD 68 | password for basic auth. If a username is given but 69 | not a password, the password will be prompted for. 70 | -f TOKEN_FINE, --token-fine TOKEN_FINE 71 | fine-grained personal access token or path to token 72 | (file://...)
73 | -t TOKEN_CLASSIC, --token TOKEN_CLASSIC 74 | personal access, OAuth, or JSON Web token, or path to 75 | token (file://...) 76 | --as-app authenticate as github app instead of as a user. 77 | -o OUTPUT_DIRECTORY, --output-directory OUTPUT_DIRECTORY 78 | directory at which to backup the repositories 79 | -l LOG_LEVEL, --log-level LOG_LEVEL 80 | log level to use (default: info, possible levels: 81 | debug, info, warning, error, critical) 82 | -i, --incremental incremental backup 83 | --incremental-by-files incremental backup using modified time of files 84 | --starred include JSON output of starred repositories in backup 85 | --all-starred include starred repositories in backup [*] 86 | --watched include JSON output of watched repositories in backup 87 | --followers include JSON output of followers in backup 88 | --following include JSON output of following users in backup 89 | --all include everything in backup (not including [*]) 90 | --issues include issues in backup 91 | --issue-comments include issue comments in backup 92 | --issue-events include issue events in backup 93 | --pulls include pull requests in backup 94 | --pull-comments include pull request review comments in backup 95 | --pull-commits include pull request commits in backup 96 | --pull-details include more pull request details in backup [*] 97 | --labels include labels in backup 98 | --hooks include hooks in backup (works only when 99 | authenticated) 100 | --milestones include milestones in backup 101 | --repositories include repository clone in backup 102 | --bare clone bare repositories 103 | --lfs clone LFS repositories (requires Git LFS to be 104 | installed, https://git-lfs.github.com) [*] 105 | --wikis include wiki clone in backup 106 | --gists include gists in backup [*] 107 | --starred-gists include starred gists in backup [*] 108 | --skip-existing skip project if a backup directory exists 109 | -L [LANGUAGES [LANGUAGES ...]], --languages [LANGUAGES [LANGUAGES ...]] 110 | only allow these languages 111 | -N NAME_REGEX, --name-regex NAME_REGEX 112 | python regex to match names against 113 | -H GITHUB_HOST, --github-host GITHUB_HOST 114 | GitHub Enterprise hostname 115 | -O, --organization whether or not this is an organization user 116 | -R REPOSITORY, --repository REPOSITORY 117 | name of repository to limit backup to 118 | -P, --private include private repositories [*] 119 | -F, --fork include forked repositories [*] 120 | --prefer-ssh Clone repositories using SSH instead of HTTPS 121 | -v, --version show program's version number and exit 122 | --keychain-name OSX_KEYCHAIN_ITEM_NAME 123 | OSX ONLY: name field of password item in OSX keychain 124 | that holds the personal access or OAuth token 125 | --keychain-account OSX_KEYCHAIN_ITEM_ACCOUNT 126 | OSX ONLY: account field of password item in OSX 127 | keychain that holds the personal access or OAuth token 128 | --releases include release information, not including assets or 129 | binaries 130 | --latest-releases NUMBER_OF_LATEST_RELEASES 131 | include certain number of the latest releases; 132 | only applies if including releases 133 | --skip-prerelease skip prerelease and draft versions; only applies if including releases 134 | --assets include assets alongside release information; only 135 | applies if including releases 136 | --skip-assets-on [REPO ...] 137 | skip asset downloads for these repositories (e.g. 
138 | --skip-assets-on repo1 owner/repo2) 139 | --attachments download user-attachments from issues and pull requests 140 | to issues/attachments/{issue_number}/ and 141 | pulls/attachments/{pull_number}/ directories 142 | --exclude [REPOSITORY [REPOSITORY ...]] 143 | names of repositories to exclude from backup. 144 | --throttle-limit THROTTLE_LIMIT 145 | start throttling of GitHub API requests after this 146 | amount of API requests remain 147 | --throttle-pause THROTTLE_PAUSE 148 | wait this amount of seconds when API request 149 | throttling is active (default: 30.0, requires 150 | --throttle-limit to be set) 151 | 152 | 153 | Usage Details 154 | ============= 155 | 156 | Authentication 157 | -------------- 158 | 159 | **Password-based authentication** will fail if you have two-factor authentication enabled, and will `be deprecated `_ by the end of 2023. 160 | 161 | ``--username`` is used for basic password authentication and is separate from the positional argument ``USER``, which specifies the user account you wish to back up. 162 | 163 | **Classic tokens** are `slightly less secure `_ as they provide very coarse-grained permissions. 164 | 165 | If you need authentication for long-running backups (e.g. for a cron job), it is recommended to use a **fine-grained personal access token** (``-f TOKEN_FINE``). 166 | 167 | 168 | Fine Tokens 169 | ~~~~~~~~~~~ 170 | 171 | You can "Generate new token", choosing the repository scope by selecting specific repos or all repos. On Github this is under *Settings -> Developer Settings -> Personal access tokens -> Fine-grained Tokens*. 172 | 173 | Customise the permissions for your use case, but for a full backup of a personal account you'll need to enable the following permissions: 174 | 175 | **User permissions**: Read access to followers, starring, and watching. 176 | 177 | **Repository permissions**: Read access to contents, issues, metadata, pull requests, and webhooks. 178 | 179 | 180 | GitHub Apps 181 | ~~~~~~~~~~~ 182 | 183 | GitHub Apps are ideal for organization backups in CI/CD. Tokens are scoped to specific repositories and expire after 1 hour. 184 | 185 | **One-time setup:** 186 | 187 | 1. Create a GitHub App at *Settings -> Developer Settings -> GitHub Apps -> New GitHub App* 188 | 2. Set a name and homepage URL (can be any URL) 189 | 3. Uncheck "Webhook > Active" (not needed for backups) 190 | 4. Set permissions (same as fine-grained tokens above) 191 | 5. Click "Create GitHub App", then note the **App ID** shown on the next page 192 | 6. Under "Private keys", click "Generate a private key" and save the downloaded file 193 | 7. Go to *Install App* in your app's settings 194 | 8. Select the account/organization and which repositories to back up 195 | 196 | **CI/CD usage with GitHub Actions:** 197 | 198 | Store the App ID as a repository variable and the private key contents as a secret, then use ``actions/create-github-app-token``:: 199 | 200 | - uses: actions/create-github-app-token@v1 201 | id: app-token 202 | with: 203 | app-id: ${{ vars.APP_ID }} 204 | private-key: ${{ secrets.APP_PRIVATE_KEY }} 205 | 206 | - run: github-backup myorg -t ${{ steps.app-token.outputs.token }} --as-app -o ./backup --all 207 | 208 | Note: Installation tokens expire after 1 hour. For long-running backups, use a fine-grained personal access token instead.
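Outside of GitHub Actions, you can mint an installation token with any helper and hand it to ``github-backup`` on stdin via the ``file://`` token support (see the stdin example later in this README). A sketch, where ``my-app-token-helper`` is a hypothetical command standing in for whatever tool mints your installation token::

    my-app-token-helper | github-backup myorg -t file:///dev/stdin --as-app -o ./backup --all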
209 | 210 | 211 | Prefer SSH 212 | ~~~~~~~~~~ 213 | 214 | If cloning repos is enabled with ``--repositories``, ``--all-starred``, ``--wikis``, ``--gists`` or ``--starred-gists``, using the ``--prefer-ssh`` argument will use SSH for cloning the git repos, but all other connections will still use their own protocol; e.g. API requests for issues use HTTPS. 215 | 216 | To clone with SSH, you'll need SSH authentication set up `as usual with Github `_, e.g. via SSH public and private keys. 217 | 218 | 219 | Using the Keychain on Mac OSX 220 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 221 | Note: On Mac OSX the token can be stored securely in the user's keychain. To do this: 222 | 223 | 1. Open Keychain Access from "Applications -> Utilities" 224 | 2. Add a new password item using "File -> New Password Item" 225 | 3. Enter a name in the "Keychain Item Name" box. You must provide this name to github-backup using the --keychain-name argument. 226 | 4. Enter an account name (your Github username) in the "Account Name" box. You must provide this name to github-backup using the --keychain-account argument. 227 | 5. Enter your Github personal access token in the "Password" box 228 | 229 | Note: When you run github-backup, you will be asked whether you want to allow "security" to use your confidential information stored in your keychain. You have two options: 230 | 231 | 1. **Allow:** In this case you will need to click "Allow" each time you run ``github-backup`` 232 | 2. **Always Allow:** In this case, you will not be asked for permission when you run ``github-backup`` in future. This is less secure, but is required if you want to schedule ``github-backup`` to run automatically 233 | 234 | 235 | Github Rate-limit and Throttling 236 | -------------------------------- 237 | 238 | "github-backup" will automatically throttle itself based on feedback from the Github API. 239 | 240 | The API is usually rate-limited to 5000 calls per hour. The API will ask github-backup to pause until a specific time when the limit is reset again (at the start of the next hour). This continues until the backup is complete. 241 | 242 | During a large backup, such as ``--all-starred``, and on a fast connection, this can result in long (~20 min) pauses, with bursts of API calls periodically maxing out the API limit. If this is not suitable, `it has been observed `_ under real-world conditions that overriding the throttle with ``--throttle-limit 5000 --throttle-pause 0.6`` provides a smooth rate across the hour, although ``--throttle-pause 0.72`` (3600 seconds [1 hour] / 5000 calls) is theoretically safer to prevent large rate-limit pauses. 243 | 244 | 245 | About Git LFS 246 | ------------- 247 | 248 | When you use the ``--lfs`` option, you will need to make sure you have Git LFS installed. 249 | 250 | Instructions on how to do this can be found on https://git-lfs.github.com. 251 | 252 | LFS objects are fetched for all refs, not just the current checkout, ensuring a complete backup of all LFS content across all branches and history. 253 | 254 | 255 | About Attachments 256 | ----------------- 257 | 258 | When you use the ``--attachments`` option with ``--issues`` or ``--pulls``, the tool will download user-uploaded attachments (images, videos, documents, etc.) from issue and pull request descriptions and comments. Attachments often contain valuable data related to the topic, and without backing them up important information or context might be lost.
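For example, issues and pull requests can be backed up together with their attachments like this (a sketch; the user, token, and output path are placeholders)::

    export ACCESS_TOKEN=SOME-GITHUB-TOKEN
    github-backup myuser -t $ACCESS_TOKEN -o /backup --issues --pulls --attachments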
259 | 260 | Attachments are saved to ``issues/attachments/{issue_number}/`` and ``pulls/attachments/{pull_number}/`` directories, where ``{issue_number}`` is the GitHub issue number (e.g., issue #123 saves to ``issues/attachments/123/``). Each attachment directory contains: 261 | 262 | - The downloaded attachment files (named by their GitHub identifier with appropriate file extensions) 263 | - If multiple attachments have the same filename, conflicts are resolved with numeric suffixes (e.g., ``report.pdf``, ``report_1.pdf``, ``report_2.pdf``) 264 | - A ``manifest.json`` file documenting all downloads, including URLs, file metadata, and download status 265 | 266 | The tool automatically extracts file extensions from HTTP headers to ensure files can be more easily opened by your operating system. 267 | 268 | **Supported URL formats:** 269 | 270 | - Modern: ``github.com/user-attachments/{assets,files}/*`` 271 | - Legacy: ``user-images.githubusercontent.com/*`` and ``private-user-images.githubusercontent.com/*`` 272 | - Repo files: ``github.com/{owner}/{repo}/files/*`` (filtered to current repository) 273 | - Repo assets: ``github.com/{owner}/{repo}/assets/*`` (filtered to current repository) 274 | 275 | **Repository filtering** for repo files/assets handles renamed and transferred repositories gracefully. URLs are included if they either match the current repository name directly, or redirect to it (e.g., ``willmcgugan/rich`` redirects to ``Textualize/rich`` after transfer). 276 | 277 | 278 | Run in Docker container 279 | ----------------------- 280 | 281 | To run the tool in a Docker container, use the following command:: 282 | 283 | sudo docker run --rm -v /path/to/backup:/data --name github-backup ghcr.io/josegonzalez/python-github-backup -o /data $OPTIONS $USER 284 | 285 | Gotchas / Known-issues 286 | ====================== 287 | 288 | All is not everything 289 | --------------------- 290 | 291 | The ``--all`` argument does not include: cloning private repos (``-P, --private``), cloning forks (``-F, --fork``), cloning starred repositories (``--all-starred``), ``--pull-details``, cloning LFS repositories (``--lfs``), cloning gists (``--gists``) or cloning starred gist repos (``--starred-gists``). See examples for more. 292 | 293 | Cloning all starred size 294 | ------------------------ 295 | 296 | Using the ``--all-starred`` argument to clone all starred repositories may use a large amount of storage space, especially if ``--all`` or further arguments are used; e.g. commonly starred repos can have tens of thousands of issues, many large assets, and a large repository itself. Consider just storing links to starred repos in JSON format with ``--starred``. 297 | 298 | Incremental Backup 299 | ------------------ 300 | 301 | Using (``-i, --incremental``) will only request new data from the API **since the last run (successful or not)**, e.g. only requesting issues created or updated since the last run. 302 | 303 | This means any blocking errors on previous runs can cause a large amount of missing data in backups. 304 | 305 | Using (``--incremental-by-files``) will request new data from the API **based on when the file was modified on the filesystem**, so e.g. if you modify a file yourself you may miss something. 306 | 307 | This is still safer than plain ``--incremental``. 308 | 309 | Specifically, issues and pull requests are handled this way. 310 | 311 | Known blocking errors 312 | --------------------- 313 | 314 | Some errors will block the backup run by exiting the script, e.g.
receiving a 403 Forbidden error from the Github API. 315 | 316 | If the incremental argument is used, this will result in the next backup only requesting API data since the last blocked/failed run, potentially causing unexpectedly large amounts of missing data. 317 | 318 | It's therefore recommended to only use the incremental argument if the output/result is being actively monitored, or to complement it with periodic full non-incremental runs, to avoid unexpected missing data in regular backup runs. 319 | 320 | **Starred public repo hooks blocking** 321 | 322 | Since the ``--all`` argument includes ``--hooks``, if you use ``--all`` and ``--all-starred`` together to clone a user's starred public repositories, the backup will likely error and stop, blocking the backup from continuing. 323 | 324 | This is because ``--hooks`` requires the correct permission on public repos. 325 | 326 | 327 | "bare" is actually "mirror" 328 | --------------------------- 329 | 330 | Using the bare clone argument (``--bare``) will actually call git's ``clone --mirror`` command. There's a subtle difference between `bare `_ and `mirror `_ clones. 331 | 332 | *From the git docs: "Compared to --bare, --mirror not only maps local branches of the source to local branches of the target, it maps all refs (including remote-tracking branches, notes etc.) and sets up a refspec configuration such that all these refs are overwritten by a git remote update in the target repository."* 333 | 334 | 335 | Starred gists vs starred repo behaviour 336 | --------------------------------------- 337 | 338 | The starred repo cloning argument (``--all-starred``) stores starred repos separately from the user's own repositories. However, ``--starred-gists`` will store starred gists within the same directory as the user's own gists (``--gists``). Also, all gist repo directory names are IDs, not the gist's name. 339 | 340 | Note: ``--starred-gists`` only retrieves starred gists for the authenticated user, not the target user, due to a GitHub API limitation. 341 | 342 | 343 | Skip existing on incomplete backups 344 | ----------------------------------- 345 | 346 | The ``--skip-existing`` argument will skip a backup if the directory already exists, even if the backup in that directory failed (perhaps due to a blocking error). This may result in unexpected missing data in a regular backup. 347 | 348 | 349 | Updates use fetch, not pull 350 | --------------------------- 351 | 352 | When updating an existing repository backup, ``github-backup`` uses ``git fetch`` rather than ``git pull``. This is intentional - a backup tool should reliably download data without risk of failure. Using ``git pull`` would require handling merge conflicts, which adds complexity and could cause backups to fail unexpectedly. 353 | 354 | With fetch, **all branches and commits are downloaded** safely into remote-tracking branches. The working directory files won't change, but your backup is complete. 355 | 356 | If you look at files directly (e.g., ``cat README.md``), you'll see the old content. The new data is in the remote-tracking branches (confusingly named "remote" but stored locally). To view or use the latest files:: 357 | 358 | git show origin/main:README.md # view a file 359 | git merge origin/main # update working directory 360 | 361 | All branches are backed up as remote refs (``origin/main``, ``origin/feature-branch``, etc.).
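For example, to check what a fetched backup actually contains without touching the working directory (a sketch; assumes the default branch is ``main``)::

    git branch -r               # list the remote-tracking branches in the backup
    git log -n 3 origin/main    # inspect recent commits on a fetched branch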
362 | 363 | If you want to browse files directly without merging, consider using ``--bare`` which skips the working directory entirely - the backup is just the git data. 364 | 365 | See `#269 `_ for more discussion. 366 | 367 | 368 | Github Backup Examples 369 | ====================== 370 | 371 | Backup all repositories, including private ones, using a classic token:: 372 | 373 | export ACCESS_TOKEN=SOME-GITHUB-TOKEN 374 | github-backup WhiteHouse --token $ACCESS_TOKEN --organization --output-directory /tmp/white-house --repositories --private 375 | 376 | Use a fine-grained access token to backup a single organization repository with everything else (wiki, pull requests, comments, issues, etc.):: 377 | 378 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 379 | ORGANIZATION=docker 380 | REPO=cli 381 | # e.g. git@github.com:docker/cli.git 382 | github-backup $ORGANIZATION -P -f $FINE_ACCESS_TOKEN -o . --all -O -R $REPO 383 | 384 | Quietly and incrementally backup useful Github user data (public and private repos with SSH), including all issues, pulls, and all public starred repos and gists (omitting "hooks" to prevent blocking). *Great for a cron job.* :: 385 | 386 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 387 | GH_USER=YOUR-GITHUB-USER 388 | 389 | github-backup -f $FINE_ACCESS_TOKEN --prefer-ssh -o ~/github-backup/ -l error -P -i --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --attachments --pull-details --gists --starred-gists $GH_USER 390 | 391 | Debug an error/block or incomplete backup into a temporary directory. Omit "incremental" to fill a previous incomplete backup. :: 392 | 393 | export FINE_ACCESS_TOKEN=SOME-GITHUB-TOKEN 394 | GH_USER=YOUR-GITHUB-USER 395 | 396 | github-backup -f $FINE_ACCESS_TOKEN -o /tmp/github-backup/ -l debug -P --all-starred --starred --watched --followers --following --issues --issue-comments --issue-events --pulls --pull-comments --pull-commits --labels --milestones --repositories --wikis --releases --assets --pull-details --gists --starred-gists $GH_USER 397 | 398 | Pipe a token from stdin to avoid storing it in environment variables or command history (Unix-like systems only):: 399 | 400 | my-secret-manager get github-token | github-backup user -t file:///dev/stdin -o /backup --repositories 401 | 402 | Restoring from Backup 403 | ===================== 404 | 405 | This tool creates backups only; there is no built-in restore command. 406 | 407 | **Git repositories, wikis, and gists** can be restored by pushing them back to GitHub as you would any git repository. For example, to restore a bare repository backup:: 408 | 409 | cd /tmp/white-house/repositories/petitions/repository 410 | git push --mirror git@github.com:WhiteHouse/petitions.git 411 | 412 | **Issues, pull requests, comments, and other metadata** are saved as JSON files for archival purposes. The GitHub API does not support recreating this data faithfully; creating issues via the API has limitations: 413 | 414 | - New issue/PR numbers are assigned (original numbers cannot be set) 415 | - Timestamps reflect creation time (original dates cannot be set) 416 | - The API caller becomes the author (original authors cannot be set) 417 | - Cross-references between issues and PRs will break 418 | 419 | These are GitHub API limitations that affect all backup and migration tools, not just this one. Recreating issues with these limitations via the GitHub API is an exercise for the reader. The JSON backups remain useful for searching, auditing, or manual reference.
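For example, the archived issue JSON can be searched with standard tools; a sketch, assuming the default output layout from the restore example above (issue data stored under each repository's ``issues/`` directory)::

    grep -rl "rate limit" /tmp/white-house/repositories/*/issues/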
420 | 421 | 422 | Development 423 | =========== 424 | 425 | This project is considered feature complete for the primary maintainer @josegonzalez. If you would like a bugfix or enhancement, pull requests are welcome. Feel free to contact the maintainer for consulting estimates if you'd like to sponsor the work instead. 426 | 427 | Contributors 428 | ------------ 429 | 430 | A huge thanks to all the contributors! 431 | 432 | .. image:: https://contrib.rocks/image?repo=josegonzalez/python-github-backup 433 | :target: https://github.com/josegonzalez/python-github-backup/graphs/contributors 434 | :alt: contributors 435 | 436 | Testing 437 | ------- 438 | 439 | To run the test suite:: 440 | 441 | pip install pytest 442 | pytest 443 | 444 | To run linting:: 445 | 446 | pip install flake8 447 | flake8 --ignore=E501 448 | 449 | 450 | .. |PyPI| image:: https://img.shields.io/pypi/v/github-backup.svg 451 | :target: https://pypi.python.org/pypi/github-backup/ 452 | .. |Python Versions| image:: https://img.shields.io/pypi/pyversions/github-backup.svg 453 | :target: https://github.com/josegonzalez/python-github-backup 454 | -------------------------------------------------------------------------------- /github_backup/github_backup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | 5 | import argparse 6 | import base64 7 | import calendar 8 | import codecs 9 | import errno 10 | import getpass 11 | import json 12 | import logging 13 | import os 14 | import platform 15 | import re 16 | import select 17 | import socket 18 | import ssl 19 | import subprocess 20 | import sys 21 | import time 22 | from datetime import datetime 23 | from http.client import IncompleteRead 24 | from urllib.error import HTTPError, URLError 25 | from urllib.parse import quote as urlquote 26 | from urllib.parse import urlencode, urlparse 27 | from urllib.request import HTTPRedirectHandler, Request, build_opener, urlopen 28 | 29 | try: 30 | from . import __version__ 31 | 32 | VERSION = __version__ 33 | except ImportError: 34 | VERSION = "unknown" 35 | 36 | FNULL = open(os.devnull, "w") 37 | FILE_URI_PREFIX = "file://" 38 | logger = logging.getLogger(__name__) 39 | 40 | 41 | class RepositoryUnavailableError(Exception): 42 | """Raised when a repository is unavailable due to legal reasons (e.g., DMCA takedown).""" 43 | 44 | def __init__(self, message, dmca_url=None): 45 | super().__init__(message) 46 | self.dmca_url = dmca_url 47 | 48 | 49 | # Setup SSL context with fallback chain 50 | https_ctx = ssl.create_default_context() 51 | if https_ctx.get_ca_certs(): 52 | # Layer 1: Certificates pre-loaded from system (file-based) 53 | pass 54 | else: 55 | paths = ssl.get_default_verify_paths() 56 | if (paths.cafile and os.path.exists(paths.cafile)) or ( 57 | paths.capath and os.path.exists(paths.capath) 58 | ): 59 | # Layer 2: Cert paths exist, will be lazy-loaded on first use (directory-based) 60 | pass 61 | else: 62 | # Layer 3: Try certifi package as optional fallback 63 | try: 64 | import certifi 65 | 66 | https_ctx = ssl.create_default_context(cafile=certifi.where()) 67 | except ImportError: 68 | # All layers failed - no certificates available anywhere 69 | sys.exit( 70 | "\nERROR: No CA certificates found.
Cannot connect to GitHub over SSL.\n\n" 71 | "Solutions you can explore:\n" 72 | " 1. pip install certifi\n" 73 | " 2. Alpine: apk add ca-certificates\n" 74 | " 3. Debian/Ubuntu: apt-get install ca-certificates\n\n" 75 | ) 76 | 77 | 78 | def logging_subprocess( 79 | popenargs, stdout_log_level=logging.DEBUG, stderr_log_level=logging.ERROR, **kwargs 80 | ): 81 | """ 82 | Variant of subprocess.call that accepts a logger instead of stdout/stderr, 83 | and logs stdout messages via logger.debug and stderr messages via 84 | logger.error. 85 | """ 86 | child = subprocess.Popen( 87 | popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs 88 | ) 89 | if sys.platform == "win32": 90 | logger.info( 91 | "Windows operating system detected - no subprocess logging will be returned" 92 | ) 93 | 94 | log_level = {child.stdout: stdout_log_level, child.stderr: stderr_log_level} 95 | 96 | def check_io(): 97 | if sys.platform == "win32": 98 | return 99 | ready_to_read = select.select([child.stdout, child.stderr], [], [], 1000)[0] 100 | for io in ready_to_read: 101 | line = io.readline() 102 | if not logger: 103 | continue 104 | if not (io == child.stderr and not line): 105 | logger.log(log_level[io], line[:-1]) 106 | 107 | # keep checking stdout/stderr until the child exits 108 | while child.poll() is None: 109 | check_io() 110 | 111 | check_io() # check again to catch anything after the process exits 112 | 113 | rc = child.wait() 114 | 115 | if rc != 0: 116 | print("{} returned {}:".format(popenargs[0], rc), file=sys.stderr) 117 | print("\t", " ".join(popenargs), file=sys.stderr) 118 | 119 | return rc 120 | 121 | 122 | def mkdir_p(*args): 123 | for path in args: 124 | try: 125 | os.makedirs(path) 126 | except OSError as exc: # Python >2.5 127 | if exc.errno == errno.EEXIST and os.path.isdir(path): 128 | pass 129 | else: 130 | raise 131 | 132 | 133 | def mask_password(url, secret="*****"): 134 | parsed = urlparse(url) 135 | 136 | if not parsed.password: 137 | return url 138 | elif parsed.password == "x-oauth-basic": 139 | return url.replace(parsed.username, secret) 140 | 141 | return url.replace(parsed.password, secret) 142 | 143 | 144 | def parse_args(args=None): 145 | parser = argparse.ArgumentParser(description="Backup a github account") 146 | parser.add_argument("user", metavar="USER", type=str, help="github username") 147 | parser.add_argument( 148 | "-u", "--username", dest="username", help="username for basic auth" 149 | ) 150 | parser.add_argument( 151 | "-p", 152 | "--password", 153 | dest="password", 154 | help="password for basic auth. " 155 | "If a username is given but not a password, the " 156 | "password will be prompted for.", 157 | ) 158 | parser.add_argument( 159 | "-t", 160 | "--token", 161 | dest="token_classic", 162 | help="personal access, OAuth, or JSON Web token, or path to token (file://...)", 163 | ) # noqa 164 | parser.add_argument( 165 | "-f", 166 | "--token-fine", 167 | dest="token_fine", 168 | help="fine-grained personal access token (github_pat_....), or path to token (file://...)", 169 | ) # noqa 170 | parser.add_argument( 171 | "-q", 172 | "--quiet", 173 | action="store_true", 174 | dest="quiet", 175 | help="suppress log messages less severe than warning, e.g.
info", 176 | ) 177 | parser.add_argument( 178 | "--as-app", 179 | action="store_true", 180 | dest="as_app", 181 | help="authenticate as github app instead of as a user.", 182 | ) 183 | parser.add_argument( 184 | "-o", 185 | "--output-directory", 186 | default=".", 187 | dest="output_directory", 188 | help="directory at which to backup the repositories", 189 | ) 190 | parser.add_argument( 191 | "-l", 192 | "--log-level", 193 | default="info", 194 | dest="log_level", 195 | help="log level to use (default: info, possible levels: debug, info, warning, error, critical)", 196 | ) 197 | parser.add_argument( 198 | "-i", 199 | "--incremental", 200 | action="store_true", 201 | dest="incremental", 202 | help="incremental backup", 203 | ) 204 | parser.add_argument( 205 | "--incremental-by-files", 206 | action="store_true", 207 | dest="incremental_by_files", 208 | help="incremental backup based on modification date of files", 209 | ) 210 | parser.add_argument( 211 | "--starred", 212 | action="store_true", 213 | dest="include_starred", 214 | help="include JSON output of starred repositories in backup", 215 | ) 216 | parser.add_argument( 217 | "--all-starred", 218 | action="store_true", 219 | dest="all_starred", 220 | help="include starred repositories in backup [*]", 221 | ) 222 | parser.add_argument( 223 | "--watched", 224 | action="store_true", 225 | dest="include_watched", 226 | help="include JSON output of watched repositories in backup", 227 | ) 228 | parser.add_argument( 229 | "--followers", 230 | action="store_true", 231 | dest="include_followers", 232 | help="include JSON output of followers in backup", 233 | ) 234 | parser.add_argument( 235 | "--following", 236 | action="store_true", 237 | dest="include_following", 238 | help="include JSON output of following users in backup", 239 | ) 240 | parser.add_argument( 241 | "--all", 242 | action="store_true", 243 | dest="include_everything", 244 | help="include everything in backup (not including [*])", 245 | ) 246 | parser.add_argument( 247 | "--issues", 248 | action="store_true", 249 | dest="include_issues", 250 | help="include issues in backup", 251 | ) 252 | parser.add_argument( 253 | "--issue-comments", 254 | action="store_true", 255 | dest="include_issue_comments", 256 | help="include issue comments in backup", 257 | ) 258 | parser.add_argument( 259 | "--issue-events", 260 | action="store_true", 261 | dest="include_issue_events", 262 | help="include issue events in backup", 263 | ) 264 | parser.add_argument( 265 | "--pulls", 266 | action="store_true", 267 | dest="include_pulls", 268 | help="include pull requests in backup", 269 | ) 270 | parser.add_argument( 271 | "--pull-comments", 272 | action="store_true", 273 | dest="include_pull_comments", 274 | help="include pull request review comments in backup", 275 | ) 276 | parser.add_argument( 277 | "--pull-commits", 278 | action="store_true", 279 | dest="include_pull_commits", 280 | help="include pull request commits in backup", 281 | ) 282 | parser.add_argument( 283 | "--pull-details", 284 | action="store_true", 285 | dest="include_pull_details", 286 | help="include more pull request details in backup [*]", 287 | ) 288 | parser.add_argument( 289 | "--labels", 290 | action="store_true", 291 | dest="include_labels", 292 | help="include labels in backup", 293 | ) 294 | parser.add_argument( 295 | "--hooks", 296 | action="store_true", 297 | dest="include_hooks", 298 | help="include hooks in backup (works only when authenticated)", 299 | ) # noqa 300 | parser.add_argument( 301 | "--milestones", 302 | 
action="store_true", 303 | dest="include_milestones", 304 | help="include milestones in backup", 305 | ) 306 | parser.add_argument( 307 | "--repositories", 308 | action="store_true", 309 | dest="include_repository", 310 | help="include repository clone in backup", 311 | ) 312 | parser.add_argument( 313 | "--bare", action="store_true", dest="bare_clone", help="clone bare repositories" 314 | ) 315 | parser.add_argument( 316 | "--no-prune", 317 | action="store_true", 318 | dest="no_prune", 319 | help="disable prune option for git fetch", 320 | ) 321 | parser.add_argument( 322 | "--lfs", 323 | action="store_true", 324 | dest="lfs_clone", 325 | help="clone LFS repositories (requires Git LFS to be installed, https://git-lfs.github.com) [*]", 326 | ) 327 | parser.add_argument( 328 | "--wikis", 329 | action="store_true", 330 | dest="include_wiki", 331 | help="include wiki clone in backup", 332 | ) 333 | parser.add_argument( 334 | "--gists", 335 | action="store_true", 336 | dest="include_gists", 337 | help="include gists in backup [*]", 338 | ) 339 | parser.add_argument( 340 | "--starred-gists", 341 | action="store_true", 342 | dest="include_starred_gists", 343 | help="include starred gists in backup [*]", 344 | ) 345 | parser.add_argument( 346 | "--skip-archived", 347 | action="store_true", 348 | dest="skip_archived", 349 | help="skip project if it is archived", 350 | ) 351 | parser.add_argument( 352 | "--skip-existing", 353 | action="store_true", 354 | dest="skip_existing", 355 | help="skip project if a backup directory exists", 356 | ) 357 | parser.add_argument( 358 | "-L", 359 | "--languages", 360 | dest="languages", 361 | help="only allow these languages", 362 | nargs="*", 363 | ) 364 | parser.add_argument( 365 | "-N", 366 | "--name-regex", 367 | dest="name_regex", 368 | help="python regex to match names against", 369 | ) 370 | parser.add_argument( 371 | "-H", "--github-host", dest="github_host", help="GitHub Enterprise hostname" 372 | ) 373 | parser.add_argument( 374 | "-O", 375 | "--organization", 376 | action="store_true", 377 | dest="organization", 378 | help="whether or not this is an organization user", 379 | ) 380 | parser.add_argument( 381 | "-R", 382 | "--repository", 383 | dest="repository", 384 | help="name of repository to limit backup to", 385 | ) 386 | parser.add_argument( 387 | "-P", 388 | "--private", 389 | action="store_true", 390 | dest="private", 391 | help="include private repositories [*]", 392 | ) 393 | parser.add_argument( 394 | "-F", 395 | "--fork", 396 | action="store_true", 397 | dest="fork", 398 | help="include forked repositories [*]", 399 | ) 400 | parser.add_argument( 401 | "--prefer-ssh", 402 | action="store_true", 403 | help="Clone repositories using SSH instead of HTTPS", 404 | ) 405 | parser.add_argument( 406 | "-v", "--version", action="version", version="%(prog)s " + VERSION 407 | ) 408 | parser.add_argument( 409 | "--keychain-name", 410 | dest="osx_keychain_item_name", 411 | help="OSX ONLY: name field of password item in OSX keychain that holds the personal access or OAuth token", 412 | ) 413 | parser.add_argument( 414 | "--keychain-account", 415 | dest="osx_keychain_item_account", 416 | help="OSX ONLY: account field of password item in OSX keychain that holds the personal access or OAuth token", 417 | ) 418 | parser.add_argument( 419 | "--releases", 420 | action="store_true", 421 | dest="include_releases", 422 | help="include release information, not including assets or binaries", 423 | ) 424 | parser.add_argument( 425 | "--latest-releases", 426 | 
type=int, 427 | default=0, 428 | dest="number_of_latest_releases", 429 | help="include only this number of the latest releases; only applies if including releases", 430 | ) 431 | parser.add_argument( 432 | "--skip-prerelease", 433 | action="store_true", 434 | dest="skip_prerelease", 435 | help="skip prerelease and draft versions; only applies if including releases", 436 | ) 437 | parser.add_argument( 438 | "--assets", 439 | action="store_true", 440 | dest="include_assets", 441 | help="include assets alongside release information; only applies if including releases", 442 | ) 443 | parser.add_argument( 444 | "--skip-assets-on", 445 | dest="skip_assets_on", 446 | nargs="*", 447 | help="skip asset downloads for these repositories", 448 | ) 449 | parser.add_argument( 450 | "--attachments", 451 | action="store_true", 452 | dest="include_attachments", 453 | help="download user-attachments from issues and pull requests", 454 | ) 455 | parser.add_argument( 456 | "--throttle-limit", 457 | dest="throttle_limit", 458 | type=int, 459 | default=0, 460 | help="start throttling GitHub API requests when no more than this number of API requests remain", 461 | ) 462 | parser.add_argument( 463 | "--throttle-pause", 464 | dest="throttle_pause", 465 | type=float, 466 | default=30.0, 467 | help="wait this number of seconds when API request throttling is active (default: 30.0, requires --throttle-limit to be set)", 468 | ) 469 | parser.add_argument( 470 | "--exclude", dest="exclude", help="names of repositories to exclude", nargs="*" 471 | ) 472 | return parser.parse_args(args) 473 | 474 | 475 | def get_auth(args, encode=True, for_git_cli=False): 476 | auth = None 477 | 478 | if args.osx_keychain_item_name: 479 | if not args.osx_keychain_item_account: 480 | raise Exception( 481 | "You must specify both name and account fields for osx keychain password items" 482 | ) 483 | else: 484 | if platform.system() != "Darwin": 485 | raise Exception("Keychain arguments are only supported on Mac OSX") 486 | try: 487 | with open(os.devnull, "w") as devnull: 488 | token = subprocess.check_output( 489 | [ 490 | "security", 491 | "find-generic-password", 492 | "-s", 493 | args.osx_keychain_item_name, 494 | "-a", 495 | args.osx_keychain_item_account, 496 | "-w", 497 | ], 498 | stderr=devnull, 499 | ).strip() 500 | token = token.decode("utf-8") 501 | auth = token + ":" + "x-oauth-basic" 502 | except subprocess.SubprocessError: 503 | raise Exception( 504 | "No password item matching the provided name and account could be found in the osx keychain." 
505 | ) 506 | elif args.osx_keychain_item_account: 507 | raise Exception( 508 | "You must specify both name and account fields for osx keychain password items" 509 | ) 510 | elif args.token_fine: 511 | if args.token_fine.startswith(FILE_URI_PREFIX): 512 | args.token_fine = read_file_contents(args.token_fine) 513 | 514 | if args.token_fine.startswith("github_pat_"): 515 | auth = args.token_fine 516 | else: 517 | raise Exception( 518 | "Fine-grained token supplied does not look like a GitHub PAT" 519 | ) 520 | elif args.token_classic: 521 | if args.token_classic.startswith(FILE_URI_PREFIX): 522 | args.token_classic = read_file_contents(args.token_classic) 523 | 524 | if not args.as_app: 525 | auth = args.token_classic + ":" + "x-oauth-basic" 526 | else: 527 | if not for_git_cli: 528 | auth = args.token_classic 529 | else: 530 | auth = "x-access-token:" + args.token_classic 531 | elif args.username: 532 | if not args.password: 533 | args.password = getpass.getpass() 534 | if encode: 535 | password = args.password 536 | else: 537 | password = urlquote(args.password) 538 | auth = args.username + ":" + password 539 | elif args.password: 540 | raise Exception("You must specify a username for basic auth") 541 | 542 | if not auth: 543 | return None 544 | 545 | if not encode or args.token_fine is not None: 546 | return auth 547 | 548 | return base64.b64encode(auth.encode("ascii")) 549 | 550 | 551 | def get_github_api_host(args): 552 | if args.github_host: 553 | host = args.github_host + "/api/v3" 554 | else: 555 | host = "api.github.com" 556 | 557 | return host 558 | 559 | 560 | def get_github_host(args): 561 | if args.github_host: 562 | host = args.github_host 563 | else: 564 | host = "github.com" 565 | 566 | return host 567 | 568 | 569 | def read_file_contents(file_uri): 570 | return open(file_uri[len(FILE_URI_PREFIX) :], "rt").readline().strip() 571 | 572 | 573 | def get_github_repo_url(args, repository): 574 | if repository.get("is_gist"): 575 | if args.prefer_ssh: 576 | # The git_pull_url value is always https for gists, so we need to transform it to ssh form 577 | repo_url = re.sub( 578 | r"^https?:\/\/(.+)\/(.+)\.git$", 579 | r"git@\1:\2.git", 580 | repository["git_pull_url"], 581 | ) 582 | repo_url = re.sub( 583 | r"^git@gist\.", "git@", repo_url 584 | ) # strip gist subdomain for better hostkey compatibility 585 | else: 586 | repo_url = repository["git_pull_url"] 587 | return repo_url 588 | 589 | if args.prefer_ssh: 590 | return repository["ssh_url"] 591 | 592 | auth = get_auth(args, encode=False, for_git_cli=True) 593 | if auth: 594 | repo_url = "https://{0}@{1}/{2}/{3}.git".format( 595 | auth if args.token_fine is None else "oauth2:" + auth, 596 | get_github_host(args), 597 | repository["owner"]["login"], 598 | repository["name"], 599 | ) 600 | else: 601 | repo_url = repository["clone_url"] 602 | 603 | return repo_url 604 | 605 | 606 | def retrieve_data_gen(args, template, query_args=None, single_request=False): 607 | auth = get_auth(args, encode=not args.as_app) 608 | query_args = get_query_args(query_args) 609 | per_page = 100 610 | next_url = None 611 | 612 | while True: 613 | if single_request: 614 | request_per_page = None 615 | else: 616 | request_per_page = per_page 617 | 618 | request = _construct_request( 619 | request_per_page, 620 | query_args, 621 | next_url or template, 622 | auth, 623 | as_app=args.as_app, 624 | fine=True if args.token_fine is not None else False, 625 | ) # noqa 626 | r, errors = _get_response(request, auth, next_url or template) 627 | 628 | status_code = 
int(r.getcode()) 629 | 630 | # Handle DMCA takedown (HTTP 451) - raise exception to skip entire repository 631 | if status_code == 451: 632 | dmca_url = None 633 | try: 634 | response_data = json.loads(r.read().decode("utf-8")) 635 | dmca_url = response_data.get("block", {}).get("html_url") 636 | except Exception: 637 | pass 638 | raise RepositoryUnavailableError( 639 | "Repository unavailable due to legal reasons (HTTP 451)", 640 | dmca_url=dmca_url, 641 | ) 642 | 643 | # Check if we got correct data 644 | try: 645 | response = json.loads(r.read().decode("utf-8")) 646 | except IncompleteRead: 647 | logger.warning("Incomplete read error detected") 648 | read_error = True 649 | except json.decoder.JSONDecodeError: 650 | logger.warning("JSON decode error detected") 651 | read_error = True 652 | except TimeoutError: 653 | logger.warning("Timeout error detected") 654 | read_error = True 655 | else: 656 | read_error = False 657 | 658 | # be gentle with the API request limit and throttle requests if remaining requests are getting low 659 | limit_remaining = int(r.headers.get("x-ratelimit-remaining", 0)) 660 | if args.throttle_limit and limit_remaining <= args.throttle_limit: 661 | logger.info( 662 | "API request limit hit: {} requests left, pausing further requests for {}s".format( 663 | limit_remaining, args.throttle_pause 664 | ) 665 | ) 666 | time.sleep(args.throttle_pause) 667 | 668 | retries = 0 669 | while retries < 3 and (status_code == 502 or read_error): 670 | logger.warning("API request failed. Retrying in 5 seconds") 671 | retries += 1 672 | time.sleep(5) 673 | request = _construct_request( 674 | request_per_page, 675 | query_args, 676 | next_url or template, 677 | auth, 678 | as_app=args.as_app, 679 | fine=True if args.token_fine is not None else False, 680 | ) # noqa 681 | r, errors = _get_response(request, auth, next_url or template) 682 | 683 | status_code = int(r.getcode()) 684 | try: 685 | response = json.loads(r.read().decode("utf-8")) 686 | read_error = False 687 | except IncompleteRead: 688 | logger.warning("Incomplete read error detected") 689 | read_error = True 690 | except json.decoder.JSONDecodeError: 691 | logger.warning("JSON decode error detected") 692 | read_error = True 693 | except TimeoutError: 694 | logger.warning("Timeout error detected") 695 | read_error = True 696 | 697 | if status_code != 200: 698 | template = "API request returned HTTP {0}: {1}" 699 | errors.append(template.format(status_code, r.reason)) 700 | raise Exception(", ".join(errors)) 701 | 702 | if read_error: 703 | template = "API request problem reading response for {0}" 704 | errors.append(template.format(request)) 705 | raise Exception(", ".join(errors)) 706 | 707 | if len(errors) == 0: 708 | if type(response) is list: 709 | for resp in response: 710 | yield resp 711 | # Parse Link header for next page URL (cursor-based pagination) 712 | link_header = r.headers.get("Link", "") 713 | next_url = None 714 | if link_header: 715 | # Parse Link header: <url>; rel="next" 716 | for link in link_header.split(","): 717 | if 'rel="next"' in link: 718 | next_url = link[link.find("<") + 1 : link.find(">")] 719 | break 720 | if not next_url: 721 | break 722 | elif type(response) is dict and single_request: 723 | yield response 724 | 725 | if len(errors) > 0: 726 | raise Exception(", ".join(errors)) 727 | 728 | if single_request: 729 | break 730 | 731 | 732 | def retrieve_data(args, template, query_args=None, single_request=False): 733 | return list(retrieve_data_gen(args, template, query_args, single_request)) 734 | 735 
| 736 | def get_query_args(query_args=None): 737 | if not query_args: 738 | query_args = {} 739 | return query_args 740 | 741 | 742 | def _get_response(request, auth, template): 743 | retry_timeout = 3 744 | errors = [] 745 | # We'll make requests in a loop so we can 746 | # delay and retry in the case of rate-limiting 747 | while True: 748 | should_continue = False 749 | try: 750 | r = urlopen(request, context=https_ctx) 751 | except HTTPError as exc: 752 | errors, should_continue = _request_http_error(exc, auth, errors) # noqa 753 | r = exc 754 | except URLError as e: 755 | logger.warning(e.reason) 756 | should_continue, retry_timeout = _request_url_error(template, retry_timeout) 757 | if not should_continue: 758 | raise 759 | except socket.error as e: 760 | logger.warning(e.strerror) 761 | should_continue, retry_timeout = _request_url_error(template, retry_timeout) 762 | if not should_continue: 763 | raise 764 | 765 | if should_continue: 766 | continue 767 | 768 | break 769 | return r, errors 770 | 771 | 772 | def _construct_request(per_page, query_args, template, auth, as_app=None, fine=False): 773 | # If template is already a full URL with query params (from Link header), use it directly 774 | if "?" in template and template.startswith("http"): 775 | request_url = template 776 | # Extract query string for logging 777 | querystring = template.split("?", 1)[1] 778 | else: 779 | # Build URL with query parameters 780 | all_query_args = {} 781 | if per_page: 782 | all_query_args["per_page"] = per_page 783 | if query_args: 784 | all_query_args.update(query_args) 785 | 786 | request_url = template 787 | if all_query_args: 788 | querystring = urlencode(all_query_args) 789 | request_url = template + "?" + querystring 790 | else: 791 | querystring = "" 792 | 793 | request = Request(request_url) 794 | if auth is not None: 795 | if not as_app: 796 | if fine: 797 | request.add_header("Authorization", "token " + auth) 798 | else: 799 | request.add_header("Authorization", "Basic ".encode("ascii") + auth) 800 | else: 801 | auth = auth.encode("ascii") 802 | request.add_header("Authorization", "token ".encode("ascii") + auth) 803 | 804 | log_url = template if "?" not in template else template.split("?")[0] 805 | if querystring: 806 | log_url += "?" + querystring 807 | logger.info("Requesting {}".format(log_url)) 808 | return request 809 | 810 | 811 | def _request_http_error(exc, auth, errors): 812 | # HTTPError behaves like a Response so we can 813 | # check the status code and headers to see exactly 814 | # what failed. 
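# For example, with x-ratelimit-reset=1700000600 and a current time of
# 1700000000, delta below is max(10, 600) = 600 seconds; a reset timestamp
# in the past (or a missing header) degenerates to the 10-second floor.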
815 | 816 | should_continue = False 817 | headers = exc.headers 818 | limit_remaining = int(headers.get("x-ratelimit-remaining", 0)) 819 | 820 | if exc.code == 403 and limit_remaining < 1: 821 | # The X-RateLimit-Reset header includes a 822 | # timestamp telling us when the limit will reset 823 | # so we can calculate how long to wait rather 824 | # than inefficiently polling: 825 | gm_now = calendar.timegm(time.gmtime()) 826 | reset = int(headers.get("x-ratelimit-reset", 0)) or gm_now 827 | # We'll never sleep for less than 10 seconds: 828 | delta = max(10, reset - gm_now) 829 | 830 | limit = headers.get("x-ratelimit-limit") 831 | logger.warning( 832 | "Exceeded rate limit of {} requests; waiting {} seconds to reset".format( 833 | limit, delta 834 | ) 835 | ) # noqa 836 | 837 | if auth is None: 838 | logger.info("Hint: Authenticate to raise your GitHub rate limit") 839 | 840 | time.sleep(delta) 841 | should_continue = True 842 | return errors, should_continue 843 | 844 | 845 | def _request_url_error(template, retry_timeout): 846 | # In case of a connection timing out, we can retry a few times, 847 | # but we won't crash; the rest of the backup can still proceed 848 | logger.info("'{}' timed out".format(template)) 849 | retry_timeout -= 1 850 | 851 | if retry_timeout >= 0: 852 | return True, retry_timeout 853 | 854 | raise Exception("'{}' timed out too many times, skipping!".format(template)) 855 | 856 | 857 | class S3HTTPRedirectHandler(HTTPRedirectHandler): 858 | """ 859 | A subclassed redirect handler for downloading GitHub assets from S3. 860 | 861 | urllib will add the Authorization header to the redirected request to S3, which will result in a 400, 862 | so we should remove said header on redirect. 863 | """ 864 | 865 | def redirect_request(self, req, fp, code, msg, headers, newurl): 866 | request = super(S3HTTPRedirectHandler, self).redirect_request( 867 | req, fp, code, msg, headers, newurl 868 | ) 869 | # Only delete Authorization header if it exists (attachments may not have it) 870 | if "Authorization" in request.headers: 871 | del request.headers["Authorization"] 872 | return request 873 | 874 | 875 | def download_file(url, path, auth, as_app=False, fine=False): 876 | # Skip downloading release assets if they already exist on disk so we don't redownload on every sync 877 | if os.path.exists(path): 878 | return 879 | 880 | request = _construct_request( 881 | per_page=None, 882 | query_args={}, 883 | template=url, 884 | auth=auth, 885 | as_app=as_app, 886 | fine=fine, 887 | ) 888 | request.add_header("Accept", "application/octet-stream") 889 | opener = build_opener(S3HTTPRedirectHandler) 890 | 891 | try: 892 | response = opener.open(request) 893 | 894 | chunk_size = 16 * 1024 895 | with open(path, "wb") as f: 896 | while True: 897 | chunk = response.read(chunk_size) 898 | if not chunk: 899 | break 900 | f.write(chunk) 901 | except HTTPError as exc: 902 | # Gracefully handle 404 responses (and others) when downloading from S3 903 | logger.warning( 904 | "Skipping download of asset {0} due to HTTPError: {1}".format( 905 | url, exc.reason 906 | ) 907 | ) 908 | except URLError as e: 909 | # Gracefully handle other URL errors 910 | logger.warning( 911 | "Skipping download of asset {0} due to URLError: {1}".format(url, e.reason) 912 | ) 913 | except socket.error as e: 914 | # Gracefully handle socket errors 915 | # TODO: Implement retry logic 916 | logger.warning( 917 | "Skipping download of asset {0} due to socket error: {1}".format( 918 | url, e.strerror 919 | ) 920 | ) 921 | 922 | 923 | def 
download_attachment_file(url, path, auth, as_app=False, fine=False): 924 | """Download attachment file directly (not via GitHub API). 925 | 926 | Similar to download_file() but for direct file URLs, not API endpoints. 927 | Attachment URLs (user-images, user-attachments) are direct downloads, 928 | not API endpoints, so we skip _construct_request() which adds API params. 929 | 930 | URL Format Support & Authentication Requirements: 931 | 932 | | URL Format | Auth Required | Notes | 933 | |----------------------------------------------|---------------|--------------------------| 934 | | github.com/user-attachments/assets/* | Private only | Modern format (2024+) | 935 | | github.com/user-attachments/files/* | Private only | Modern format (2024+) | 936 | | user-images.githubusercontent.com/* | No (public) | Legacy CDN, all eras | 937 | | private-user-images.githubusercontent.com/* | JWT in URL | Legacy private (5min) | 938 | | github.com/{owner}/{repo}/files/* | Repo filter | Old repo files | 939 | 940 | - Modern user-attachments: Requires GitHub token auth for private repos 941 | - Legacy public CDN: No auth needed/accepted (returns 400 with auth header) 942 | - Legacy private CDN: Uses JWT token embedded in URL, no GitHub token needed 943 | - Repo files: Filtered to current repository only during extraction 944 | 945 | Returns dict with metadata: 946 | - success: bool 947 | - http_status: int (200, 404, etc.) 948 | - content_type: str or None 949 | - original_filename: str or None (from Content-Disposition) 950 | - size_bytes: int or None 951 | - error: str or None 952 | """ 953 | import re 954 | from datetime import datetime, timezone 955 | 956 | metadata = { 957 | "url": url, 958 | "success": False, 959 | "http_status": None, 960 | "content_type": None, 961 | "original_filename": None, 962 | "size_bytes": None, 963 | "downloaded_at": datetime.now(timezone.utc).isoformat(), 964 | "error": None, 965 | } 966 | 967 | # Create simple request (no API query params) 968 | request = Request(url) 969 | request.add_header("Accept", "application/octet-stream") 970 | 971 | # Add authentication header only for modern github.com/user-attachments URLs 972 | # Legacy CDN URLs (user-images.githubusercontent.com) are public and don't need/accept auth 973 | # Private CDN URLs (private-user-images) use JWT tokens embedded in the URL 974 | if auth is not None and "github.com/user-attachments/" in url: 975 | if not as_app: 976 | if fine: 977 | # Fine-grained token: plain token with "token " prefix 978 | request.add_header("Authorization", "token " + auth) 979 | else: 980 | # Classic token: base64-encoded with "Basic " prefix 981 | request.add_header("Authorization", "Basic ".encode("ascii") + auth) 982 | else: 983 | # App authentication 984 | auth = auth.encode("ascii") 985 | request.add_header("Authorization", "token ".encode("ascii") + auth) 986 | 987 | # Reuse S3HTTPRedirectHandler from download_file() 988 | opener = build_opener(S3HTTPRedirectHandler) 989 | 990 | temp_path = path + ".temp" 991 | 992 | try: 993 | response = opener.open(request) 994 | metadata["http_status"] = response.getcode() 995 | 996 | # Extract Content-Type 997 | content_type = response.headers.get("Content-Type", "").split(";")[0].strip() 998 | if content_type: 999 | metadata["content_type"] = content_type 1000 | 1001 | # Extract original filename from Content-Disposition header 1002 | # Format: attachment; filename=example.mov or attachment;filename="example.mov" 1003 | content_disposition = response.headers.get("Content-Disposition", 
"") 1004 | if content_disposition: 1005 | # Match: filename=something or filename="something" or filename*=UTF-8''something 1006 | match = re.search(r'filename\*?=["\']?([^"\';\r\n]+)', content_disposition) 1007 | if match: 1008 | original_filename = match.group(1).strip() 1009 | # Handle RFC 5987 encoding: filename*=UTF-8''example.mov 1010 | if "UTF-8''" in original_filename: 1011 | original_filename = original_filename.split("UTF-8''")[1] 1012 | metadata["original_filename"] = original_filename 1013 | 1014 | # Fallback: Extract filename from final URL after redirects 1015 | # This handles user-attachments/assets URLs which redirect to S3 with filename.ext 1016 | if not metadata["original_filename"]: 1017 | from urllib.parse import urlparse, unquote 1018 | 1019 | final_url = response.geturl() 1020 | parsed = urlparse(final_url) 1021 | # Get filename from path (last component before query string) 1022 | path_parts = parsed.path.split("/") 1023 | if path_parts: 1024 | # URL might be encoded, decode it 1025 | filename_from_url = unquote(path_parts[-1]) 1026 | # Only use if it has an extension 1027 | if "." in filename_from_url: 1028 | metadata["original_filename"] = filename_from_url 1029 | 1030 | # Download file to temporary location 1031 | chunk_size = 16 * 1024 1032 | bytes_downloaded = 0 1033 | with open(temp_path, "wb") as f: 1034 | while True: 1035 | chunk = response.read(chunk_size) 1036 | if not chunk: 1037 | break 1038 | f.write(chunk) 1039 | bytes_downloaded += len(chunk) 1040 | 1041 | # Atomic rename to final location 1042 | os.replace(temp_path, path) 1043 | 1044 | metadata["size_bytes"] = bytes_downloaded 1045 | metadata["success"] = True 1046 | 1047 | except HTTPError as exc: 1048 | metadata["http_status"] = exc.code 1049 | metadata["error"] = str(exc.reason) 1050 | logger.warning( 1051 | "Skipping download of attachment {0} due to HTTPError: {1}".format( 1052 | url, exc.reason 1053 | ) 1054 | ) 1055 | except URLError as e: 1056 | metadata["error"] = str(e.reason) 1057 | logger.warning( 1058 | "Skipping download of attachment {0} due to URLError: {1}".format( 1059 | url, e.reason 1060 | ) 1061 | ) 1062 | except socket.error as e: 1063 | metadata["error"] = str(e.strerror) if hasattr(e, "strerror") else str(e) 1064 | logger.warning( 1065 | "Skipping download of attachment {0} due to socket error: {1}".format( 1066 | url, e.strerror if hasattr(e, "strerror") else str(e) 1067 | ) 1068 | ) 1069 | except Exception as e: 1070 | metadata["error"] = str(e) 1071 | logger.warning( 1072 | "Skipping download of attachment {0} due to error: {1}".format(url, str(e)) 1073 | ) 1074 | # Clean up temp file if it was partially created 1075 | if os.path.exists(temp_path): 1076 | try: 1077 | os.remove(temp_path) 1078 | except Exception: 1079 | pass 1080 | 1081 | return metadata 1082 | 1083 | 1084 | def extract_attachment_urls(item_data, issue_number=None, repository_full_name=None): 1085 | """Extract GitHub-hosted attachment URLs from issue/PR body and comments. 1086 | 1087 | What qualifies as an attachment? 1088 | There is no "attachment" concept in the GitHub API - it's a user behavior pattern 1089 | we've identified through analysis of real-world repositories. We define attachments as: 1090 | 1091 | - User-uploaded files hosted on GitHub's CDN domains 1092 | - Found outside of code blocks (not examples/documentation) 1093 | - Matches known GitHub attachment URL patterns 1094 | 1095 | This intentionally captures bare URLs pasted by users, not just markdown/HTML syntax. 
1096 | Some false positives (example URLs in documentation) may occur - these fail gracefully 1097 | with HTTP 404 and are logged in the manifest. 1098 | 1099 | Supported URL formats: 1100 | - Modern: github.com/user-attachments/{assets,files}/* 1101 | - Legacy: user-images.githubusercontent.com/* (including private-user-images) 1102 | - Repo files: github.com/{owner}/{repo}/files/* (filtered to current repo) 1103 | - Repo assets: github.com/{owner}/{repo}/assets/* (filtered to current repo) 1104 | 1105 | Repository filtering (repo files/assets only): 1106 | - Direct match: URL is for current repository → included 1107 | - Redirect match: URL redirects to current repository → included (handles renames/transfers) 1108 | - Different repo: URL is for different repository → excluded 1109 | 1110 | Code block filtering: 1111 | - Removes fenced code blocks (```) and inline code (`) before extraction 1112 | - Prevents extracting URLs from code examples and documentation snippets 1113 | 1114 | Args: 1115 | item_data: Issue or PR data dict 1116 | issue_number: Issue/PR number for logging 1117 | repository_full_name: Full repository name (owner/repo) for filtering repo-scoped URLs 1118 | """ 1119 | import re 1120 | 1121 | urls = [] 1122 | 1123 | # Define all GitHub attachment patterns 1124 | # Stop at markdown punctuation: whitespace, ), `, ", >, < 1125 | # Trailing sentence punctuation (. ! ? , ; : ' ") is stripped in post-processing 1126 | patterns = [ 1127 | r'https://github\.com/user-attachments/(?:assets|files)/[^\s\)`"<>]+', # Modern 1128 | r'https://(?:private-)?user-images\.githubusercontent\.com/[^\s\)`"<>]+', # Legacy CDN 1129 | ] 1130 | 1131 | # Add repo-scoped patterns (will be filtered by repository later) 1132 | # These patterns match ANY repo, then we filter to current repo with redirect checking 1133 | repo_files_pattern = r'https://github\.com/[^/]+/[^/]+/files/\d+/[^\s\)`"<>]+' 1134 | repo_assets_pattern = r'https://github\.com/[^/]+/[^/]+/assets/\d+/[^\s\)`"<>]+' 1135 | patterns.append(repo_files_pattern) 1136 | patterns.append(repo_assets_pattern) 1137 | 1138 | def clean_url(url): 1139 | """Remove trailing sentence and markdown punctuation that's not part of the URL.""" 1140 | return url.rstrip(".!?,;:'\")") 1141 | 1142 | def remove_code_blocks(text): 1143 | """Remove markdown code blocks (fenced and inline) from text. 1144 | 1145 | This prevents extracting URLs from code examples like: 1146 | - Fenced code blocks: ```code``` 1147 | - Inline code: `code` 1148 | """ 1149 | # Remove fenced code blocks first (```...```) 1150 | # DOTALL flag makes . match newlines 1151 | text = re.sub(r"```.*?```", "", text, flags=re.DOTALL) 1152 | 1153 | # Remove inline code (`...`) 1154 | # Non-greedy match between backticks 1155 | text = re.sub(r"`[^`]*`", "", text) 1156 | 1157 | return text 1158 | 1159 | def is_repo_scoped_url(url): 1160 | """Check if URL is a repo-scoped attachment (files or assets).""" 1161 | return bool( 1162 | re.match(r"https://github\.com/[^/]+/[^/]+/(?:files|assets)/\d+/", url) 1163 | ) 1164 | 1165 | def check_redirect_to_current_repo(url, current_repo): 1166 | """Check if URL redirects to current repository. 1167 | 1168 | Returns True if: 1169 | - URL is already for current repo 1170 | - URL redirects (301/302) to current repo (handles renames/transfers) 1171 | 1172 | Returns False otherwise (URL is for a different repo). 
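Example: after a repository transfer, an old github.com/{old-owner}/{repo}/files/... link redirects to the new owner; if that redirect target matches current_repo, the URL is kept.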
1173 | """ 1174 | # Extract owner/repo from URL 1175 | match = re.match(r"https://github\.com/([^/]+)/([^/]+)/", url) 1176 | if not match: 1177 | return False 1178 | 1179 | url_owner, url_repo = match.groups() 1180 | url_repo_full = f"{url_owner}/{url_repo}" 1181 | 1182 | # Direct match - no need to check redirect 1183 | if url_repo_full.lower() == current_repo.lower(): 1184 | return True 1185 | 1186 | # Different repo - check if it redirects to current repo 1187 | # This handles repository transfers and renames 1188 | try: 1189 | import urllib.request 1190 | import urllib.error 1191 | 1192 | # Make HEAD request with redirect following disabled 1193 | # We need to manually handle redirects to see the Location header 1194 | request = urllib.request.Request(url, method="HEAD") 1195 | request.add_header("User-Agent", "python-github-backup") 1196 | 1197 | # Create opener that does NOT follow redirects 1198 | class NoRedirectHandler(urllib.request.HTTPRedirectHandler): 1199 | def redirect_request(self, req, fp, code, msg, headers, newurl): 1200 | return None # Don't follow redirects 1201 | 1202 | opener = urllib.request.build_opener(NoRedirectHandler) 1203 | 1204 | try: 1205 | _ = opener.open(request, timeout=10) 1206 | # Got 200 - URL works as-is but for different repo 1207 | return False 1208 | except urllib.error.HTTPError as e: 1209 | # Check if it's a redirect (301, 302, 307, 308) 1210 | if e.code in (301, 302, 307, 308): 1211 | location = e.headers.get("Location", "") 1212 | # Check if redirect points to current repo 1213 | if location: 1214 | redirect_match = re.match( 1215 | r"https://github\.com/([^/]+)/([^/]+)/", location 1216 | ) 1217 | if redirect_match: 1218 | redirect_owner, redirect_repo = redirect_match.groups() 1219 | redirect_repo_full = f"{redirect_owner}/{redirect_repo}" 1220 | return redirect_repo_full.lower() == current_repo.lower() 1221 | return False 1222 | except Exception: 1223 | # On any error (timeout, network issue, etc.), be conservative 1224 | # and exclude the URL to avoid downloading from wrong repos 1225 | return False 1226 | 1227 | # Extract from body 1228 | body = item_data.get("body") or "" 1229 | # Remove code blocks before searching for URLs 1230 | body_cleaned = remove_code_blocks(body) 1231 | for pattern in patterns: 1232 | found_urls = re.findall(pattern, body_cleaned) 1233 | urls.extend([clean_url(url) for url in found_urls]) 1234 | 1235 | # Extract from issue comments 1236 | if "comment_data" in item_data: 1237 | for comment in item_data["comment_data"]: 1238 | comment_body = comment.get("body") or "" 1239 | # Remove code blocks before searching for URLs 1240 | comment_cleaned = remove_code_blocks(comment_body) 1241 | for pattern in patterns: 1242 | found_urls = re.findall(pattern, comment_cleaned) 1243 | urls.extend([clean_url(url) for url in found_urls]) 1244 | 1245 | # Extract from PR regular comments 1246 | if "comment_regular_data" in item_data: 1247 | for comment in item_data["comment_regular_data"]: 1248 | comment_body = comment.get("body") or "" 1249 | # Remove code blocks before searching for URLs 1250 | comment_cleaned = remove_code_blocks(comment_body) 1251 | for pattern in patterns: 1252 | found_urls = re.findall(pattern, comment_cleaned) 1253 | urls.extend([clean_url(url) for url in found_urls]) 1254 | 1255 | regex_urls = list(set(urls)) # dedupe 1256 | 1257 | # Filter repo-scoped URLs to current repository only 1258 | # This handles repository transfers/renames via redirect checking 1259 | if repository_full_name: 1260 | filtered_urls = 
[] 1261 | for url in regex_urls: 1262 | if is_repo_scoped_url(url): 1263 | # Check if URL belongs to current repo (or redirects to it) 1264 | if check_redirect_to_current_repo(url, repository_full_name): 1265 | filtered_urls.append(url) 1266 | # else: skip URLs from other repositories 1267 | else: 1268 | # Non-repo-scoped URLs (user-attachments, CDN) - always include 1269 | filtered_urls.append(url) 1270 | regex_urls = filtered_urls 1271 | 1272 | return regex_urls 1273 | 1274 | 1275 | def get_attachment_filename(url): 1276 | """Get filename from attachment URL, handling all GitHub formats. 1277 | 1278 | Formats: 1279 | - github.com/user-attachments/assets/{uuid} → uuid (add extension later) 1280 | - github.com/user-attachments/files/{id}/{filename} → filename 1281 | - github.com/{owner}/{repo}/files/{id}/{filename} → filename 1282 | - user-images.githubusercontent.com/{user}/{hash}.{ext} → hash.ext 1283 | - private-user-images.githubusercontent.com/...?jwt=... → extract from path 1284 | """ 1285 | from urllib.parse import urlparse 1286 | 1287 | parsed = urlparse(url) 1288 | path_parts = parsed.path.split("/") 1289 | 1290 | # Modern: /user-attachments/files/{id}/{filename} 1291 | if "user-attachments/files" in parsed.path: 1292 | return path_parts[-1] 1293 | 1294 | # Modern: /user-attachments/assets/{uuid} 1295 | elif "user-attachments/assets" in parsed.path: 1296 | return path_parts[-1] # extension added later via detect_and_add_extension 1297 | 1298 | # Repo files: /{owner}/{repo}/files/{id}/{filename} 1299 | elif "/files/" in parsed.path and len(path_parts) >= 2: 1300 | return path_parts[-1] 1301 | 1302 | # Legacy: user-images.githubusercontent.com/{user}/{hash-with-ext} 1303 | elif "githubusercontent.com" in parsed.netloc: 1304 | return path_parts[-1] # Already has extension usually 1305 | 1306 | # Fallback: use last path component 1307 | return path_parts[-1] if path_parts[-1] else "unknown_attachment" 1308 | 1309 | 1310 | def resolve_filename_collision(filepath): 1311 | """Resolve filename collisions using counter suffix pattern. 1312 | 1313 | If filepath exists, returns a new filepath with counter suffix. 1314 | Pattern: report.pdf → report_1.pdf → report_2.pdf 1315 | 1316 | Also protects against manifest.json collisions by treating it as reserved. 1317 | 1318 | Args: 1319 | filepath: Full path to file that might exist 1320 | 1321 | Returns: 1322 | filepath that doesn't collide (may be same as input if no collision) 1323 | """ 1324 | directory = os.path.dirname(filepath) 1325 | filename = os.path.basename(filepath) 1326 | 1327 | # Protect manifest.json - it's a reserved filename 1328 | if filename == "manifest.json": 1329 | name, ext = os.path.splitext(filename) 1330 | counter = 1 1331 | while True: 1332 | new_filename = f"{name}_{counter}{ext}" 1333 | new_filepath = os.path.join(directory, new_filename) 1334 | if not os.path.exists(new_filepath): 1335 | return new_filepath 1336 | counter += 1 1337 | 1338 | if not os.path.exists(filepath): 1339 | return filepath 1340 | 1341 | name, ext = os.path.splitext(filename) 1342 | 1343 | counter = 1 1344 | while True: 1345 | new_filename = f"{name}_{counter}{ext}" 1346 | new_filepath = os.path.join(directory, new_filename) 1347 | if not os.path.exists(new_filepath): 1348 | return new_filepath 1349 | counter += 1 1350 | 1351 | 1352 | def download_attachments( 1353 | args, item_cwd, item_data, number, repository, item_type="issue" 1354 | ): 1355 | """Download user-attachments from issue/PR body and comments with manifest. 
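A manifest.json inside attachments/{number}/ records every download attempt; URLs that already succeeded, or that failed permanently (HTTP 404/410/451), are skipped on later runs, while transient failures (5xx, timeouts) are retried.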
1356 | 1357 | Args: 1358 | args: Command line arguments 1359 | item_cwd: Working directory (issue_cwd or pulls_cwd) 1360 | item_data: Issue or PR data dict 1361 | number: Issue or PR number 1362 | repository: Repository dict 1363 | item_type: "issue" or "pull" for logging/manifest 1364 | """ 1365 | import json 1366 | from datetime import datetime, timezone 1367 | 1368 | item_type_display = "issue" if item_type == "issue" else "pull request" 1369 | 1370 | urls = extract_attachment_urls( 1371 | item_data, issue_number=number, repository_full_name=repository["full_name"] 1372 | ) 1373 | if not urls: 1374 | return 1375 | 1376 | attachments_dir = os.path.join(item_cwd, "attachments", str(number)) 1377 | manifest_path = os.path.join(attachments_dir, "manifest.json") 1378 | 1379 | # Load existing manifest to prevent duplicate downloads 1380 | existing_urls = set() 1381 | existing_metadata = [] 1382 | if os.path.exists(manifest_path): 1383 | try: 1384 | with open(manifest_path, "r") as f: 1385 | existing_manifest = json.load(f) 1386 | all_metadata = existing_manifest.get("attachments", []) 1387 | # Only skip URLs that were successfully downloaded OR failed with permanent errors 1388 | # Retry transient failures (5xx, timeouts, network errors) 1389 | for item in all_metadata: 1390 | if item.get("success"): 1391 | existing_urls.add(item["url"]) 1392 | else: 1393 | # Check if this is a permanent failure (don't retry) or transient (retry) 1394 | http_status = item.get("http_status") 1395 | if http_status in [404, 410, 451]: 1396 | # Permanent failures - don't retry 1397 | existing_urls.add(item["url"]) 1398 | # Transient failures (5xx, auth errors, timeouts) will be retried 1399 | existing_metadata = all_metadata 1400 | except (json.JSONDecodeError, IOError): 1401 | # If manifest is corrupted, re-download everything 1402 | logger.warning( 1403 | "Corrupted manifest for {0} #{1}, will re-download".format( 1404 | item_type_display, number 1405 | ) 1406 | ) 1407 | existing_urls = set() 1408 | existing_metadata = [] 1409 | 1410 | # Filter to only new URLs 1411 | new_urls = [url for url in urls if url not in existing_urls] 1412 | 1413 | if not new_urls and existing_urls: 1414 | logger.debug( 1415 | "Skipping attachments for {0} #{1} (all {2} already downloaded)".format( 1416 | item_type_display, number, len(urls) 1417 | ) 1418 | ) 1419 | return 1420 | 1421 | if new_urls: 1422 | logger.info( 1423 | "Downloading {0} new attachment(s) for {1} #{2}".format( 1424 | len(new_urls), item_type_display, number 1425 | ) 1426 | ) 1427 | 1428 | mkdir_p(item_cwd, attachments_dir) 1429 | 1430 | # Collect metadata for manifest (start with existing) 1431 | attachment_metadata_list = existing_metadata[:] 1432 | 1433 | for url in new_urls: 1434 | filename = get_attachment_filename(url) 1435 | filepath = os.path.join(attachments_dir, filename) 1436 | 1437 | # Download and get metadata 1438 | metadata = download_attachment_file( 1439 | url, 1440 | filepath, 1441 | get_auth(args, encode=not args.as_app), 1442 | as_app=args.as_app, 1443 | fine=args.token_fine is not None, 1444 | ) 1445 | 1446 | # If download succeeded but we got an extension from Content-Disposition, 1447 | # we may need to rename the file to add the extension 1448 | if metadata["success"] and metadata.get("original_filename"): 1449 | original_ext = os.path.splitext(metadata["original_filename"])[1] 1450 | current_ext = os.path.splitext(filepath)[1] 1451 | 1452 | # Add extension if not present 1453 | if original_ext and current_ext != original_ext: 1454 | 
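# user-attachments/assets URLs save under a bare UUID with no extension,
# so the extension learned from Content-Disposition (or the redirect
# target URL) is appended here to give the saved file a usable suffix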
final_filepath = filepath + original_ext 1455 | # Check for collision again with new extension 1456 | final_filepath = resolve_filename_collision(final_filepath) 1457 | logger.debug( 1458 | "Adding extension {0} to {1}".format(original_ext, filepath) 1459 | ) 1460 | 1461 | # Rename to add extension (already atomic from download) 1462 | try: 1463 | os.replace(filepath, final_filepath) 1464 | metadata["saved_as"] = os.path.basename(final_filepath) 1465 | except Exception as e: 1466 | logger.warning( 1467 | "Could not add extension to {0}: {1}".format(filepath, str(e)) 1468 | ) 1469 | metadata["saved_as"] = os.path.basename(filepath) 1470 | else: 1471 | metadata["saved_as"] = os.path.basename(filepath) 1472 | elif metadata["success"]: 1473 | metadata["saved_as"] = os.path.basename(filepath) 1474 | else: 1475 | metadata["saved_as"] = None 1476 | 1477 | attachment_metadata_list.append(metadata) 1478 | 1479 | # Write manifest 1480 | if attachment_metadata_list: 1481 | manifest = { 1482 | "issue_number": number, 1483 | "issue_type": item_type, 1484 | "repository": ( 1485 | f"{args.user}/{args.repository}" 1486 | if hasattr(args, "repository") and args.repository 1487 | else args.user 1488 | ), 1489 | "manifest_updated_at": datetime.now(timezone.utc).isoformat(), 1490 | "attachments": attachment_metadata_list, 1491 | } 1492 | 1493 | manifest_path = os.path.join(attachments_dir, "manifest.json") 1494 | with open(manifest_path + ".temp", "w") as f: 1495 | json.dump(manifest, f, indent=2) 1496 | os.replace(manifest_path + ".temp", manifest_path) # Atomic write 1497 | logger.debug( 1498 | "Wrote manifest for {0} #{1}: {2} attachments".format( 1499 | item_type_display, number, len(attachment_metadata_list) 1500 | ) 1501 | ) 1502 | 1503 | 1504 | def get_authenticated_user(args): 1505 | template = "https://{0}/user".format(get_github_api_host(args)) 1506 | data = retrieve_data(args, template, single_request=True) 1507 | return data[0] 1508 | 1509 | 1510 | def check_git_lfs_install(): 1511 | exit_code = subprocess.call(["git", "lfs", "version"]) 1512 | if exit_code != 0: 1513 | raise Exception( 1514 | "The argument --lfs requires you to have Git LFS installed.\nYou can get it from https://git-lfs.github.com." 
1515 | ) 1516 | 1517 | 1518 | def retrieve_repositories(args, authenticated_user): 1519 | logger.info("Retrieving repositories") 1520 | single_request = False 1521 | if args.user == authenticated_user["login"]: 1522 | # we must use the /user/repos API to be able to access private repos 1523 | template = "https://{0}/user/repos".format(get_github_api_host(args)) 1524 | else: 1525 | if args.private and not args.organization: 1526 | logger.warning( 1527 | "Authenticated user is different from user being backed up, thus private repositories cannot be accessed" 1528 | ) 1529 | template = "https://{0}/users/{1}/repos".format( 1530 | get_github_api_host(args), args.user 1531 | ) 1532 | 1533 | if args.organization: 1534 | template = "https://{0}/orgs/{1}/repos".format( 1535 | get_github_api_host(args), args.user 1536 | ) 1537 | 1538 | if args.repository: 1539 | if "/" in args.repository: 1540 | repo_path = args.repository 1541 | else: 1542 | repo_path = "{0}/{1}".format(args.user, args.repository) 1543 | single_request = True 1544 | template = "https://{0}/repos/{1}".format(get_github_api_host(args), repo_path) 1545 | 1546 | repos = retrieve_data(args, template, single_request=single_request) 1547 | 1548 | if args.all_starred: 1549 | starred_template = "https://{0}/users/{1}/starred".format( 1550 | get_github_api_host(args), args.user 1551 | ) 1552 | starred_repos = retrieve_data(args, starred_template, single_request=False) 1553 | # flag each repo as starred for downstream processing 1554 | for item in starred_repos: 1555 | item.update({"is_starred": True}) 1556 | repos.extend(starred_repos) 1557 | 1558 | if args.include_gists: 1559 | gists_template = "https://{0}/users/{1}/gists".format( 1560 | get_github_api_host(args), args.user 1561 | ) 1562 | gists = retrieve_data(args, gists_template, single_request=False) 1563 | # flag each repo as a gist for downstream processing 1564 | for item in gists: 1565 | item.update({"is_gist": True}) 1566 | repos.extend(gists) 1567 | 1568 | if args.include_starred_gists: 1569 | if ( 1570 | not authenticated_user.get("login") 1571 | or args.user.lower() != authenticated_user["login"].lower() 1572 | ): 1573 | logger.warning( 1574 | "Cannot retrieve starred gists for '%s'. 
GitHub only allows access to the authenticated user's starred gists.", 1575 | args.user, 1576 | ) 1577 | else: 1578 | starred_gists_template = "https://{0}/gists/starred".format( 1579 | get_github_api_host(args) 1580 | ) 1581 | starred_gists = retrieve_data( 1582 | args, starred_gists_template, single_request=False 1583 | ) 1584 | # flag each repo as a starred gist for downstream processing 1585 | for item in starred_gists: 1586 | item.update({"is_gist": True, "is_starred": True}) 1587 | repos.extend(starred_gists) 1588 | 1589 | return repos 1590 | 1591 | 1592 | def filter_repositories(args, unfiltered_repositories): 1593 | if args.repository: 1594 | return unfiltered_repositories 1595 | logger.info("Filtering repositories") 1596 | 1597 | repositories = [] 1598 | for r in unfiltered_repositories: 1599 | # gists can be anonymous, so need to safely check owner 1600 | # Use case-insensitive comparison to match GitHub's case-insensitive username behavior 1601 | owner_login = r.get("owner", {}).get("login", "") 1602 | if owner_login.lower() == args.user.lower() or r.get("is_starred"): 1603 | repositories.append(r) 1604 | 1605 | name_regex = None 1606 | if args.name_regex: 1607 | name_regex = re.compile(args.name_regex) 1608 | 1609 | languages = None 1610 | if args.languages: 1611 | languages = [x.lower() for x in args.languages] 1612 | 1613 | if not args.fork: 1614 | repositories = [r for r in repositories if not r.get("fork")] 1615 | if not args.private: 1616 | repositories = [ 1617 | r for r in repositories if not r.get("private") or r.get("public") 1618 | ] 1619 | if languages: 1620 | repositories = [ 1621 | r 1622 | for r in repositories 1623 | if r.get("language") and r.get("language").lower() in languages 1624 | ] # noqa 1625 | if name_regex: 1626 | repositories = [ 1627 | r for r in repositories if "name" not in r or name_regex.match(r["name"]) 1628 | ] 1629 | if args.skip_archived: 1630 | repositories = [r for r in repositories if not r.get("archived")] 1631 | if args.exclude: 1632 | repositories = [ 1633 | r for r in repositories if "name" not in r or r["name"] not in args.exclude 1634 | ] 1635 | 1636 | return repositories 1637 | 1638 | 1639 | def backup_repositories(args, output_directory, repositories): 1640 | logger.info("Backing up repositories") 1641 | repos_template = "https://{0}/repos".format(get_github_api_host(args)) 1642 | 1643 | if args.incremental: 1644 | last_update_path = os.path.join(output_directory, "last_update") 1645 | if os.path.exists(last_update_path): 1646 | args.since = open(last_update_path).read().strip() 1647 | else: 1648 | args.since = None 1649 | else: 1650 | args.since = None 1651 | 1652 | last_update = "0000-00-00T00:00:00Z" 1653 | for repository in repositories: 1654 | if "updated_at" in repository and repository["updated_at"] > last_update: 1655 | last_update = repository["updated_at"] 1656 | elif "pushed_at" in repository and repository["pushed_at"] > last_update: 1657 | last_update = repository["pushed_at"] 1658 | 1659 | if repository.get("is_gist"): 1660 | repo_cwd = os.path.join(output_directory, "gists", repository["id"]) 1661 | elif repository.get("is_starred"): 1662 | # put starred repos in -o/starred/${owner}/${repo} to prevent collision of 1663 | # any repositories with the same name 1664 | repo_cwd = os.path.join( 1665 | output_directory, 1666 | "starred", 1667 | repository["owner"]["login"], 1668 | repository["name"], 1669 | ) 1670 | else: 1671 | repo_cwd = os.path.join( 1672 | output_directory, "repositories", repository["name"] 1673 | ) 
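# resulting layout under the output directory:
#   gists/{gist id}/, starred/{owner}/{repo}/, repositories/{repo name}/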
1674 | 1675 | repo_dir = os.path.join(repo_cwd, "repository") 1676 | repo_url = get_github_repo_url(args, repository) 1677 | 1678 | include_gists = args.include_gists or args.include_starred_gists 1679 | include_starred = args.all_starred and repository.get("is_starred") 1680 | if ( 1681 | (args.include_repository or args.include_everything) 1682 | or (include_gists and repository.get("is_gist")) 1683 | or include_starred 1684 | ): 1685 | repo_name = ( 1686 | repository.get("name") 1687 | if not repository.get("is_gist") 1688 | else repository.get("id") 1689 | ) 1690 | fetch_repository( 1691 | repo_name, 1692 | repo_url, 1693 | repo_dir, 1694 | skip_existing=args.skip_existing, 1695 | bare_clone=args.bare_clone, 1696 | lfs_clone=args.lfs_clone, 1697 | no_prune=args.no_prune, 1698 | ) 1699 | 1700 | if repository.get("is_gist"): 1701 | # dump gist information to a file as well 1702 | output_file = "{0}/gist.json".format(repo_cwd) 1703 | with codecs.open(output_file, "w", encoding="utf-8") as f: 1704 | json_dump(repository, f) 1705 | 1706 | continue # don't try to back up anything else for a gist; it doesn't exist 1707 | 1708 | try: 1709 | download_wiki = args.include_wiki or args.include_everything 1710 | if repository["has_wiki"] and download_wiki: 1711 | fetch_repository( 1712 | repository["name"], 1713 | repo_url.replace(".git", ".wiki.git"), 1714 | os.path.join(repo_cwd, "wiki"), 1715 | skip_existing=args.skip_existing, 1716 | bare_clone=args.bare_clone, 1717 | lfs_clone=args.lfs_clone, 1718 | no_prune=args.no_prune, 1719 | ) 1720 | if args.include_issues or args.include_everything: 1721 | backup_issues(args, repo_cwd, repository, repos_template) 1722 | 1723 | if args.include_pulls or args.include_everything: 1724 | backup_pulls(args, repo_cwd, repository, repos_template) 1725 | 1726 | if args.include_milestones or args.include_everything: 1727 | backup_milestones(args, repo_cwd, repository, repos_template) 1728 | 1729 | if args.include_labels or args.include_everything: 1730 | backup_labels(args, repo_cwd, repository, repos_template) 1731 | 1732 | if args.include_hooks or args.include_everything: 1733 | backup_hooks(args, repo_cwd, repository, repos_template) 1734 | 1735 | if args.include_releases or args.include_everything: 1736 | backup_releases( 1737 | args, 1738 | repo_cwd, 1739 | repository, 1740 | repos_template, 1741 | include_assets=args.include_assets or args.include_everything, 1742 | ) 1743 | except RepositoryUnavailableError as e: 1744 | logger.warning( 1745 | f"Repository {repository['full_name']} is unavailable (HTTP 451)" 1746 | ) 1747 | if e.dmca_url: 1748 | logger.warning(f"DMCA notice: {e.dmca_url}") 1749 | logger.info(f"Skipping remaining resources for {repository['full_name']}") 1750 | continue 1751 | 1752 | if args.incremental: 1753 | if last_update == "0000-00-00T00:00:00Z": 1754 | last_update = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())  # UTC, matching the trailing "Z" 1755 | 1756 | open(last_update_path, "w").write(last_update) 1757 | 1758 | 1759 | def backup_issues(args, repo_cwd, repository, repos_template): 1760 | has_issues_dir = os.path.isdir("{0}/issues/.git".format(repo_cwd)) 1761 | if args.skip_existing and has_issues_dir: 1762 | return 1763 | 1764 | logger.info("Retrieving {0} issues".format(repository["full_name"])) 1765 | issue_cwd = os.path.join(repo_cwd, "issues") 1766 | mkdir_p(repo_cwd, issue_cwd) 1767 | 1768 | issues = {} 1769 | issues_skipped = 0 1770 | issues_skipped_message = "" 1771 | _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) 
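# the issues API also returns pull requests; when PRs are backed up
# separately (see should_include_pulls below) they are skipped here so
# each item is written only once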
1772 | 1773 | should_include_pulls = args.include_pulls or args.include_everything 1774 | issue_states = ["open", "closed"] 1775 | for issue_state in issue_states: 1776 | query_args = {"filter": "all", "state": issue_state} 1777 | if args.since: 1778 | query_args["since"] = args.since 1779 | 1780 | _issues = retrieve_data(args, _issue_template, query_args=query_args) 1781 | for issue in _issues: 1782 | # skip pull requests which are also returned as issues 1783 | # if retrieving pull requests is requested as well 1784 | if "pull_request" in issue and should_include_pulls: 1785 | issues_skipped += 1 1786 | continue 1787 | 1788 | issues[issue["number"]] = issue 1789 | 1790 | if issues_skipped: 1791 | issues_skipped_message = " (skipped {0} pull requests)".format(issues_skipped) 1792 | 1793 | logger.info( 1794 | "Saving {0} issues to disk{1}".format( 1795 | len(list(issues.keys())), issues_skipped_message 1796 | ) 1797 | ) 1798 | comments_template = _issue_template + "/{0}/comments" 1799 | events_template = _issue_template + "/{0}/events" 1800 | for number, issue in list(issues.items()): 1801 | issue_file = "{0}/{1}.json".format(issue_cwd, number) 1802 | if args.incremental_by_files and os.path.isfile(issue_file): 1803 | modified = os.path.getmtime(issue_file) 1804 | modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") 1805 | if modified > issue["updated_at"]: 1806 | logger.info( 1807 | "Skipping issue {0} because it wasn't modified since last backup".format( 1808 | number 1809 | ) 1810 | ) 1811 | continue 1812 | 1813 | if args.include_issue_comments or args.include_everything: 1814 | template = comments_template.format(number) 1815 | issues[number]["comment_data"] = retrieve_data(args, template) 1816 | if args.include_issue_events or args.include_everything: 1817 | template = events_template.format(number) 1818 | issues[number]["event_data"] = retrieve_data(args, template) 1819 | if args.include_attachments: 1820 | download_attachments( 1821 | args, issue_cwd, issues[number], number, repository, item_type="issue" 1822 | ) 1823 | 1824 | with codecs.open(issue_file + ".temp", "w", encoding="utf-8") as f: 1825 | json_dump(issue, f) 1826 | os.replace(issue_file + ".temp", issue_file) # Atomic write 1827 | 1828 | 1829 | def backup_pulls(args, repo_cwd, repository, repos_template): 1830 | has_pulls_dir = os.path.isdir("{0}/pulls/.git".format(repo_cwd)) 1831 | if args.skip_existing and has_pulls_dir: 1832 | return 1833 | 1834 | logger.info("Retrieving {0} pull requests".format(repository["full_name"])) # noqa 1835 | pulls_cwd = os.path.join(repo_cwd, "pulls") 1836 | mkdir_p(repo_cwd, pulls_cwd) 1837 | 1838 | pulls = {} 1839 | _pulls_template = "{0}/{1}/pulls".format(repos_template, repository["full_name"]) 1840 | _issue_template = "{0}/{1}/issues".format(repos_template, repository["full_name"]) 1841 | query_args = { 1842 | "filter": "all", 1843 | "state": "all", 1844 | "sort": "updated", 1845 | "direction": "desc", 1846 | } 1847 | 1848 | if not args.include_pull_details: 1849 | pull_states = ["open", "closed"] 1850 | for pull_state in pull_states: 1851 | query_args["state"] = pull_state 1852 | _pulls = retrieve_data_gen(args, _pulls_template, query_args=query_args) 1853 | for pull in _pulls: 1854 | if args.since and pull["updated_at"] < args.since: 1855 | break 1856 | if not args.since or pull["updated_at"] >= args.since: 1857 | pulls[pull["number"]] = pull 1858 | else: 1859 | _pulls = retrieve_data_gen(args, _pulls_template, query_args=query_args) 1860 | for pull in _pulls: 
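# results are sorted by updated_at in descending order, so as soon as
# one pull request is older than --since, every later one is too and
# pagination can stop early via the break below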
1861 | if args.since and pull["updated_at"] < args.since: 1862 | break 1863 | if not args.since or pull["updated_at"] >= args.since: 1864 | pulls[pull["number"]] = retrieve_data( 1865 | args, 1866 | _pulls_template + "/{}".format(pull["number"]), 1867 | single_request=True, 1868 | )[0] 1869 | 1870 | logger.info("Saving {0} pull requests to disk".format(len(list(pulls.keys())))) 1871 | # Comments from pulls API are only _review_ comments 1872 | # regular comments need to be fetched via issue API. 1873 | # For backwards compatibility with versions <= 0.41.0 1874 | # keep name "comment_data" for review comments 1875 | comments_regular_template = _issue_template + "/{0}/comments" 1876 | comments_template = _pulls_template + "/{0}/comments" 1877 | commits_template = _pulls_template + "/{0}/commits" 1878 | for number, pull in list(pulls.items()): 1879 | pull_file = "{0}/{1}.json".format(pulls_cwd, number) 1880 | if args.incremental_by_files and os.path.isfile(pull_file): 1881 | modified = os.path.getmtime(pull_file) 1882 | modified = datetime.fromtimestamp(modified).strftime("%Y-%m-%dT%H:%M:%SZ") 1883 | if modified > pull["updated_at"]: 1884 | logger.info( 1885 | "Skipping pull request {0} because it wasn't modified since last backup".format( 1886 | number 1887 | ) 1888 | ) 1889 | continue 1890 | if args.include_pull_comments or args.include_everything: 1891 | template = comments_regular_template.format(number) 1892 | pulls[number]["comment_regular_data"] = retrieve_data(args, template) 1893 | template = comments_template.format(number) 1894 | pulls[number]["comment_data"] = retrieve_data(args, template) 1895 | if args.include_pull_commits or args.include_everything: 1896 | template = commits_template.format(number) 1897 | pulls[number]["commit_data"] = retrieve_data(args, template) 1898 | if args.include_attachments: 1899 | download_attachments( 1900 | args, pulls_cwd, pulls[number], number, repository, item_type="pull" 1901 | ) 1902 | 1903 | with codecs.open(pull_file + ".temp", "w", encoding="utf-8") as f: 1904 | json_dump(pull, f) 1905 | os.replace(pull_file + ".temp", pull_file) # Atomic write 1906 | 1907 | 1908 | def backup_milestones(args, repo_cwd, repository, repos_template): 1909 | milestone_cwd = os.path.join(repo_cwd, "milestones") 1910 | if args.skip_existing and os.path.isdir(milestone_cwd): 1911 | return 1912 | 1913 | logger.info("Retrieving {0} milestones".format(repository["full_name"])) 1914 | mkdir_p(repo_cwd, milestone_cwd) 1915 | 1916 | template = "{0}/{1}/milestones".format(repos_template, repository["full_name"]) 1917 | 1918 | query_args = {"state": "all"} 1919 | 1920 | _milestones = retrieve_data(args, template, query_args=query_args) 1921 | 1922 | milestones = {} 1923 | for milestone in _milestones: 1924 | milestones[milestone["number"]] = milestone 1925 | 1926 | written_count = 0 1927 | for number, milestone in list(milestones.items()): 1928 | milestone_file = "{0}/{1}.json".format(milestone_cwd, number) 1929 | if json_dump_if_changed(milestone, milestone_file): 1930 | written_count += 1 1931 | 1932 | total = len(milestones) 1933 | if written_count == total: 1934 | logger.info("Saved {0} milestones to disk".format(total)) 1935 | elif written_count == 0: 1936 | logger.info("{0} milestones unchanged, skipped write".format(total)) 1937 | else: 1938 | logger.info( 1939 | "Saved {0} of {1} milestones to disk ({2} unchanged)".format( 1940 | written_count, total, total - written_count 1941 | ) 1942 | ) 1943 | 1944 | 1945 | def backup_labels(args, repo_cwd, repository, 

def backup_labels(args, repo_cwd, repository, repos_template):
    """Back up a repository's labels to a single labels.json file."""
    label_cwd = os.path.join(repo_cwd, "labels")
    output_file = "{0}/labels.json".format(label_cwd)
    template = "{0}/{1}/labels".format(repos_template, repository["full_name"])
    _backup_data(args, "labels", template, output_file, label_cwd)


def backup_hooks(args, repo_cwd, repository, repos_template):
    """Back up a repository's webhooks to a single hooks.json file (requires auth)."""
    auth = get_auth(args)
    if not auth:
        logger.info("Skipping hooks since no authentication provided")
        return
    hook_cwd = os.path.join(repo_cwd, "hooks")
    output_file = "{0}/hooks.json".format(hook_cwd)
    template = "{0}/{1}/hooks".format(repos_template, repository["full_name"])
    try:
        _backup_data(args, "hooks", template, output_file, hook_cwd)
    except Exception as e:
        if "404" in str(e):
            logger.info("Unable to read hooks, skipping")
        else:
            raise
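
# backup_labels and backup_hooks store each collection as one aggregate JSON
# file via _backup_data (defined further down), unlike issues, pulls, and
# milestones, which write one file per number. The resulting on-disk layout
# looks roughly like this (illustrative paths):
#
#     <repo_cwd>/
#         labels/labels.json        # whole collection in one file
#         hooks/hooks.json
#         issues/1.json, 2.json ... # one file per issue number
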

def backup_releases(args, repo_cwd, repository, repos_template, include_assets=False):
    """Back up a repository's releases, optionally including their assets."""
    repository_fullname = repository["full_name"]

    # give release files somewhere to live & log intent
    release_cwd = os.path.join(repo_cwd, "releases")
    logger.info("Retrieving {0} releases".format(repository_fullname))
    mkdir_p(repo_cwd, release_cwd)

    query_args = {}

    release_template = "{0}/{1}/releases".format(repos_template, repository_fullname)
    releases = retrieve_data(args, release_template, query_args=query_args)

    if args.skip_prerelease:
        releases = [r for r in releases if not r["prerelease"] and not r["draft"]]

    if args.number_of_latest_releases and args.number_of_latest_releases < len(
        releases
    ):
        releases.sort(
            key=lambda item: datetime.strptime(
                item["created_at"], "%Y-%m-%dT%H:%M:%SZ"
            ),
            reverse=True,
        )
        releases = releases[: args.number_of_latest_releases]

    # Check if this repo should skip asset downloads (case-insensitive)
    skip_assets = False
    if include_assets:
        repo_name = repository.get("name", "").lower()
        repo_full_name = repository.get("full_name", "").lower()
        skip_repos = [r.lower() for r in (args.skip_assets_on or [])]
        skip_assets = repo_name in skip_repos or repo_full_name in skip_repos
        if skip_assets:
            logger.info(
                "Skipping assets for {0} ({1} releases) due to --skip-assets-on".format(
                    repository.get("name"), len(releases)
                )
            )

    # for each release, store it
    written_count = 0
    for release in releases:
        release_name = release["tag_name"]
        release_name_safe = release_name.replace("/", "__")
        output_filepath = os.path.join(
            release_cwd, "{0}.json".format(release_name_safe)
        )
        if json_dump_if_changed(release, output_filepath):
            written_count += 1

        if include_assets and not skip_assets:
            assets = retrieve_data(args, release["assets_url"])
            if len(assets) > 0:
                # give release asset files somewhere to live & download them
                # (not including source archives)
                release_assets_cwd = os.path.join(release_cwd, release_name_safe)
                mkdir_p(release_assets_cwd)
                for asset in assets:
                    download_file(
                        asset["url"],
                        os.path.join(release_assets_cwd, asset["name"]),
                        get_auth(args, encode=not args.as_app),
                        as_app=args.as_app,
                        fine=args.token_fine is not None,
                    )

    # Log the results
    total = len(releases)
    if written_count == total:
        logger.info("Saved {0} releases to disk".format(total))
    elif written_count == 0:
        logger.info("{0} releases unchanged, skipped write".format(total))
    else:
        logger.info(
            "Saved {0} of {1} releases to disk ({2} unchanged)".format(
                written_count, total, total - written_count
            )
        )


def fetch_repository(
    name,
    remote_url,
    local_dir,
    skip_existing=False,
    bare_clone=False,
    lfs_clone=False,
    no_prune=False,
):
    """Clone a repository from remote_url into local_dir, or update an existing clone."""
    if bare_clone:
        if os.path.exists(local_dir):
            clone_exists = (
                subprocess.check_output(
                    ["git", "rev-parse", "--is-bare-repository"], cwd=local_dir
                )
                == b"true\n"
            )
        else:
            clone_exists = False
    else:
        clone_exists = os.path.exists(os.path.join(local_dir, ".git"))

    if clone_exists and skip_existing:
        return

    masked_remote_url = mask_password(remote_url)

    initialized = subprocess.call(
        "git ls-remote " + remote_url, stdout=FNULL, stderr=FNULL, shell=True
    )
    if initialized == 128:
        if ".wiki.git" in remote_url:
            logger.info(
                "Skipping {0} wiki (wiki is enabled but has no content)".format(name)
            )
        else:
            logger.info(
                "Skipping {0} (repository not accessible - may be empty, private, or credentials invalid)".format(
                    name
                )
            )
        return

    if clone_exists:
        logger.info("Updating {0} in {1}".format(name, local_dir))

        remotes = subprocess.check_output(["git", "remote", "show"], cwd=local_dir)
        remotes = [i.strip() for i in remotes.decode("utf-8").splitlines()]

        if "origin" not in remotes:
            git_command = ["git", "remote", "rm", "origin"]
            logging_subprocess(git_command, cwd=local_dir)
            git_command = ["git", "remote", "add", "origin", remote_url]
            logging_subprocess(git_command, cwd=local_dir)
        else:
            git_command = ["git", "remote", "set-url", "origin", remote_url]
            logging_subprocess(git_command, cwd=local_dir)

        git_command = ["git", "fetch", "--all", "--force", "--tags", "--prune"]
        if no_prune:
            git_command.pop()  # drop the trailing --prune
        logging_subprocess(git_command, cwd=local_dir)
        if lfs_clone:
            git_command = ["git", "lfs", "fetch", "--all", "--prune"]
            if no_prune:
                git_command.pop()
            logging_subprocess(git_command, cwd=local_dir)
    else:
        logger.info(
            "Cloning {0} repository from {1} to {2}".format(
                name, masked_remote_url, local_dir
            )
        )
        if bare_clone:
            git_command = ["git", "clone", "--mirror", remote_url, local_dir]
            logging_subprocess(git_command)
            if lfs_clone:
                git_command = ["git", "lfs", "fetch", "--all", "--prune"]
                if no_prune:
                    git_command.pop()
                logging_subprocess(git_command, cwd=local_dir)
        else:
            git_command = ["git", "clone", remote_url, local_dir]
            logging_subprocess(git_command)
            if lfs_clone:
                git_command = ["git", "lfs", "fetch", "--all", "--prune"]
                if no_prune:
                    git_command.pop()
                logging_subprocess(git_command, cwd=local_dir)
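
# fetch_repository recognizes an existing mirror by asking git directly,
# since a bare clone has no ".git" subdirectory to test for. A minimal
# sketch of the check (assuming /backups/repo.git is a previous --mirror
# clone; git prints "true" followed by a newline):
#
#     >>> subprocess.check_output(
#     ...     ["git", "rev-parse", "--is-bare-repository"], cwd="/backups/repo.git"
#     ... )
#     b'true\n'
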

def backup_account(args, output_directory):
    """Back up account-level data (starred, watched, followers, following)."""
    account_cwd = os.path.join(output_directory, "account")

    if args.include_starred or args.include_everything:
        output_file = "{0}/starred.json".format(account_cwd)
        template = "https://{0}/users/{1}/starred".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "starred repositories", template, output_file, account_cwd)

    if args.include_watched or args.include_everything:
        output_file = "{0}/watched.json".format(account_cwd)
        template = "https://{0}/users/{1}/subscriptions".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "watched repositories", template, output_file, account_cwd)

    if args.include_followers or args.include_everything:
        output_file = "{0}/followers.json".format(account_cwd)
        template = "https://{0}/users/{1}/followers".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "followers", template, output_file, account_cwd)

    if args.include_following or args.include_everything:
        output_file = "{0}/following.json".format(account_cwd)
        template = "https://{0}/users/{1}/following".format(
            get_github_api_host(args), args.user
        )
        _backup_data(args, "following", template, output_file, account_cwd)


def _backup_data(args, name, template, output_file, output_directory):
    """Retrieve data from `template` and save it to `output_file` if it changed."""
    skip_existing = args.skip_existing
    if not skip_existing or not os.path.exists(output_file):
        logger.info("Retrieving {0} {1}".format(args.user, name))
        mkdir_p(output_directory)
        data = retrieve_data(args, template)

        if json_dump_if_changed(data, output_file):
            logger.info("Saved {0} {1} to disk".format(len(data), name))
        else:
            logger.info("{0} {1} unchanged, skipped write".format(len(data), name))


def json_dump(data, output_file):
    """Serialize data as pretty-printed, key-sorted JSON to an open file handle."""
    json.dump(
        data,
        output_file,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(",", ": "),
    )
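
# json_dump and json_dump_if_changed (below) use identical serializer
# settings, so comparing the freshly rendered string with the text already
# on disk is a reliable change test; the settings make output deterministic:
#
#     >>> json.dumps({"b": 1, "a": 2}, sort_keys=True, indent=4,
#     ...            separators=(",", ": "))
#     '{\n    "a": 2,\n    "b": 1\n}'
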

def json_dump_if_changed(data, output_file_path):
    """
    Write JSON data to file only if content has changed.

    Compares the serialized JSON data with the existing file content
    and only writes if different. This prevents unnecessary file
    modification timestamp updates and disk writes.

    Uses atomic writes (temp file + rename) to prevent corruption
    if the process is interrupted during the write.

    Args:
        data: The data to serialize as JSON
        output_file_path: The path to the output file

    Returns:
        True if file was written (content changed or new file)
        False if write was skipped (content unchanged)
    """
    # Serialize new data with consistent formatting matching json_dump()
    new_content = json.dumps(
        data,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(",", ": "),
    )

    # Check if file exists and compare content
    if os.path.exists(output_file_path):
        try:
            with codecs.open(output_file_path, "r", encoding="utf-8") as f:
                existing_content = f.read()
            if existing_content == new_content:
                logger.debug(
                    "Content unchanged, skipping write: {0}".format(output_file_path)
                )
                return False
        except (OSError, UnicodeDecodeError) as e:
            # If we can't read the existing file, write the new one
            logger.debug(
                "Error reading existing file {0}, will overwrite: {1}".format(
                    output_file_path, e
                )
            )

    # Write the file atomically using temp file + rename
    temp_file = output_file_path + ".temp"
    with codecs.open(temp_file, "w", encoding="utf-8") as f:
        f.write(new_content)
    os.replace(temp_file, output_file_path)  # Atomic write
    return True

--------------------------------------------------------------------------------