├── .env.template ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ ├── project_add.yml │ ├── release.yml │ └── test_tap.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .secrets └── .gitignore ├── .sonarcloud.properties ├── LICENSE ├── README.md ├── config-sample.json ├── config.json ├── meltano.yml ├── plugins └── loaders │ └── target-jsonl--andyh1203.lock ├── poetry.lock ├── pyproject.toml ├── tap_github ├── __init__.py ├── authenticator.py ├── client.py ├── organization_streams.py ├── repository_streams.py ├── schema_objects.py ├── scraping.py ├── streams.py ├── tap.py ├── tests │ ├── __init__.py │ ├── fixtures.py │ ├── test_authenticator.py │ ├── test_core.py │ └── test_tap.py ├── user_streams.py └── utils │ ├── __init__.py │ └── filter_stdout.py └── tox.ini /.env.template: -------------------------------------------------------------------------------- 1 | TAP_GITHUB_AUTH_TOKEN="****" 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global owner 2 | * @MeltanoLabs/tap-github 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: weekly 12 | day: monday 13 | versioning-strategy: increase-if-necessary 14 | groups: 15 | development-dependencies: 16 | dependency-type: development 17 | runtime-dependencies: 18 | dependency-type: production 19 | update-types: 20 | - "patch" 21 | - package-ecosystem: github-actions 22 | directory: "/" 23 | schedule: 24 | interval: weekly 25 | day: monday 26 | groups: 27 | actions: 28 | patterns: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/workflows/project_add.yml: -------------------------------------------------------------------------------- 1 | # Managed by Pulumi. Any edits to this file will be overwritten. 2 | 3 | name: Add issues and PRs to MeltanoLabs Overview Project 4 | 5 | on: 6 | issues: 7 | types: 8 | - opened 9 | - reopened 10 | - transferred 11 | pull_request: 12 | types: 13 | - opened 14 | - reopened 15 | 16 | jobs: 17 | add-to-project: 18 | name: Add issue to project 19 | runs-on: ubuntu-latest 20 | if: ${{ github.actor != 'dependabot[bot]' }} 21 | steps: 22 | - uses: actions/add-to-project@244f685bbc3b7adfa8466e08b698b5577571133e # v1.0.2 23 | with: 24 | project-url: https://github.com/orgs/MeltanoLabs/projects/3 25 | github-token: ${{ secrets.MELTYBOT_PROJECT_ADD_PAT }} 26 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Python Package 2 | 3 | on: 4 | push: 5 | 6 | permissions: 7 | contents: write 8 | id-token: write 9 | 10 | jobs: 11 | build: 12 | name: Build wheel and sdist 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: 
${{ steps.baipp.outputs.package_version }} 16 | steps: 17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | with: 19 | fetch-depth: 0 20 | - uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0 21 | id: baipp 22 | 23 | publish: 24 | name: Publish to PyPI 25 | runs-on: ubuntu-latest 26 | needs: [build] 27 | environment: 28 | name: pypi 29 | url: https://pypi.org/project/meltanolabs-tap-github/${{ needs.build.outputs.version }} 30 | if: startsWith(github.ref, 'refs/tags/') 31 | 32 | steps: 33 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 34 | with: 35 | name: Packages 36 | path: dist 37 | - name: Upload wheel to release 38 | uses: svenstaro/upload-release-action@04733e069f2d7f7f0b4aebc4fbdbce8613b03ccd # 2.9.0 39 | with: 40 | repo_token: ${{ secrets.GITHUB_TOKEN }} 41 | file: dist/*.whl 42 | tag: ${{ github.ref }} 43 | overwrite: true 44 | file_glob: true 45 | - name: Deploy to PyPI 46 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 47 | -------------------------------------------------------------------------------- /.github/workflows/test_tap.yml: -------------------------------------------------------------------------------- 1 | name: Test tap-github 2 | 3 | on: 4 | # Run on all pull requests and on pushes to main. 
5 | pull_request: 6 | paths: 7 | - .github/workflows/test_tap.yml 8 | - poetry.lock 9 | - pyproject.toml 10 | - 'tap_github/**' 11 | push: 12 | branches: 13 | - main 14 | paths: 15 | - .github/workflows/test_tap.yml 16 | - poetry.lock 17 | - pyproject.toml 18 | - 'tap_github/**' 19 | workflow_dispatch: 20 | schedule: 21 | # Every 6 hours 22 | - cron: "0 */6 * * *" 23 | 24 | concurrency: 25 | group: ${{ github.workflow }}-${{ github.ref }} 26 | cancel-in-progress: true 27 | 28 | env: 29 | FORCE_COLOR: 1 30 | 31 | jobs: 32 | tests: 33 | 34 | runs-on: ubuntu-latest 35 | env: 36 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 37 | ORG_LEVEL_TOKEN: ${{secrets.ORG_LEVEL_TOKEN}} 38 | strategy: 39 | matrix: 40 | python-version: 41 | - "3.13" 42 | - "3.12" 43 | - "3.11" 44 | - "3.10" 45 | - "3.9" 46 | # run the matrix jobs one after the other so they can benefit from caching 47 | max-parallel: 1 48 | 49 | steps: 50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 51 | - name: Get Date 52 | id: get-date 53 | run: | 54 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 55 | shell: bash 56 | 57 | - name: Cache github API responses 58 | uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 59 | with: 60 | # must match the path in tests/__init__.py 61 | path: '.cache/api_calls_tests_cache.sqlite' 62 | # github cache expires after 1wk, and we expire the content after 24h 63 | # this key is rotated every 24h so that the code does not find a stale 64 | # file in the cache. 
See issue #119 65 | key: api-cache-v4-${{ steps.get-date.outputs.date }} 66 | 67 | - name: Install Poetry 68 | uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1.4.1 69 | with: 70 | # Version of Poetry to use 71 | version: 2.1.1 72 | virtualenvs-create: true 73 | virtualenvs-in-project: true 74 | - name: Set up Python ${{ matrix.python-version }} 75 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 76 | with: 77 | python-version: ${{ matrix.python-version }} 78 | cache: poetry 79 | - name: Install dependencies 80 | run: | 81 | poetry env use ${{ matrix.python-version }} 82 | poetry install 83 | - name: Type check with mypy 84 | id: type_check 85 | continue-on-error: true 86 | run: | 87 | poetry run mypy tap_github 88 | - name: Test with pytest 89 | id: test_pytest 90 | continue-on-error: true 91 | run: | 92 | LOGLEVEL=WARNING poetry run pytest --capture=no 93 | - name: Test with pytest (run 2) 94 | id: retry_test_pytest 95 | if: steps.test_pytest.outcome=='failure' # check the step outcome, wait and retry 96 | run: | 97 | # sleep as little as possible to reduce CI run time 98 | # This assumes that REST quota is the one that caused problem 99 | # (which is most likely/often the case) 100 | target_ts=$(curl -s -H "Accept: application/vnd.github+json" -H "Authorization: Bearer $GITHUB_TOKEN" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/rate_limit | grep reset | head -n 1 | awk -F: '{ print $2 }') 101 | current_ts=$(date +%s) 102 | seconds_to_sleep=$(echo "$target_ts - $current_ts" | bc) 103 | sleep $seconds_to_sleep 104 | LOGLEVEL=WARNING poetry run pytest --capture=no 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Meltano hidden files 2 | .meltano 3 | 4 | # Test output 5 | .output 6 | 7 | # Secrets and internal config files 8 | **/.secrets/* 9 | 10 | # 
Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # IDE 138 | .idea/ 139 | .vscode/ 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | #Other 145 | .DS_Store 146 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: monthly 4 | autoupdate_commit_msg: 'chore: pre-commit autoupdate' 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v5.0.0 9 | hooks: 10 | - id: check-json 11 | exclude: "\\.vscode/.*.json" 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: end-of-file-fixer 15 | - id: trailing-whitespace 16 | 17 | 18 | - repo: https://github.com/astral-sh/ruff-pre-commit 19 | rev: v0.11.12 20 | hooks: 21 | - id: ruff 22 | args: [ --fix ] 23 | - id: ruff-format 24 | -------------------------------------------------------------------------------- /.secrets/.gitignore: -------------------------------------------------------------------------------- 1 | # IMPORTANT! This folder is hidden from git - if you need to store config files or other secrets, 2 | # make sure those are never staged for commit into your git repo. You can store them here or another 3 | # secure location. 
4 | 5 | * 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | sonar.python.version=3.9, 3.10, 3.11, 3.12, 3.13 2 | sonar.cpd.exclusions=**/* 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-github 2 | 3 | `tap-github` is a Singer tap for GitHub. 4 | 5 | Built with the [Singer SDK](https://gitlab.com/meltano/singer-sdk). 6 | 7 | ## Installation 8 | 9 | ```bash 10 | # use uv (https://docs.astral.sh/uv/) 11 | uv tool install meltanolabs-tap-github 12 | 13 | # or pipx (https://pipx.pypa.io/stable/) 14 | pipx install meltanolabs-tap-github 15 | 16 | # or Meltano 17 | meltano add extractor tap-github 18 | ``` 19 | 20 | A list of release versions is available at https://github.com/MeltanoLabs/tap-github/releases 21 | 22 | ## Configuration 23 | 24 | ### Accepted Config Options 25 | 26 | This tap accepts the following configuration options: 27 | 28 | - Required: One and only one of the following modes: 29 | 1. `repositories`: An array of strings specifying the GitHub repositories to be included. Each element of the array should be of the form `/`, e.g. `MeltanoLabs/tap-github`. 30 | 2. 
`organizations`: An array of strings containing the github organizations to be included 31 | 3. `searches`: An array of search descriptor objects with the following properties: 32 | - `name`: A human readable name for the search query 33 | - `query`: A github search string (generally the same as would come after `?q=` in the URL) 34 | 4. `user_usernames`: A list of github usernames 35 | 5. `user_ids`: A list of github user ids [int] 36 | - Highly recommended: 37 | - Personal access tokens (PATs) for authentication can be provided in 3 ways: 38 | - `auth_token` - Takes a single token. 39 | - `additional_auth_tokens` - Takes a list of tokens. Can be used together with `auth_token` or as the sole source of PATs. 40 | - Any environment variables beginning with `GITHUB_TOKEN` will be assumed to be PATs. These tokens will be used in addition to `auth_token` (if provided), but will not be used if `additional_auth_tokens` is provided. 41 | - GitHub App keys are another option for authentication, and can be used in combination with PATs if desired. App IDs and keys should be assembled into the format `:app_id:;;-----BEGIN RSA PRIVATE KEY-----\n_YOUR_P_KEY_\n-----END RSA PRIVATE KEY-----` where the key can be generated from the `Private keys` section on https://github.com/organizations/:organization_name/settings/apps/:app_name. Read more about GitHub App quotas [here](https://docs.github.com/en/enterprise-server@3.3/developers/apps/building-github-apps/rate-limits-for-github-apps#server-to-server-requests). Formatted app keys can be provided in 2 ways: 42 | - `auth_app_keys` - List of GitHub App keys in the prescribed format. 43 | - If `auth_app_keys` is not provided but there is an environment variable with the name `GITHUB_APP_PRIVATE_KEY`, it will be assumed to be an App key in the prescribed format. 
44 | - Optional: 45 | - `user_agent` 46 | - `start_date` 47 | - `metrics_log_level` 48 | - `stream_maps` 49 | - `stream_maps_config` 50 | - `stream_options`: Options which can change the behaviour of a specific stream are nested within. 51 | - `milestones`: Valid options for the `milestones` stream are nested within. 52 | - `state`: Determines which milestones will be extracted. One of `open` (default), `closed`, `all`. 53 | - `rate_limit_buffer`: A buffer to avoid consuming all query points for the auth_token at hand. Defaults to 1000. 54 | - `expiry_time_buffer`: A buffer used when determining when to refresh GitHub app tokens. Only relevant when authenticating as a GitHub app. Defaults to 10 minutes. Tokens generated by GitHub apps expire 1 hour after creation, and will be refreshed once fewer than `expiry_time_buffer` minutes remain until the anticipated expiry time. 55 | 56 | Note that modes 1-3 are `repository` modes and 4-5 are `user` modes and will not run the same set of streams. 57 | 58 | A full list of supported settings and capabilities for this tap is available by running: 59 | 60 | ```bash 61 | tap-github --about 62 | ``` 63 | 64 | ### Source Authentication and Authorization 65 | 66 | A small number of records may be pulled without an auth token. However, a Github auth token should generally be considered "required" since it gives more realistic rate limits. (See GitHub API docs for more info.) 67 | 68 | ## Usage 69 | 70 | ### API Limitation - Pagination 71 | 72 | The GitHub API is limited for some resources such as `/events`. For some resources, users might encounter the following error: 73 | 74 | ``` 75 | In order to keep the API fast for everyone, pagination is limited for this resource. Check the rel=last link relation in the Link response header to see how far back you can traverse. 76 | ``` 77 | 78 | To avoid this, the GitHub streams will exit early. I.e. when there are no more `next page` available. 
If you are fetching `/events` at the repository level, beware of leaving the tap disabled for longer than a few days or you will have gaps in your data. 79 | 80 | You can easily run `tap-github` by itself or in a pipeline using [Meltano](https://meltano.com). 81 | 82 | ### Notes regarding permissions 83 | 84 | * For the `traffic_*` streams, [you will need write access to the repository](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28). You can enable extraction for these streams by [selecting them in the catalog](https://hub.meltano.com/singer/spec/#metadata). 85 | 86 | ### Executing the Tap Directly 87 | 88 | ```bash 89 | tap-github --version 90 | tap-github --help 91 | tap-github --config CONFIG --discover > ./catalog.json 92 | ``` 93 | 94 | ## Contributing 95 | This project uses parent-child streams. Learn more about them [here](https://gitlab.com/meltano/sdk/-/blob/main/docs/parent_streams.md). 96 | 97 | ### Initialize your Development Environment 98 | 99 | ```bash 100 | pipx install poetry 101 | poetry install 102 | ``` 103 | 104 | ### Create and Run Tests 105 | 106 | Create tests within the `tap_github/tests` subfolder and 107 | then run: 108 | 109 | ```bash 110 | poetry run pytest 111 | ``` 112 | 113 | You can also test the `tap-github` CLI interface directly using `poetry run`: 114 | 115 | ```bash 116 | poetry run tap-github --help 117 | ``` 118 | 119 | ### Testing with [Meltano](https://meltano.com) 120 | 121 | _**Note:** This tap will work in any Singer environment and does not require Meltano. 122 | Examples here are for convenience and to streamline end-to-end orchestration scenarios._ 123 | 124 | Your project comes with a custom `meltano.yml` project file already created. Open the `meltano.yml` and follow any _"TODO"_ items listed in 125 | the file. 
126 | 127 | Next, install Meltano (if you haven't already) and any needed plugins: 128 | 129 | ```bash 130 | # Install meltano 131 | pipx install meltano 132 | # Initialize meltano within this directory 133 | cd tap-github 134 | meltano install 135 | ``` 136 | 137 | Now you can test and orchestrate using Meltano: 138 | 139 | ```bash 140 | # Test invocation: 141 | meltano invoke tap-github --version 142 | # OR run a test `elt` pipeline: 143 | meltano elt tap-github target-jsonl 144 | ``` 145 | 146 | One-liner to recreate output directory, run elt, and write out state file: 147 | 148 | ```bash 149 | # Update this when you want a fresh state file: 150 | TESTJOB=testjob1 151 | 152 | # Run everything in one line 153 | mkdir -p .output && meltano elt tap-github target-jsonl --job_id $TESTJOB && meltano elt tap-github target-jsonl --job_id $TESTJOB --dump=state > .output/state.json 154 | ``` 155 | 156 | ### Singer SDK Dev Guide 157 | 158 | See the [dev guide](../../docs/dev_guide.md) for more instructions on how to use the Singer SDK to 159 | develop your own taps and targets. 
160 | -------------------------------------------------------------------------------- /config-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "searches": [ 3 | { 4 | "name": "test_search", 5 | "query": "target-athena+fork:only" 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "repositories": ["indeedeng/proctor"], 3 | "start_date": "2022-05-16" 4 | } 5 | -------------------------------------------------------------------------------- /meltano.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | send_anonymous_usage_stats: false 3 | project_id: 96584f7b-a36c-46e0-b41a-7f9074293137 4 | venv: 5 | backend: uv 6 | plugins: 7 | extractors: 8 | - name: tap-github 9 | namespace: tap_github 10 | pip_url: -e . 11 | capabilities: 12 | - state 13 | - catalog 14 | - discover 15 | settings: 16 | - name: user_agent 17 | kind: string 18 | - name: metrics_log_level 19 | kind: string 20 | - name: auth_token 21 | kind: password 22 | - name: additional_auth_tokens 23 | kind: array 24 | - name: auth_app_keys 25 | kind: array 26 | - name: rate_limit_buffer 27 | kind: integer 28 | - name: expiry_time_buffer 29 | kind: integer 30 | - name: searches 31 | kind: array 32 | - name: organizations 33 | kind: array 34 | - name: repositories 35 | kind: array 36 | - name: user_usernames 37 | kind: array 38 | - name: user_ids 39 | kind: array 40 | - name: stream_options.milestones.state 41 | kind: options 42 | options: 43 | - label: Open 44 | value: open 45 | - label: Closed 46 | value: closed 47 | - label: All 48 | value: all 49 | - name: start_date 50 | kind: date_iso8601 51 | value: '2010-01-01T00:00:00Z' 52 | - name: stream_maps 53 | kind: object 54 | - name: stream_map_config 55 | kind: object 56 | select: 57 | - '*.*' 58 | 
loaders: 59 | - name: target-jsonl 60 | variant: andyh1203 61 | pip_url: target-jsonl 62 | config: 63 | destination_path: .output 64 | do_timestamp_file: true 65 | -------------------------------------------------------------------------------- /plugins/loaders/target-jsonl--andyh1203.lock: -------------------------------------------------------------------------------- 1 | { 2 | "plugin_type": "loaders", 3 | "name": "target-jsonl", 4 | "namespace": "target_jsonl", 5 | "variant": "andyh1203", 6 | "label": "JSON Lines (JSONL)", 7 | "docs": "https://hub.meltano.com/loaders/target-jsonl--andyh1203", 8 | "repo": "https://github.com/andyh1203/target-jsonl", 9 | "pip_url": "target-jsonl", 10 | "description": "JSONL loader", 11 | "logo_url": "https://hub.meltano.com/assets/logos/loaders/jsonl.png", 12 | "settings": [ 13 | { 14 | "name": "destination_path", 15 | "kind": "string", 16 | "value": "output", 17 | "label": "Destination Path", 18 | "description": "Sets the destination path the JSONL files are written to, relative\nto the project root.\n\nThe directory needs to exist already, it will not be created\nautomatically.\n\nTo write JSONL files to the project root, set an empty string (`\"\"`).\n" 19 | }, 20 | { 21 | "name": "do_timestamp_file", 22 | "kind": "boolean", 23 | "value": false, 24 | "label": "Include Timestamp in File Names", 25 | "description": "Specifies if the files should get timestamped.\n\nBy default, the resulting file will not have a timestamp in the file name (i.e. `exchange_rate.jsonl`).\n\nIf this option gets set to `true`, the resulting file will have a timestamp associated with it (i.e. 
`exchange_rate-{timestamp}.jsonl`).\n" 26 | }, 27 | { 28 | "name": "custom_name", 29 | "kind": "string", 30 | "label": "Custom File Name Override", 31 | "description": "Specifies a custom name for the filename, instead of the stream name.\n\nThe file name will be `{custom_name}-{timestamp}.jsonl`, if `do_timestamp_file` is `true`.\nOtherwise the file name will be `{custom_name}.jsonl`.\n\nIf custom name is not provided, the stream name will be used.\n" 32 | } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "meltanolabs-tap-github" 3 | version = "0.0.0" 4 | description = "Singer tap for GitHub, built with the Singer SDK." 5 | authors = ["Meltano and Meltano Community "] 6 | maintainers = [ 7 | "Meltano and Meltano Community ", 8 | "Edgar Ramírez-Mondragón ", 9 | ] 10 | homepage = "https://github.com/MeltanoLabs/tap-github" 11 | repository = "https://github.com/MeltanoLabs/tap-github" 12 | license = "Apache-2.0" 13 | keywords = ["Meltano", "Singer", "Meltano SDK", "Singer SDK", "ELT", "GitHub"] 14 | readme = "README.md" 15 | classifiers = [ 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Programming Language :: Python :: 3.13", 24 | "Programming Language :: Python :: Implementation :: CPython", 25 | "Typing :: Typed", 26 | ] 27 | packages = [ 28 | { include = "tap_github", format = ["sdist", "wheel"] } 29 | ] 30 | 31 | [tool.poetry.urls] 32 | "Issue Tracker" = "https://github.com/MeltanoLabs/tap-github/issues" 33 | 34 | [tool.poetry.dependencies] 35 | beautifulsoup4 = "~=4.13.3" 36 | cryptography = { version = 
"~=45.0.2", python = ">3.9.0,<3.9.1 || >3.9.1" } 37 | nested-lookup = "~=0.2.25" 38 | PyJWT = "2.10.1" 39 | python = ">=3.9" 40 | python-dateutil = "~=2.9" 41 | requests = "~=2.32.3" 42 | # For local SDK dev: 43 | # singer-sdk = {path = "../singer-sdk", develop = true} 44 | singer-sdk = "~=0.46.0" 45 | 46 | [tool.poetry.group.dev.dependencies] 47 | mypy = ">=1.15.0" 48 | pytest = ">=7.3.1" 49 | requests-cache = ">=1.0.1" 50 | types-beautifulsoup4 = ">=4.12.0" 51 | types-python-dateutil = "~=2.9.0" 52 | types-requests = ">=2.30.0" 53 | types-simplejson = "~=3.20.0" 54 | 55 | [tool.poetry-dynamic-versioning] 56 | enable = true 57 | 58 | [[tool.mypy.overrides]] 59 | module = [ 60 | "backoff", 61 | "nested_lookup", 62 | ] 63 | ignore_missing_imports = true 64 | 65 | [build-system] 66 | requires = [ 67 | "poetry-core==2.1.1", 68 | "poetry-dynamic-versioning==1.8.2", 69 | 70 | ] 71 | build-backend = "poetry_dynamic_versioning.backend" 72 | 73 | [tool.poetry.scripts] 74 | # CLI declaration 75 | tap-github = 'tap_github.tap:cli' 76 | 77 | [tool.pytest.ini_options] 78 | markers = [ 79 | "repo_list: mark a test as using a list of repos in config", 80 | "username_list: mark a test as using a list of usernames in config", 81 | ] 82 | 83 | [tool.ruff] 84 | target-version = "py39" 85 | 86 | [tool.ruff.lint] 87 | ignore = [] 88 | select = [ 89 | "F", # Pyflakes 90 | "E", # pycodestyle (errors) 91 | "W", # pycodestyle (warnings) 92 | "I", # isort 93 | "N", # pep8-naming 94 | "UP", # pyupgrade 95 | "YTT", # flake8-2020 96 | "ANN", # flake8-annotations 97 | "B", # flake8-bugbear 98 | "A", # flake8-builtins 99 | "C4", # flake8-comprehensions 100 | "DTZ", # flake8-datetimez 101 | "FA", # flake8-future-annotations 102 | "SIM", # flake8-simplify 103 | "TC", # flake8-type-checking 104 | "PERF", # Perflint 105 | "FURB", # refurb 106 | "RUF", # Ruff-specific rules 107 | ] 108 | 109 | [tool.ruff.lint.per-file-ignores] 110 | "tap_github/tests/*" = ["ANN"] 111 | 
-------------------------------------------------------------------------------- /tap_github/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeltanoLabs/tap-github/0374f0768b1ffb2a3db0dd53591282830e553cf2/tap_github/__init__.py -------------------------------------------------------------------------------- /tap_github/authenticator.py: -------------------------------------------------------------------------------- 1 | """Classes to assist in authenticating to the GitHub API.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import time 7 | from copy import deepcopy 8 | from datetime import datetime, timedelta, timezone 9 | from os import environ 10 | from random import choice, shuffle 11 | from typing import TYPE_CHECKING, Any 12 | 13 | import jwt 14 | import requests 15 | from singer_sdk.authenticators import APIAuthenticatorBase 16 | 17 | if TYPE_CHECKING: 18 | from singer_sdk.streams import RESTStream 19 | 20 | 21 | class TokenManager: 22 | """A class to store a token's attributes and state. 23 | This parent class should not be used directly, use a subclass instead. 24 | """ 25 | 26 | DEFAULT_RATE_LIMIT = 5000 27 | # The DEFAULT_RATE_LIMIT_BUFFER buffer serves two purposes: 28 | # - keep some leeway and rotate tokens before erroring out on rate limit. 29 | # - not consume all available calls when we rare using an org or user token. 
30 | DEFAULT_RATE_LIMIT_BUFFER = 1000 31 | 32 | def __init__( 33 | self, 34 | token: str | None, 35 | rate_limit_buffer: int | None = None, 36 | logger: Any | None = None, # noqa: ANN401 37 | ) -> None: 38 | """Init TokenManager info.""" 39 | self.token = token 40 | self.logger = logger 41 | self.rate_limit = self.DEFAULT_RATE_LIMIT 42 | self.rate_limit_remaining = self.DEFAULT_RATE_LIMIT 43 | self.rate_limit_reset: datetime | None = None 44 | self.rate_limit_used = 0 45 | self.rate_limit_buffer = ( 46 | rate_limit_buffer 47 | if rate_limit_buffer is not None 48 | else self.DEFAULT_RATE_LIMIT_BUFFER 49 | ) 50 | 51 | def update_rate_limit(self, response_headers: Any) -> None: # noqa: ANN401 52 | self.rate_limit = int(response_headers["X-RateLimit-Limit"]) 53 | self.rate_limit_remaining = int(response_headers["X-RateLimit-Remaining"]) 54 | self.rate_limit_reset = datetime.fromtimestamp( 55 | int(response_headers["X-RateLimit-Reset"]), 56 | tz=timezone.utc, 57 | ) 58 | self.rate_limit_used = int(response_headers["X-RateLimit-Used"]) 59 | 60 | def is_valid_token(self) -> bool: 61 | """Try making a request with the current token. If the request succeeds return True, else False.""" # noqa: E501 62 | if not self.token: 63 | return False 64 | 65 | try: 66 | response = requests.get( 67 | url="https://api.github.com/rate_limit", 68 | headers={ 69 | "Authorization": f"token {self.token}", 70 | }, 71 | ) 72 | response.raise_for_status() 73 | return True 74 | except requests.exceptions.HTTPError: 75 | msg = ( 76 | f"A token could not be validated. " 77 | f"{response.status_code} Client Error: " 78 | f"{response.content!s} (Reason: {response.reason})" 79 | ) 80 | if self.logger is not None: 81 | self.logger.warning(msg) 82 | return False 83 | 84 | def has_calls_remaining(self) -> bool: 85 | """Check if a token has capacity to make more calls. 86 | 87 | Returns: 88 | True if the token is valid and has enough api calls remaining. 
89 | """ 90 | if self.rate_limit_reset is None: 91 | return True 92 | return self.rate_limit_used <= ( 93 | self.rate_limit - self.rate_limit_buffer 94 | ) or self.rate_limit_reset <= datetime.now(tz=timezone.utc) 95 | 96 | 97 | class PersonalTokenManager(TokenManager): 98 | """A class to store token rate limiting information.""" 99 | 100 | def __init__( 101 | self, 102 | token: str, 103 | rate_limit_buffer: int | None = None, 104 | **kwargs, # noqa: ANN003 105 | ) -> None: 106 | """Init PersonalTokenRateLimit info.""" 107 | super().__init__(token, rate_limit_buffer=rate_limit_buffer, **kwargs) 108 | 109 | 110 | def generate_jwt_token( 111 | github_app_id: str, 112 | github_private_key: str, 113 | expiration_time: int = 600, 114 | algorithm: str = "RS256", 115 | ) -> str: 116 | actual_time = int(time.time()) 117 | 118 | payload = { 119 | "iat": actual_time, 120 | "exp": actual_time + expiration_time, 121 | "iss": github_app_id, 122 | } 123 | token = jwt.encode(payload, github_private_key, algorithm=algorithm) 124 | 125 | if isinstance(token, bytes): 126 | token = token.decode("utf-8") 127 | 128 | return token 129 | 130 | 131 | def generate_app_access_token( 132 | github_app_id: str, 133 | github_private_key: str, 134 | github_installation_id: str | None = None, 135 | ) -> tuple[str, datetime]: 136 | produced_at = datetime.now(tz=timezone.utc) 137 | jwt_token = generate_jwt_token(github_app_id, github_private_key) 138 | 139 | headers = {"Authorization": f"Bearer {jwt_token}"} 140 | 141 | if github_installation_id is None: 142 | list_installations_resp = requests.get( 143 | url="https://api.github.com/app/installations", headers=headers 144 | ) 145 | list_installations_resp.raise_for_status() 146 | list_installations = list_installations_resp.json() 147 | 148 | if len(list_installations) == 0: 149 | raise Exception(f"No installations found for app {github_app_id}.") 150 | 151 | github_installation_id = choice(list_installations)["id"] 152 | 153 | url = 
f"https://api.github.com/app/installations/{github_installation_id}/access_tokens" 154 | resp = requests.post(url, headers=headers) 155 | 156 | if resp.status_code != 201: 157 | resp.raise_for_status() 158 | 159 | expires_at = produced_at + timedelta(hours=1) 160 | return resp.json()["token"], expires_at 161 | 162 | 163 | class AppTokenManager(TokenManager): 164 | """A class to store an app token's attributes and state, and handle token refreshing""" # noqa: E501 165 | 166 | DEFAULT_RATE_LIMIT = 15000 167 | DEFAULT_EXPIRY_BUFFER_MINS = 10 168 | 169 | def __init__( 170 | self, 171 | env_key: str, 172 | rate_limit_buffer: int | None = None, 173 | expiry_time_buffer: int | None = None, 174 | **kwargs, # noqa: ANN003 175 | ) -> None: 176 | if rate_limit_buffer is None: 177 | rate_limit_buffer = self.DEFAULT_RATE_LIMIT_BUFFER 178 | super().__init__(None, rate_limit_buffer=rate_limit_buffer, **kwargs) 179 | 180 | parts = env_key.split(";;") 181 | self.github_app_id = parts[0] 182 | self.github_private_key = (parts[1:2] or [""])[0].replace("\\n", "\n") 183 | self.github_installation_id: str | None = parts[2] if len(parts) >= 3 else None 184 | 185 | if expiry_time_buffer is None: 186 | expiry_time_buffer = self.DEFAULT_EXPIRY_BUFFER_MINS 187 | self.expiry_time_buffer = expiry_time_buffer 188 | 189 | self.token_expires_at: datetime | None = None 190 | self.claim_token() 191 | 192 | def claim_token(self) -> None: 193 | """Updates the TokenManager's token and token_expires_at attributes. 194 | 195 | The outcome will be _either_ that self.token is updated to a newly claimed valid token and 196 | self.token_expires_at is updated to the anticipated expiry time (erring on the side of an early estimate) 197 | _or_ self.token and self.token_expires_at are both set to None. 
198 | """ # noqa: E501 199 | self.token = None 200 | self.token_expires_at = None 201 | 202 | # Make sure we have the details we need 203 | if not self.github_app_id or not self.github_private_key: 204 | raise ValueError( 205 | "GITHUB_APP_PRIVATE_KEY could not be parsed. The expected format is " 206 | '":app_id:;;-----BEGIN RSA PRIVATE KEY-----\\n_YOUR_P_KEY_\\n-----END RSA PRIVATE KEY-----"' # noqa: E501 207 | ) 208 | 209 | self.token, self.token_expires_at = generate_app_access_token( 210 | self.github_app_id, self.github_private_key, self.github_installation_id 211 | ) 212 | 213 | # Check if the token isn't valid. If not, overwrite it with None 214 | if not self.is_valid_token(): 215 | if self.logger: 216 | self.logger.warning( 217 | "An app token was generated but could not be validated." 218 | ) 219 | self.token = None 220 | self.token_expires_at = None 221 | 222 | def has_calls_remaining(self) -> bool: 223 | """Check if a token has capacity to make more calls. 224 | 225 | Returns: 226 | True if the token is valid and has enough api calls remaining. 
227 | """ 228 | if self.token_expires_at is not None: 229 | close_to_expiry = datetime.now( 230 | tz=timezone.utc 231 | ) > self.token_expires_at - timedelta(minutes=self.expiry_time_buffer) 232 | 233 | if close_to_expiry: 234 | self.claim_token() 235 | if self.token is None: 236 | if self.logger: 237 | self.logger.warning("GitHub app token refresh failed.") 238 | return False 239 | else: 240 | if self.logger: 241 | self.logger.info("GitHub app token refresh succeeded.") 242 | 243 | return super().has_calls_remaining() 244 | 245 | 246 | class GitHubTokenAuthenticator(APIAuthenticatorBase): 247 | """Base class for offloading API auth.""" 248 | 249 | @staticmethod 250 | def get_env(): # noqa: ANN205 251 | return dict(environ) 252 | 253 | def prepare_tokens(self) -> list[TokenManager]: 254 | """Prep GitHub tokens""" 255 | 256 | env_dict = self.get_env() 257 | rate_limit_buffer = self._config.get("rate_limit_buffer", None) 258 | expiry_time_buffer = self._config.get("expiry_time_buffer", None) 259 | 260 | personal_tokens: set[str] = set() 261 | if "auth_token" in self._config: 262 | personal_tokens.add(self._config["auth_token"]) 263 | if "additional_auth_tokens" in self._config: 264 | personal_tokens = personal_tokens.union( 265 | self._config["additional_auth_tokens"] 266 | ) 267 | else: 268 | # Accept multiple tokens using environment variables GITHUB_TOKEN* 269 | env_tokens = { 270 | value 271 | for key, value in env_dict.items() 272 | if key.startswith("GITHUB_TOKEN") 273 | } 274 | if len(env_tokens) > 0: 275 | self.logger.info( 276 | f"Found {len(env_tokens)} 'GITHUB_TOKEN' environment variables for authentication." 
# noqa: E501 277 | ) 278 | personal_tokens = personal_tokens.union(env_tokens) 279 | 280 | personal_token_managers: list[TokenManager] = [] 281 | for token in personal_tokens: 282 | token_manager = PersonalTokenManager( 283 | token, rate_limit_buffer=rate_limit_buffer, logger=self.logger 284 | ) 285 | if token_manager.is_valid_token(): 286 | personal_token_managers.append(token_manager) 287 | else: 288 | logging.warning("A token was dismissed.") 289 | 290 | # Parse App level private keys and generate tokens 291 | # To simplify settings, we use a single env-key formatted as follows: 292 | # "{app_id};;{-----BEGIN RSA PRIVATE KEY-----\n_YOUR_PRIVATE_KEY_\n-----END RSA PRIVATE KEY-----}" # noqa: E501 293 | 294 | app_keys: set[str] = set() 295 | if "auth_app_keys" in self._config: 296 | app_keys = app_keys.union(self._config["auth_app_keys"]) 297 | self.logger.info( 298 | f"Provided {len(app_keys)} app keys via config for authentication." 299 | ) 300 | elif "GITHUB_APP_PRIVATE_KEY" in env_dict: 301 | app_keys.add(env_dict["GITHUB_APP_PRIVATE_KEY"]) 302 | self.logger.info( 303 | "Found 1 app key via environment variable for authentication." 304 | ) 305 | 306 | app_token_managers: list[TokenManager] = [] 307 | for app_key in app_keys: 308 | try: 309 | app_token_manager = AppTokenManager( 310 | app_key, 311 | rate_limit_buffer=rate_limit_buffer, 312 | expiry_time_buffer=expiry_time_buffer, 313 | logger=self.logger, 314 | ) 315 | if app_token_manager.is_valid_token(): 316 | app_token_managers.append(app_token_manager) 317 | except ValueError as e: # noqa: PERF203 318 | self.logger.warning( 319 | f"An error was thrown while preparing an app token: {e}" 320 | ) 321 | 322 | self.logger.info( 323 | f"Tap will run with {len(personal_token_managers)} personal auth tokens " 324 | f"and {len(app_token_managers)} app keys." 325 | ) 326 | return personal_token_managers + app_token_managers 327 | 328 | def __init__(self, stream: RESTStream) -> None: 329 | """Init authenticator. 
330 | 331 | Args: 332 | stream: A stream for a RESTful endpoint. 333 | """ 334 | super().__init__(stream=stream) 335 | self.logger: logging.Logger = stream.logger 336 | self.tap_name: str = stream.tap_name 337 | self._config: dict[str, Any] = dict(stream.config) 338 | self.token_managers = self.prepare_tokens() 339 | self.active_token: TokenManager | None = ( 340 | choice(self.token_managers) if self.token_managers else None 341 | ) 342 | 343 | def get_next_auth_token(self) -> None: 344 | current_token = self.active_token.token if self.active_token else "" 345 | token_managers = deepcopy(self.token_managers) 346 | shuffle(token_managers) 347 | for token_manager in token_managers: 348 | if ( 349 | token_manager.has_calls_remaining() 350 | and current_token != token_manager.token 351 | ): 352 | self.active_token = token_manager 353 | self.logger.info("Switching to fresh auth token") 354 | return 355 | 356 | raise RuntimeError( 357 | "All GitHub tokens have hit their rate limit. Stopping here." 358 | ) 359 | 360 | def update_rate_limit( 361 | self, response_headers: requests.models.CaseInsensitiveDict 362 | ) -> None: 363 | # If no token or only one token is available, return early. 364 | if len(self.token_managers) <= 1 or self.active_token is None: 365 | return 366 | 367 | self.active_token.update_rate_limit(response_headers) 368 | 369 | def authenticate_request( 370 | self, 371 | request: requests.PreparedRequest, 372 | ) -> requests.PreparedRequest: 373 | if self.active_token: 374 | # Make sure that our token is still valid or update it. 375 | if not self.active_token.has_calls_remaining(): 376 | self.get_next_auth_token() 377 | request.headers["Authorization"] = f"token {self.active_token.token}" 378 | else: 379 | self.logger.info( 380 | "No auth token detected. " 381 | "For higher rate limits, please specify `auth_token` in config." 
382 | ) 383 | return request 384 | -------------------------------------------------------------------------------- /tap_github/client.py: -------------------------------------------------------------------------------- 1 | """REST client handling, including GitHubStream base class.""" 2 | 3 | from __future__ import annotations 4 | 5 | import email.utils 6 | import inspect 7 | import random 8 | import time 9 | from typing import TYPE_CHECKING, Any, ClassVar, cast 10 | from urllib.parse import parse_qs, urlparse 11 | 12 | from dateutil.parser import parse 13 | from nested_lookup import nested_lookup 14 | from singer_sdk.exceptions import FatalAPIError, RetriableAPIError 15 | from singer_sdk.helpers.jsonpath import extract_jsonpath 16 | from singer_sdk.streams import GraphQLStream, RESTStream 17 | 18 | from tap_github.authenticator import GitHubTokenAuthenticator 19 | 20 | if TYPE_CHECKING: 21 | from collections.abc import Iterable 22 | from types import FrameType 23 | 24 | import requests 25 | from backoff.types import Details 26 | from singer_sdk.helpers.types import Context 27 | 28 | EMPTY_REPO_ERROR_STATUS = 409 29 | 30 | 31 | class GitHubRestStream(RESTStream): 32 | """GitHub Rest stream class.""" 33 | 34 | MAX_PER_PAGE = 100 # GitHub's limit is 100. 35 | MAX_RESULTS_LIMIT: int | None = None 36 | DEFAULT_API_BASE_URL = "https://api.github.com" 37 | LOG_REQUEST_METRIC_URLS = True 38 | 39 | # GitHub is missing the "since" parameter on a few endpoints 40 | # set this parameter to True if your stream needs to navigate data in descending order # noqa: E501 41 | # and try to exit early on its own. 42 | # This only has effect on streams whose `replication_key` is `updated_at`. 
43 | use_fake_since_parameter = False 44 | 45 | _authenticator: GitHubTokenAuthenticator | None = None 46 | 47 | @property 48 | def authenticator(self) -> GitHubTokenAuthenticator: 49 | if self._authenticator is None: 50 | self._authenticator = GitHubTokenAuthenticator(stream=self) 51 | return self._authenticator 52 | 53 | @property 54 | def url_base(self) -> str: 55 | return self.config.get("api_url_base", self.DEFAULT_API_BASE_URL) 56 | 57 | primary_keys: ClassVar[list[str]] = ["id"] 58 | replication_key: str | None = None 59 | tolerated_http_errors: ClassVar[list[int]] = [] 60 | 61 | @property 62 | def http_headers(self) -> dict[str, str]: 63 | """Return the http headers needed.""" 64 | headers = {"Accept": "application/vnd.github.v3+json"} 65 | headers["User-Agent"] = cast("str", self.config.get("user_agent", "tap-github")) 66 | return headers 67 | 68 | def get_next_page_token( 69 | self, 70 | response: requests.Response, 71 | previous_token: Any | None, # noqa: ANN401 72 | ) -> Any | None: # noqa: ANN401 73 | """Return a token for identifying next page or None if no more pages.""" 74 | if ( 75 | previous_token 76 | and self.MAX_RESULTS_LIMIT 77 | and ( 78 | cast("int", previous_token) * self.MAX_PER_PAGE 79 | >= self.MAX_RESULTS_LIMIT 80 | ) 81 | ): 82 | return None 83 | 84 | # Leverage header links returned by the GitHub API. 85 | if "next" not in response.links: 86 | return None 87 | 88 | resp_json = response.json() 89 | results = resp_json if isinstance(resp_json, list) else resp_json.get("items") 90 | 91 | # Exit early if the response has no items. ? Maybe duplicative the "next" link check. # noqa: E501 92 | if not results: 93 | return None 94 | 95 | # Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do not support # noqa: E501 96 | # the "since" parameter out of the box. So we use a workaround here to exit early. 
# noqa: E501 97 | # For such streams, we sort by descending dates (most recent first), and paginate # noqa: E501 98 | # "back in time" until we reach records before our "fake_since" parameter. 99 | if self.replication_key and self.use_fake_since_parameter: 100 | request_parameters = parse_qs(str(urlparse(response.request.url).query)) 101 | # parse_qs interprets "+" as a space, revert this to keep an aware datetime 102 | try: 103 | since = ( 104 | request_parameters["fake_since"][0].replace(" ", "+") 105 | if "fake_since" in request_parameters 106 | else "" 107 | ) 108 | except IndexError: 109 | return None 110 | 111 | direction = ( 112 | request_parameters["direction"][0] 113 | if "direction" in request_parameters 114 | else None 115 | ) 116 | 117 | # commit_timestamp is a constructed key which does not exist in the raw response # noqa: E501 118 | replication_date = ( 119 | results[-1][self.replication_key] 120 | if self.replication_key != "commit_timestamp" 121 | else results[-1]["commit"]["committer"]["date"] 122 | ) 123 | # exit early if the replication_date is before our since parameter 124 | if ( 125 | since 126 | and direction == "desc" 127 | and (parse(replication_date) < parse(since)) 128 | ): 129 | return None 130 | 131 | # Use header links returned by the GitHub API. 
132 | parsed_url = urlparse(response.links["next"]["url"]) 133 | captured_page_value_list = parse_qs(parsed_url.query).get("page") 134 | next_page_string = ( 135 | captured_page_value_list[0] if captured_page_value_list else None 136 | ) 137 | if next_page_string and next_page_string.isdigit(): 138 | return int(next_page_string) 139 | 140 | return (previous_token or 1) + 1 141 | 142 | def get_url_params( 143 | self, 144 | context: Context | None, 145 | next_page_token: Any | None, # noqa: ANN401 146 | ) -> dict[str, Any]: 147 | """Return a dictionary of values to be used in URL parameterization.""" 148 | params: dict = {"per_page": self.MAX_PER_PAGE} 149 | if next_page_token: 150 | params["page"] = next_page_token 151 | 152 | if self.replication_key == "updated_at": 153 | params["sort"] = "updated" 154 | params["direction"] = "desc" if self.use_fake_since_parameter else "asc" 155 | 156 | # Unfortunately the /starred, /stargazers (starred_at) and /events (created_at) endpoints do not support # noqa: E501 157 | # the "since" parameter out of the box. But we use a workaround in 'get_next_page_token'. # noqa: E501 158 | elif self.replication_key in ["starred_at", "created_at"]: 159 | params["sort"] = "created" 160 | params["direction"] = "desc" 161 | 162 | # Warning: /commits endpoint accept "since" but results are ordered by descending commit_timestamp # noqa: E501 163 | elif self.replication_key == "commit_timestamp": 164 | params["direction"] = "desc" 165 | 166 | elif self.replication_key: 167 | self.logger.warning( 168 | f"The replication key '{self.replication_key}' is not fully supported by this client yet." 
# noqa: E501 169 | ) 170 | 171 | since = self.get_starting_timestamp(context) 172 | since_key = "since" if not self.use_fake_since_parameter else "fake_since" 173 | if self.replication_key and since: 174 | params[since_key] = since.isoformat(sep="T") 175 | # Leverage conditional requests to save API quotas 176 | # https://github.community/t/how-does-if-modified-since-work/139627 177 | self.http_headers["If-modified-since"] = email.utils.format_datetime(since) 178 | return params 179 | 180 | def validate_response(self, response: requests.Response) -> None: 181 | """Validate HTTP response. 182 | 183 | In case an error is tolerated, continue without raising it. 184 | 185 | In case an error is deemed transient and can be safely retried, then this 186 | method should raise an :class:`singer_sdk.exceptions.RetriableAPIError`. 187 | 188 | Args: 189 | response: A `requests.Response`_ object. 190 | 191 | Raises: 192 | FatalAPIError: If the request is not retriable. 193 | RetriableAPIError: If the request is retriable. 194 | 195 | .. _requests.Response: 196 | https://docs.python-requests.org/en/latest/api/#requests.Response 197 | """ 198 | full_path = urlparse(response.url).path 199 | if response.status_code in ( 200 | [*self.tolerated_http_errors, EMPTY_REPO_ERROR_STATUS] 201 | ): 202 | msg = ( 203 | f"{response.status_code} Tolerated Status Code " 204 | f"(Reason: {response.reason}) for path: {full_path}" 205 | ) 206 | self.logger.info(msg) 207 | return 208 | 209 | if 400 <= response.status_code < 500: 210 | msg = ( 211 | f"{response.status_code} Client Error: " 212 | f"{response.content!s} (Reason: {response.reason}) for path: {full_path}" # noqa: E501 213 | ) 214 | # Retry on rate limiting 215 | if ( 216 | response.status_code == 403 217 | and "rate limit exceeded" in str(response.content).lower() 218 | ): 219 | # Update token 220 | self.authenticator.get_next_auth_token() 221 | # Raise an error to force a retry with the new token. 
222 | raise RetriableAPIError(msg, response) 223 | 224 | # Retry on secondary rate limit 225 | if ( 226 | response.status_code == 403 227 | and "secondary rate limit" in str(response.content).lower() 228 | ): 229 | # Wait about a minute and retry 230 | time.sleep(60 + 30 * random.random()) 231 | raise RetriableAPIError(msg, response) 232 | 233 | # The GitHub API randomly returns 401 Unauthorized errors, so we try again. 234 | if ( 235 | response.status_code == 401 236 | # if the token is invalid, we are also told about it 237 | and "bad credentials" not in str(response.content).lower() 238 | ): 239 | raise RetriableAPIError(msg, response) 240 | 241 | # all other errors are fatal 242 | # Note: The API returns a 404 "Not Found" if trying to read a repo 243 | # for which the token is not allowed access. 244 | raise FatalAPIError(msg) 245 | 246 | elif 500 <= response.status_code < 600: 247 | msg = ( 248 | f"{response.status_code} Server Error: " 249 | f"{response.content!s} (Reason: {response.reason}) for path: {full_path}" # noqa: E501 250 | ) 251 | raise RetriableAPIError(msg, response) 252 | 253 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 254 | """Parse the response and return an iterator of result rows.""" 255 | # TODO - Split into handle_reponse and parse_response. 256 | if response.status_code in ( 257 | [*self.tolerated_http_errors, EMPTY_REPO_ERROR_STATUS] 258 | ): 259 | return 260 | 261 | # Update token rate limit info and loop through tokens if needed. 
262 | self.authenticator.update_rate_limit(response.headers) 263 | 264 | resp_json = response.json() 265 | 266 | if isinstance(resp_json, list): 267 | results = resp_json 268 | elif resp_json.get("items") is not None: 269 | results = resp_json.get("items") 270 | else: 271 | results = [resp_json] 272 | 273 | yield from results 274 | 275 | def post_process(self, row: dict, context: Context | None = None) -> dict: 276 | """Add `repo_id` by default to all streams.""" 277 | if context is not None and "repo_id" in context: 278 | row["repo_id"] = context["repo_id"] 279 | return row 280 | 281 | def backoff_handler(self, details: Details) -> None: 282 | """Handle retriable error by swapping auth token.""" 283 | self.logger.info("Retrying request with different token") 284 | # use python introspection to obtain the error object 285 | # FIXME: replace this once https://github.com/litl/backoff/issues/158 286 | # is fixed 287 | exc = cast( 288 | "FrameType", 289 | cast("FrameType", cast("FrameType", inspect.currentframe()).f_back).f_back, 290 | ).f_locals["e"] 291 | if ( 292 | exc.response is not None 293 | and exc.response.status_code == 403 294 | and "rate limit exceeded" in str(exc.response.content) 295 | ): 296 | # we hit a rate limit, rotate token 297 | prepared_request = details["args"][0] 298 | self.authenticator.get_next_auth_token() 299 | prepared_request.headers.update(self.authenticator.auth_headers or {}) 300 | 301 | def calculate_sync_cost( 302 | self, 303 | request: requests.PreparedRequest, 304 | response: requests.Response, 305 | context: Context | None, 306 | ) -> dict[str, int]: 307 | """Return the cost of the last REST API call.""" 308 | return {"rest": 1, "graphql": 0, "search": 0} 309 | 310 | 311 | class GitHubDiffStream(GitHubRestStream): 312 | """Base class for GitHub diff streams.""" 313 | 314 | # Known Github API errors for diff requests 315 | tolerated_http_errors: ClassVar[list[int]] = [404, 406, 422, 502] 316 | 317 | @property 318 | def 
http_headers(self) -> dict: 319 | """Return the http headers needed for diff requests.""" 320 | headers = super().http_headers 321 | headers["Accept"] = "application/vnd.github.v3.diff" 322 | return headers 323 | 324 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 325 | """Parse the response to yield the diff text instead of an object 326 | and prevent buffer overflow.""" 327 | if response.status_code != 200: 328 | contents = response.json() 329 | self.logger.info( 330 | "Skipping %s due to %d error: %s", 331 | self.name.replace("_", " "), 332 | response.status_code, 333 | contents["message"], 334 | ) 335 | yield { 336 | "success": False, 337 | "error_message": contents["message"], 338 | } 339 | return 340 | 341 | if content_length_str := response.headers.get("Content-Length"): 342 | content_length = int(content_length_str) 343 | max_size = 41_943_040 # 40 MiB 344 | if content_length > max_size: 345 | self.logger.info( 346 | "Skipping %s. The diff size (%.2f MiB) exceeded the maximum" 347 | " size limit of 40 MiB.", 348 | self.name.replace("_", " "), 349 | content_length / 1024 / 1024, 350 | ) 351 | yield { 352 | "success": False, 353 | "error_message": "Diff exceeded the maximum size limit of 40 MiB.", 354 | } 355 | return 356 | 357 | yield {"diff": response.text, "success": True} 358 | 359 | 360 | class GitHubGraphqlStream(GraphQLStream, GitHubRestStream): 361 | """GitHub Graphql stream class.""" 362 | 363 | @property 364 | def url_base(self) -> str: 365 | return f"{self.config.get('api_url_base', self.DEFAULT_API_BASE_URL)}/graphql" 366 | 367 | # the jsonpath under which to fetch the list of records from the graphql response 368 | query_jsonpath: str = "$.data.[*]" 369 | 370 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 371 | """Parse the response and return an iterator of result rows. 372 | 373 | Args: 374 | response: A raw `requests.Response`_ object. 
375 | 376 | Yields: 377 | One item for every item found in the response. 378 | 379 | .. _requests.Response: 380 | https://docs.python-requests.org/en/latest/api/#requests.Response 381 | """ 382 | resp_json = response.json() 383 | yield from extract_jsonpath(self.query_jsonpath, input=resp_json) 384 | 385 | def get_next_page_token( 386 | self, 387 | response: requests.Response, 388 | previous_token: Any | None, # noqa: ANN401 389 | ) -> Any | None: # noqa: ANN401 390 | """ 391 | Return a dict of cursors for identifying next page or None if no more pages. 392 | 393 | Note - pagination requires the Graphql query to have nextPageCursor_X parameters 394 | with the assosciated hasNextPage_X, startCursor_X and endCursor_X. 395 | 396 | X should be an integer between 0 and 9, increasing with query depth. 397 | 398 | Warning - we recommend to avoid using deep (nested) pagination. 399 | """ 400 | 401 | resp_json = response.json() 402 | 403 | # Find if results contains "hasNextPage_X" flags and if any are True. 404 | # If so, set nextPageCursor_X to endCursor_X for X max. 405 | 406 | next_page_results = nested_lookup( 407 | key="hasNextPage_", 408 | document=resp_json, 409 | wild=True, 410 | with_keys=True, 411 | ) 412 | 413 | has_next_page_indices: list[int] = [] 414 | # Iterate over all the items and filter items with hasNextPage = True. 415 | for key, value in next_page_results.items(): 416 | # Check if key is even then add pair to new dictionary 417 | if any(value): 418 | pagination_index = int(str(key).split("_")[1]) 419 | has_next_page_indices.append(pagination_index) 420 | 421 | # Check if any "hasNextPage" is True. Otherwise, exit early. 422 | if not len(has_next_page_indices) > 0: 423 | return None 424 | 425 | # Get deepest pagination item 426 | max_pagination_index = max(has_next_page_indices) 427 | 428 | # We leverage previous_token to remember the pagination cursors 429 | # for indices below max_pagination_index. 
430 | next_page_cursors: dict[str, str] = {} 431 | for key, value in (previous_token or {}).items(): 432 | # Only keep pagination info for indices below max_pagination_index. 433 | pagination_index = int(str(key).split("_")[1]) 434 | if pagination_index < max_pagination_index: 435 | next_page_cursors[key] = value 436 | 437 | # Get the pagination cursor to update and increment it. 438 | next_page_end_cursor_results = nested_lookup( 439 | key=f"endCursor_{max_pagination_index}", 440 | document=resp_json, 441 | ) 442 | 443 | next_page_key = f"nextPageCursor_{max_pagination_index}" 444 | next_page_cursor = next( 445 | cursor for cursor in next_page_end_cursor_results if cursor is not None 446 | ) 447 | next_page_cursors[next_page_key] = next_page_cursor 448 | 449 | return next_page_cursors 450 | 451 | def get_url_params( 452 | self, 453 | context: Context | None, 454 | next_page_token: Any | None, # noqa: ANN401 455 | ) -> dict[str, Any]: 456 | """Return a dictionary of values to be used in URL parameterization.""" 457 | params = dict(context) if context else {} 458 | params["per_page"] = self.MAX_PER_PAGE 459 | if next_page_token: 460 | params.update(next_page_token) 461 | 462 | since = self.get_starting_timestamp(context) 463 | if self.replication_key and since: 464 | params["since"] = since.isoformat(sep="T") 465 | 466 | return params 467 | 468 | def calculate_sync_cost( 469 | self, 470 | request: requests.PreparedRequest, 471 | response: requests.Response, 472 | context: Context | None, 473 | ) -> dict[str, int]: 474 | """Return the cost of the last graphql API call.""" 475 | costgen = extract_jsonpath("$.data.rateLimit.cost", input=response.json()) 476 | # calculate_sync_cost is called before the main response parsing. 477 | # In some cases, the tap crashes here before we have been able to 478 | # properly analyze where the error comes from, so we ignore these 479 | # costs to allow figuring out what happened downstream, by setting 480 | # them to 0. 
481 | cost = next(costgen, 0) 482 | return {"rest": 0, "graphql": int(cost), "search": 0} 483 | 484 | def validate_response(self, response: requests.Response) -> None: 485 | """Validate HTTP response. 486 | 487 | The graphql spec is a bit confusing around response codes 488 | (https://github.com/graphql/graphql-over-http/blob/main/spec/GraphQLOverHTTP.md#response) 489 | Github's API is a bit of a free adaptation of standards, so we 490 | choose fail immediately on error here, so that something is logged 491 | at the very minimum. 492 | 493 | Args: 494 | response: A `requests.Response`_ object. 495 | 496 | Raises: 497 | FatalAPIError: If the request is not retriable. 498 | RetriableAPIError: If the request is retriable. 499 | """ 500 | super().validate_response(response) 501 | rj = response.json() 502 | if "errors" in rj: 503 | msg = rj["errors"] 504 | raise FatalAPIError(f"Graphql error: {msg}", response) 505 | -------------------------------------------------------------------------------- /tap_github/organization_streams.py: -------------------------------------------------------------------------------- 1 | """User Stream types classes for tap-github.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Any, ClassVar 6 | 7 | from singer_sdk import typing as th # JSON Schema typing helpers 8 | 9 | from tap_github.client import GitHubRestStream 10 | 11 | if TYPE_CHECKING: 12 | from collections.abc import Iterable 13 | 14 | from singer_sdk.helpers.types import Context 15 | 16 | 17 | class OrganizationStream(GitHubRestStream): 18 | """Defines a GitHub Organization Stream. 
class OrganizationMembersStream(GitHubRestStream):
    """One record per member of each configured organization.

    API Reference: https://docs.github.com/en/rest/orgs/members?apiVersion=2022-11-28#list-organization-members
    """

    name = "organization_members"
    primary_keys: ClassVar[list[str]] = ["id"]
    path = "/orgs/{org}/members"
    # Full-table child stream: parent replication state is irrelevant here.
    ignore_parent_replication_key = True
    parent_stream_type = OrganizationStream
    state_partitioning_keys: ClassVar[list[str]] = ["org"]
    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        # Rest
        th.Property("login", th.StringType),
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("avatar_url", th.StringType),
        th.Property("gravatar_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("type", th.StringType),
        th.Property("site_admin", th.BooleanType),
    ).to_dict()


class TeamsStream(GitHubRestStream):
    """One record per team of each configured organization.

    API Reference: https://docs.github.com/en/rest/reference/teams#list-teams
    """

    name = "teams"
    primary_keys: ClassVar[list[str]] = ["id"]
    path = "/orgs/{org}/teams"
    ignore_parent_replication_key = True
    parent_stream_type = OrganizationStream
    state_partitioning_keys: ClassVar[list[str]] = ["org"]

    def get_child_context(self, record: dict, context: Context | None) -> dict:
        """Merge the team slug into the parent (org) context for child streams."""
        new_context = {"team_slug": record["slug"]}
        if context:
            return {
                **context,
                **new_context,
            }
        return new_context

    schema = th.PropertiesList(
        # Parent Keys
        th.Property("org", th.StringType),
        # Rest
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("name", th.StringType),
        th.Property("slug", th.StringType),
        th.Property("description", th.StringType),
        th.Property("privacy", th.StringType),
        th.Property("permission", th.StringType),
        th.Property("members_url", th.StringType),
        th.Property("repositories_url", th.StringType),
        # Nested parent team, present for child teams only.
        th.Property(
            "parent",
            th.ObjectType(
                th.Property("id", th.IntegerType),
                th.Property("node_id", th.StringType),
                th.Property("url", th.StringType),
                th.Property("html_url", th.StringType),
                th.Property("name", th.StringType),
                th.Property("slug", th.StringType),
                th.Property("description", th.StringType),
                th.Property("privacy", th.StringType),
                th.Property("permission", th.StringType),
                th.Property("members_url", th.StringType),
                th.Property("repositories_url", th.StringType),
            ),
        ),
    ).to_dict()


class TeamMembersStream(GitHubRestStream):
    """One record per member of each team.

    API Reference: https://docs.github.com/en/rest/reference/teams#list-team-members
    """

    name = "team_members"
    primary_keys: ClassVar[list[str]] = ["id", "team_slug"]
    path = "/orgs/{org}/teams/{team_slug}/members"
    ignore_parent_replication_key = True
    parent_stream_type = TeamsStream
    state_partitioning_keys: ClassVar[list[str]] = ["team_slug", "org"]

    def get_child_context(self, record: dict, context: Context | None) -> dict:
        """Merge the member login into the parent (org/team) context."""
        new_context = {"username": record["login"]}
        if context:
            return {
                **context,
                **new_context,
            }
        return new_context

    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        th.Property("team_slug", th.StringType),
        # Rest
        th.Property("login", th.StringType),
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("avatar_url", th.StringType),
        th.Property("gravatar_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("type", th.StringType),
        th.Property("site_admin", th.BooleanType),
    ).to_dict()


class TeamRolesStream(GitHubRestStream):
    """One record per (team, user) membership, with the member's role.

    API Reference: https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user
    """

    name = "team_roles"
    path = "/orgs/{org}/teams/{team_slug}/memberships/{username}"
    ignore_parent_replication_key = True
    primary_keys: ClassVar[list[str]] = ["url"]
    parent_stream_type = TeamMembersStream
    state_partitioning_keys: ClassVar[list[str]] = ["username", "team_slug", "org"]

    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        th.Property("team_slug", th.StringType),
        th.Property("username", th.StringType),
        # Rest
        th.Property("url", th.StringType),
        th.Property("role", th.StringType),
        th.Property("state", th.StringType),
    ).to_dict()
--------------------------------------------------------------------------------
/tap_github/schema_objects.py:
--------------------------------------------------------------------------------
"""Reusable schema objects for tap-github.

Below are a few common patterns in the github API
factored out as reusable objects. They help in making the
schema more readable and error-free.
"""

from singer_sdk import typing as th  # JSON Schema typing helpers
6 | """ 7 | 8 | from singer_sdk import typing as th # JSON Schema typing helpers 9 | 10 | # This user object is common throughout the API results 11 | user_object = th.ObjectType( 12 | th.Property("login", th.StringType), 13 | th.Property("id", th.IntegerType), 14 | th.Property("node_id", th.StringType), 15 | th.Property("avatar_url", th.StringType), 16 | th.Property("gravatar_id", th.StringType), 17 | th.Property("html_url", th.StringType), 18 | th.Property("type", th.StringType), 19 | th.Property("site_admin", th.BooleanType), 20 | ) 21 | 22 | # some objects are shared between issues and pull requests 23 | label_object = th.ObjectType( 24 | th.Property("id", th.IntegerType), 25 | th.Property("node_id", th.StringType), 26 | th.Property("url", th.StringType), 27 | th.Property("name", th.StringType), 28 | th.Property("description", th.StringType), 29 | th.Property("color", th.StringType), 30 | th.Property("default", th.BooleanType), 31 | ) 32 | 33 | milestone_object = th.ObjectType( 34 | th.Property("html_url", th.StringType), 35 | th.Property("node_id", th.StringType), 36 | th.Property("id", th.IntegerType), 37 | th.Property("number", th.IntegerType), 38 | th.Property("state", th.StringType), 39 | th.Property("title", th.StringType), 40 | th.Property("description", th.StringType), 41 | th.Property("creator", user_object), 42 | th.Property("open_issues", th.IntegerType), 43 | th.Property("closed_issues", th.IntegerType), 44 | th.Property("created_at", th.DateTimeType), 45 | th.Property("updated_at", th.DateTimeType), 46 | th.Property("closed_at", th.DateTimeType), 47 | th.Property("due_on", th.DateTimeType), 48 | ) 49 | 50 | reactions_object = th.ObjectType( 51 | th.Property("url", th.StringType), 52 | th.Property("total_count", th.IntegerType), 53 | th.Property("plus_one", th.IntegerType), 54 | th.Property("minus_one", th.IntegerType), 55 | th.Property("laugh", th.IntegerType), 56 | th.Property("hooray", th.IntegerType), 57 | th.Property("confused", 
th.IntegerType), 58 | th.Property("heart", th.IntegerType), 59 | th.Property("rocket", th.IntegerType), 60 | th.Property("eyes", th.IntegerType), 61 | ) 62 | 63 | files_object = th.ObjectType( 64 | th.Property("sha", th.StringType), 65 | th.Property("filename", th.StringType), 66 | th.Property("status", th.StringType), 67 | th.Property("additions", th.IntegerType), 68 | th.Property("deletions", th.IntegerType), 69 | th.Property("changes", th.IntegerType), 70 | th.Property("blob_url", th.StringType), 71 | th.Property("raw_url", th.StringType), 72 | th.Property("contents_url", th.StringType), 73 | th.Property("patch", th.StringType), 74 | th.Property("previous_filename", th.StringType), 75 | ) 76 | -------------------------------------------------------------------------------- /tap_github/scraping.py: -------------------------------------------------------------------------------- 1 | """Utility functions for scraping https://github.com 2 | 3 | Inspired by https://github.com/dogsheep/github-to-sqlite/pull/70 4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import logging 9 | import re 10 | import time 11 | from datetime import datetime, timezone 12 | from typing import TYPE_CHECKING, Any, cast 13 | from urllib.parse import urlparse 14 | 15 | import requests 16 | 17 | if TYPE_CHECKING: 18 | from collections.abc import Iterable 19 | 20 | from bs4 import NavigableString, Tag 21 | 22 | used_by_regex = re.compile(" {3}Used by ") 23 | contributors_regex = re.compile(" {3}Contributors ") 24 | 25 | 26 | def scrape_dependents( 27 | response: requests.Response, logger: logging.Logger | None = None 28 | ) -> Iterable[dict[str, Any]]: 29 | from bs4 import BeautifulSoup 30 | 31 | logger = logger or logging.getLogger("scraping") 32 | 33 | soup = BeautifulSoup(response.content, "html.parser") 34 | # Navigate through Package toggle if present 35 | base_url = urlparse(response.url).hostname or "github.com" 36 | options = soup.find_all("a", class_="select-menu-item") 37 | 
links = [link["href"] for link in options] if len(options) > 0 else [response.url] 38 | 39 | logger.debug(links) 40 | 41 | for link in links: 42 | yield from _scrape_dependents(f"https://{base_url}/{link}", logger) 43 | 44 | 45 | def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[dict[str, Any]]: 46 | # Optional dependency: 47 | from bs4 import BeautifulSoup 48 | 49 | s = requests.Session() 50 | 51 | while url: 52 | logger.debug(url) 53 | response = s.get(url) 54 | soup = BeautifulSoup(response.content, "html.parser") 55 | 56 | repo_names = [ 57 | (a["href"] if not isinstance(a["href"], list) else a["href"][0]).lstrip("/") 58 | for a in soup.select("a[data-hovercard-type=repository]") 59 | ] 60 | stars = [ 61 | int(s.next_sibling.strip()) 62 | for s in soup.find_all("svg", {"class": "octicon octicon-star"}) 63 | ] 64 | forks = [ 65 | int(s.next_sibling.strip()) 66 | for s in soup.find_all("svg", {"class": "octicon octicon-repo-forked"}) 67 | ] 68 | 69 | if not len(repo_names) == len(stars) == len(forks): 70 | raise IndexError( 71 | "Could not find star and fork info. Maybe the GitHub page format has changed?" # noqa: E501 72 | ) 73 | 74 | repos = [ 75 | {"name_with_owner": name, "stars": s, "forks": f} 76 | for name, s, f in zip(repo_names, stars, forks) 77 | ] 78 | 79 | logger.debug(repos) 80 | 81 | yield from repos 82 | 83 | # next page? 84 | try: 85 | next_link: Tag = soup.select(".paginate-container")[0].find_all( 86 | "a", text="Next" 87 | )[0] 88 | except IndexError: 89 | break 90 | if next_link is not None: 91 | href = next_link["href"] 92 | url = str(href if not isinstance(href, list) else href[0]) 93 | time.sleep(1) 94 | else: 95 | url = "" 96 | 97 | 98 | def parse_counter(tag: Tag | NavigableString | None) -> int: 99 | """ 100 | Extract a count of [issues|PR|contributors...] from an HTML tag. 101 | For very high numbers, we only get an approximate value as github 102 | does not provide the actual number. 
103 | """ 104 | if not tag: 105 | return 0 106 | try: 107 | if tag == "\n": 108 | return 0 109 | title = tag["title"] # type: ignore 110 | if isinstance(title, str): 111 | title_string = cast("str", title) 112 | else: 113 | title_string = cast("str", title[0]) 114 | return int(title_string.strip().replace(",", "").replace("+", "")) 115 | except (KeyError, ValueError) as e: 116 | raise IndexError( 117 | f"Could not parse counter {tag}. Maybe the GitHub page format has changed?" 118 | ) from e 119 | 120 | 121 | def scrape_metrics( 122 | response: requests.Response, logger: logging.Logger | None = None 123 | ) -> Iterable[dict[str, Any]]: 124 | from bs4 import BeautifulSoup 125 | 126 | logger = logger or logging.getLogger("scraping") 127 | 128 | soup = BeautifulSoup(response.content, "html.parser") 129 | 130 | try: 131 | issues = parse_counter(soup.find("span", id="issues-repo-tab-count")) 132 | prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count")) 133 | except IndexError as e: 134 | # These two items should exist. We raise an error if we could not find them. 135 | raise IndexError( 136 | "Could not find issues or prs info. Maybe the GitHub page format has changed?" # noqa: E501 137 | ) from e 138 | 139 | dependents_node = soup.find(string=used_by_regex) 140 | # verify that we didn't hit some random text in the page. 
--------------------------------------------------------------------------------
/tap_github/streams.py:
--------------------------------------------------------------------------------
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING

from tap_github.organization_streams import (
    OrganizationMembersStream,
    OrganizationStream,
    TeamMembersStream,
    TeamRolesStream,
    TeamsStream,
)
from tap_github.repository_streams import (
    AnonymousContributorsStream,
    AssigneesStream,
    BranchesStream,
    CollaboratorsStream,
    CommitCommentsStream,
    CommitDiffsStream,
    CommitsStream,
    CommunityProfileStream,
    ContributorsStream,
    CustomPropertiesStream,
    DependenciesStream,
    DependentsStream,
    DeploymentsStream,
    DeploymentStatusesStream,
    EventsStream,
    ExtraMetricsStream,
    IssueCommentsStream,
    IssueEventsStream,
    IssuesStream,
    LabelsStream,
    LanguagesStream,
    MilestonesStream,
    PullRequestCommitDiffsStream,
    PullRequestCommitsStream,
    PullRequestDiffsStream,
    PullRequestsStream,
    ReadmeHtmlStream,
    ReadmeStream,
    ReleasesStream,
    RepositoryStream,
    ReviewCommentsStream,
    ReviewsStream,
    StargazersGraphqlStream,
    StargazersStream,
    StatsContributorsStream,
    TagsStream,
    TrafficClonesStream,
    TrafficPageViewsStream,
    TrafficReferralPathsStream,
    TrafficReferrersStream,
    WorkflowRunJobsStream,
    WorkflowRunsStream,
    WorkflowsStream,
)
from tap_github.user_streams import StarredStream, UserContributedToStream, UserStream

if TYPE_CHECKING:
    from singer_sdk.streams.core import Stream


class Streams(Enum):
    """
    Represents all streams our tap supports, and which queries (by username, by organization, etc.) you can use.
    """  # noqa: E501

    # Each enum member's value is a (valid_queries, streams) tuple,
    # unpacked by __init__ below.
    valid_queries: set[str]
    streams: list[type[Stream]]

    def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None:
        self.valid_queries = valid_queries
        self.streams = streams

    REPOSITORY = (
        {"repositories", "organizations", "searches"},
        [
            AnonymousContributorsStream,
            AssigneesStream,
            BranchesStream,
            CollaboratorsStream,
            CommitCommentsStream,
            CommitsStream,
            CommitDiffsStream,
            CommunityProfileStream,
            ContributorsStream,
            DependenciesStream,
            DependentsStream,
            DeploymentsStream,
            DeploymentStatusesStream,
            EventsStream,
            IssueCommentsStream,
            IssueEventsStream,
            IssuesStream,
            LabelsStream,
            LanguagesStream,
            MilestonesStream,
            PullRequestCommitsStream,
            PullRequestCommitDiffsStream,
            PullRequestDiffsStream,
            PullRequestsStream,
            ReadmeHtmlStream,
            ReadmeStream,
            ReleasesStream,
            ExtraMetricsStream,
            RepositoryStream,
            ReviewCommentsStream,
            ReviewsStream,
            StargazersGraphqlStream,
            StargazersStream,
            StatsContributorsStream,
            TagsStream,
            TrafficClonesStream,
            TrafficPageViewsStream,
            TrafficReferralPathsStream,
            TrafficReferrersStream,
            WorkflowRunJobsStream,
            WorkflowRunsStream,
            WorkflowsStream,
        ],
    )
    USERS = (
        {"user_usernames", "user_ids"},
        [
            StarredStream,
            UserContributedToStream,
            UserStream,
        ],
    )
    ORGANIZATIONS = (
        {"organizations"},
        [
            CustomPropertiesStream,
            OrganizationStream,
            OrganizationMembersStream,
            TeamMembersStream,
            TeamRolesStream,
            TeamsStream,
        ],
    )

    @classmethod
    def all_valid_queries(cls) -> set[str]:
        # Union of every member's valid query keys.
        return set.union(*[stream.valid_queries for stream in Streams])
--------------------------------------------------------------------------------
/tap_github/tap.py:
--------------------------------------------------------------------------------
"""GitHub tap class."""

from __future__ import annotations

import logging
import os

from singer_sdk import Stream, Tap
from singer_sdk import typing as th  # JSON schema typing helpers
from singer_sdk.helpers._classproperty import classproperty

from tap_github.streams import Streams


class TapGitHub(Tap):
    """Singer tap for the GitHub API."""

    name = "tap-github"
    package_name = "meltanolabs-tap-github"

    @classproperty
    def logger(cls) -> logging.Logger:  # noqa: N805
        """Get logger.

        Returns:
            Logger with local LOGLEVEL. LOGLEVEL from env takes priority.
        """

        LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper()  # noqa: N806
        # NOTE(review): `assert` is stripped when Python runs with -O;
        # consider raising ValueError for invalid LOGLEVEL instead.
        assert LOGLEVEL in logging._levelToName.values(), (
            f"Invalid LOGLEVEL configuration: {LOGLEVEL}"
        )
        logger = logging.getLogger(cls.name)
        logger.setLevel(LOGLEVEL)
        return logger

    # JSON-schema for tap settings, surfaced via --about/--capabilities.
    config_jsonschema = th.PropertiesList(
        th.Property(
            "user_agent",
            th.StringType,
            description="User agent to use for API requests.",
        ),
        th.Property("metrics_log_level", th.StringType),
        # Authentication options
        th.Property(
            "auth_token",
            th.StringType,
            description="GitHub token to authenticate with.",
        ),
        th.Property(
            "additional_auth_tokens",
            th.ArrayType(th.StringType),
            description="List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits.",  # noqa: E501
        ),
        th.Property(
            "auth_app_keys",
            th.ArrayType(th.StringType),
            description=(
                "List of GitHub App credentials to authenticate with. Each credential "
                "can be constructed by combining an App ID and App private key into "
                "the format `:app_id:;;-----BEGIN RSA PRIVATE KEY-----\n_YOUR_P_KEY_\n-----END RSA PRIVATE KEY-----`."  # noqa: E501
            ),
        ),
        th.Property(
            "rate_limit_buffer",
            th.IntegerType,
            description="Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000.",  # noqa: E501
        ),
        th.Property(
            "expiry_time_buffer",
            th.IntegerType,
            description=(
                "When authenticating as a GitHub App, this buffer controls how many "
                "minutes before expiry the GitHub app tokens will be refreshed. "
                "Defaults to 10 minutes."
            ),
        ),
        th.Property(
            "searches",
            th.ArrayType(
                th.ObjectType(
                    th.Property("name", th.StringType, required=True),
                    th.Property("query", th.StringType, required=True),
                )
            ),
            description=(
                "An array of search descriptor objects with the following properties:\n"
                '"name" - a human readable name for the search query.\n'
                '"query" - a github search string (generally the same as would come after ?q= in the URL)"'  # noqa: E501
            ),
        ),
        th.Property("organizations", th.ArrayType(th.StringType)),
        th.Property("repositories", th.ArrayType(th.StringType)),
        th.Property("user_usernames", th.ArrayType(th.StringType)),
        th.Property("user_ids", th.ArrayType(th.StringType)),
        th.Property(
            "start_date",
            th.DateTimeType,
            description="Start date for incremental sync.",
        ),
        th.Property("stream_maps", th.ObjectType()),
        th.Property("stream_map_config", th.ObjectType()),
        th.Property(
            "skip_parent_streams",
            th.BooleanType,
            description=(
                "Set to true to skip API calls for the parent "
                "streams (such as repositories) if it is not selected but children are"
            ),
        ),
        th.Property(
            "stream_options",
            th.ObjectType(
                th.Property(
                    "milestones",
                    th.ObjectType(
                        th.Property(
                            "state",
                            th.StringType,
                            description=(
                                "Configures which states are of interest. "
                                "Must be one of [open, closed, all], defaults to open."
                            ),
                            default="open",
                            allowed_values=["open", "closed", "all"],
                        ),
                        additional_properties=False,
                    ),
                    description="Options specific to the 'milestones' stream.",
                ),
                additional_properties=False,
            ),
            description="Options which change the behaviour of a specific stream.",
        ),
    ).to_dict()

    def discover_streams(self) -> list[Stream]:
        """Return a list of discovered streams for each query."""

        # If the config is empty, assume we are running --help or --capabilities.
        if (
            self.config
            and len(Streams.all_valid_queries().intersection(self.config)) != 1
        ):
            raise ValueError(
                "This tap requires one and only one of the following path options: "
                f"{Streams.all_valid_queries()}."
            )
        streams = []
        for stream_type in Streams:
            # Instantiate a stream group only when one of its query keys is configured
            # (or when no config is present at all, e.g. during discovery).
            if (not self.config) or len(
                stream_type.valid_queries.intersection(self.config)
            ) > 0:
                streams += [
                    StreamClass(tap=self) for StreamClass in stream_type.streams
                ]

        if not streams:
            raise ValueError("No valid streams found.")
        return streams


# CLI Execution:

cli = TapGitHub.cli
--------------------------------------------------------------------------------
/tap_github/tests/__init__.py:
--------------------------------------------------------------------------------
"""Test suite for tap-github."""

import requests_cache

# Setup caching for all api calls done through `requests` in order to limit
# rate limiting problems with github.
# Use the sqlite backend as it's the default option and seems to be best supported.
# To clear the cache, just delete the sqlite db file at api_calls_tests_cache.sqlite
# in the root of this repository
8 | # To clear the cache, just delete the sqlite db file at api_calls_tests_cache.sqlite 9 | # in the root of this repository 10 | requests_cache.install_cache( 11 | ".cache/api_calls_tests_cache", 12 | backend="sqlite", 13 | # make sure that API keys don't end up being cached 14 | # Also ignore user-agent so that various versions of request 15 | # can share the cache 16 | ignored_parameters=["Authorization", "User-Agent", "If-modified-since"], 17 | # tell requests_cache to check headers for the above parameter 18 | match_headers=True, 19 | # expire the cache after 24h (86400 seconds) 20 | expire_after=24 * 60 * 60, 21 | # make sure graphql calls get cached as well 22 | allowable_methods=["GET", "POST"], 23 | ) 24 | -------------------------------------------------------------------------------- /tap_github/tests/fixtures.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | import logging 5 | import os 6 | import sys 7 | from typing import TYPE_CHECKING 8 | 9 | import pytest 10 | 11 | from ..utils.filter_stdout import FilterStdOutput 12 | 13 | if TYPE_CHECKING: 14 | from singer_sdk.helpers.types import Context 15 | 16 | # Filter out singer output during tests 17 | sys.stdout = FilterStdOutput(sys.stdout, r'{"type": ') # type: ignore 18 | 19 | 20 | @pytest.fixture 21 | def search_config(): 22 | return { 23 | "metrics_log_level": "warning", 24 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 25 | "searches": [ 26 | { 27 | "name": "tap_something", 28 | "query": "tap-+language:Python", 29 | } 30 | ], 31 | } 32 | 33 | 34 | @pytest.fixture 35 | def repo_list_config(request): 36 | """ 37 | Get a default list of repos or pass your own by decorating your test with 38 | @pytest.mark.repo_list(['org1/repo1', 'org2/repo2']) 39 | """ 40 | marker = request.node.get_closest_marker("repo_list") 41 | if marker is None: 42 | repo_list = 
["MeltanoLabs/tap-github", "mapswipe/mapswipe"] 43 | else: 44 | repo_list = marker.args[0] 45 | 46 | return { 47 | "metrics_log_level": "warning", 48 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 49 | "repositories": repo_list, 50 | "rate_limit_buffer": 100, 51 | } 52 | 53 | 54 | @pytest.fixture 55 | def username_list_config(request): 56 | """ 57 | Get a default list of usernames or pass your own by decorating your test with 58 | @pytest.mark.username_list(['ericboucher', 'aaronsteers']) 59 | """ 60 | marker = request.node.get_closest_marker("username_list") 61 | username_list = ["ericboucher", "aaronsteers"] if marker is None else marker.args[0] 62 | 63 | return { 64 | "metrics_log_level": "warning", 65 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 66 | "user_usernames": username_list, 67 | "rate_limit_buffer": 100, 68 | } 69 | 70 | 71 | @pytest.fixture 72 | def user_id_list_config(request): 73 | """ 74 | Get a default list of usernames or pass your own by decorating your test with 75 | @pytest.mark.user_id_list(['ericboucher', 'aaronsteers']) 76 | """ 77 | marker = request.node.get_closest_marker("user_id_list") 78 | user_id_list = [1, 2] if marker is None else marker.args[0] 79 | 80 | return { 81 | "metrics_log_level": "warning", 82 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 83 | "user_ids": user_id_list, 84 | "rate_limit_buffer": 100, 85 | } 86 | 87 | 88 | @pytest.fixture 89 | def organization_list_config(request): 90 | """ 91 | Get a default list of organizations or pass your own by decorating your test with 92 | @pytest.mark.organization_list(['MeltanoLabs', 'oviohub']) 93 | """ 94 | marker = request.node.get_closest_marker("organization_list") 95 | 96 | organization_list = ["MeltanoLabs"] if marker is None else marker.args[0] 97 | 98 | return { 99 | "metrics_log_level": "warning", 100 | "start_date": 
def alternative_sync_chidren(
    self,
    child_context: Context,
    no_sync: bool = True,
) -> None:
    """Override for Stream._sync_children used in tests.

    Lets the collaborators stream sync with an org:write scoped token taken
    from the "ORG_LEVEL_TOKEN" environment variable; every other child stream
    follows the SDK's default selection logic.

    Args:
        self: the parent Stream instance (this function is patched in).
        child_context: partition context passed to each child sync.
        no_sync: when True (default), skip the collaborators stream entirely.
    """
    for child_stream in self.child_streams:
        # Use org:write access level credentials for collaborators stream
        if child_stream.name in ["collaborators"]:
            ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN")  # noqa: N806
            # TODO - Fix collaborators tests, likely by mocking API responses
            # directly. Currently we have to bypass them as they are failing
            # frequently.
            if not ORG_LEVEL_TOKEN or no_sync:
                logging.warning(
                    'No "ORG_LEVEL_TOKEN" found. Skipping collaborators stream sync.'
                )
                continue
            saved_github_token = os.environ.get("GITHUB_TOKEN")
            os.environ["GITHUB_TOKEN"] = ORG_LEVEL_TOKEN
            try:
                child_stream.sync(context=child_context)
            finally:
                # Bug fix: restore GITHUB_TOKEN exactly as it was, even if the
                # sync raises.  The previous code wrote "" when the variable
                # had been unset, leaving an empty token behind for later
                # tests; now the variable is removed instead.
                if saved_github_token is None:
                    os.environ.pop("GITHUB_TOKEN", None)
                else:
                    os.environ["GITHUB_TOKEN"] = saved_github_token
            continue

        # default behavior:
        if child_stream.selected or child_stream.has_selected_descendents:
            child_stream.sync(context=child_context)
datetime.now(tz=timezone.utc) 19 | 20 | 21 | class TestTokenManager: 22 | def test_default_rate_limits(self): 23 | token_manager = TokenManager("mytoken", rate_limit_buffer=700) 24 | 25 | assert token_manager.rate_limit == 5000 26 | assert token_manager.rate_limit_remaining == 5000 27 | assert token_manager.rate_limit_reset is None 28 | assert token_manager.rate_limit_used == 0 29 | assert token_manager.rate_limit_buffer == 700 30 | 31 | token_manager_2 = TokenManager("mytoken") 32 | assert token_manager_2.rate_limit_buffer == 1000 33 | 34 | def test_update_rate_limit(self): 35 | mock_response_headers = { 36 | "X-RateLimit-Limit": "5000", 37 | "X-RateLimit-Remaining": "4999", 38 | "X-RateLimit-Reset": "1372700873", 39 | "X-RateLimit-Used": "1", 40 | } 41 | 42 | token_manager = TokenManager("mytoken") 43 | token_manager.update_rate_limit(mock_response_headers) 44 | 45 | assert token_manager.rate_limit == 5000 46 | assert token_manager.rate_limit_remaining == 4999 47 | assert token_manager.rate_limit_reset == datetime( 48 | 2013, 49 | 7, 50 | 1, 51 | 17, 52 | 47, 53 | 53, 54 | tzinfo=timezone.utc, 55 | ) 56 | assert token_manager.rate_limit_used == 1 57 | 58 | def test_is_valid_token_successful(self): 59 | with patch("requests.get") as mock_get: 60 | mock_response = mock_get.return_value 61 | mock_response.raise_for_status.return_value = None 62 | 63 | token_manager = TokenManager("validtoken") 64 | 65 | assert token_manager.is_valid_token() 66 | mock_get.assert_called_once_with( 67 | url="https://api.github.com/rate_limit", 68 | headers={"Authorization": "token validtoken"}, 69 | ) 70 | 71 | def test_is_valid_token_failure(self): 72 | with patch("requests.get") as mock_get: 73 | # Setup for a failed request 74 | mock_response = mock_get.return_value 75 | mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() 76 | mock_response.status_code = 401 77 | mock_response.content = b"Unauthorized Access" 78 | mock_response.reason = "Unauthorized" 79 | 
80 | token_manager = TokenManager("invalidtoken") 81 | token_manager.logger = MagicMock() 82 | 83 | assert not token_manager.is_valid_token() 84 | token_manager.logger.warning.assert_called_once() 85 | assert "401" in token_manager.logger.warning.call_args[0][0] 86 | 87 | def test_has_calls_remaining_succeeds_if_token_never_used(self): 88 | token_manager = TokenManager("mytoken") 89 | assert token_manager.has_calls_remaining() 90 | 91 | def test_has_calls_remaining_succeeds_if_lots_remaining(self): 92 | mock_response_headers = { 93 | "X-RateLimit-Limit": "5000", 94 | "X-RateLimit-Remaining": "4999", 95 | "X-RateLimit-Reset": "1372700873", 96 | "X-RateLimit-Used": "1", 97 | } 98 | 99 | token_manager = TokenManager("mytoken") 100 | token_manager.update_rate_limit(mock_response_headers) 101 | 102 | assert token_manager.has_calls_remaining() 103 | 104 | def test_has_calls_remaining_succeeds_if_reset_time_reached(self): 105 | mock_response_headers = { 106 | "X-RateLimit-Limit": "5000", 107 | "X-RateLimit-Remaining": "1", 108 | "X-RateLimit-Reset": "1372700873", 109 | "X-RateLimit-Used": "4999", 110 | } 111 | 112 | token_manager = TokenManager("mytoken", rate_limit_buffer=1000) 113 | token_manager.update_rate_limit(mock_response_headers) 114 | 115 | assert token_manager.has_calls_remaining() 116 | 117 | def test_has_calls_remaining_fails_if_few_calls_remaining_and_reset_time_not_reached( # noqa: E501 118 | self, 119 | ): 120 | mock_response_headers = { 121 | "X-RateLimit-Limit": "5000", 122 | "X-RateLimit-Remaining": "1", 123 | "X-RateLimit-Reset": str(int((_now() + timedelta(days=100)).timestamp())), 124 | "X-RateLimit-Used": "4999", 125 | } 126 | 127 | token_manager = TokenManager("mytoken", rate_limit_buffer=1000) 128 | token_manager.update_rate_limit(mock_response_headers) 129 | 130 | assert not token_manager.has_calls_remaining() 131 | 132 | 133 | class TestAppTokenManager: 134 | def test_initialization_with_3_part_env_key(self): 135 | with 
patch.object(AppTokenManager, "claim_token", return_value=None): 136 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 137 | assert token_manager.github_app_id == "12345" 138 | assert token_manager.github_private_key == "key\ncontent" 139 | assert token_manager.github_installation_id == "67890" 140 | 141 | def test_initialization_with_2_part_env_key(self): 142 | with patch.object(AppTokenManager, "claim_token", return_value=None): 143 | token_manager = AppTokenManager("12345;;key\\ncontent") 144 | assert token_manager.github_app_id == "12345" 145 | assert token_manager.github_private_key == "key\ncontent" 146 | assert token_manager.github_installation_id is None 147 | 148 | def test_initialization_with_malformed_env_key(self): 149 | expected_error_expression = re.escape( 150 | "GITHUB_APP_PRIVATE_KEY could not be parsed. The expected format is " 151 | '":app_id:;;-----BEGIN RSA PRIVATE KEY-----\\n_YOUR_P_KEY_\\n-----END RSA PRIVATE KEY-----"' # noqa: E501 152 | ) 153 | with pytest.raises(ValueError, match=expected_error_expression): 154 | AppTokenManager("12345key\\ncontent") 155 | 156 | def test_generate_token_with_invalid_credentials(self): 157 | with ( 158 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 159 | patch( 160 | "tap_github.authenticator.generate_app_access_token", 161 | return_value=("some_token", MagicMock()), 162 | ), 163 | ): 164 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 165 | assert token_manager.token is None 166 | assert token_manager.token_expires_at is None 167 | 168 | def test_successful_token_generation(self): 169 | token_time = MagicMock() 170 | with ( 171 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 172 | patch( 173 | "tap_github.authenticator.generate_app_access_token", 174 | return_value=("valid_token", token_time), 175 | ), 176 | ): 177 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 178 | token_manager.claim_token() 179 | assert 
token_manager.token == "valid_token" 180 | assert token_manager.token_expires_at == token_time 181 | 182 | def test_has_calls_remaining_regenerates_a_token_if_close_to_expiry(self): 183 | unexpired_time = _now() + timedelta(days=1) 184 | expired_time = _now() - timedelta(days=1) 185 | with ( 186 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 187 | patch( 188 | "tap_github.authenticator.generate_app_access_token", 189 | return_value=("valid_token", unexpired_time), 190 | ), 191 | ): 192 | mock_response_headers = { 193 | "X-RateLimit-Limit": "5000", 194 | "X-RateLimit-Remaining": "4999", 195 | "X-RateLimit-Reset": "1372700873", 196 | "X-RateLimit-Used": "1", 197 | } 198 | 199 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 200 | token_manager.logger = MagicMock() 201 | token_manager.token_expires_at = expired_time 202 | token_manager.update_rate_limit(mock_response_headers) 203 | 204 | assert token_manager.has_calls_remaining() 205 | # calling has_calls_remaining() will trigger the token generation function to be called again, # noqa: E501 206 | # so token_expires_at should have been reset back to the mocked unexpired_time # noqa: E501 207 | assert token_manager.token_expires_at == unexpired_time 208 | token_manager.logger.info.assert_called_once() 209 | assert ( 210 | "GitHub app token refresh succeeded." 
211 | in token_manager.logger.info.call_args[0][0] 212 | ) 213 | 214 | def test_has_calls_remaining_logs_warning_if_token_regeneration_fails(self): 215 | unexpired_time = _now() + timedelta(days=1) 216 | expired_time = _now() - timedelta(days=1) 217 | with ( 218 | patch.object( 219 | AppTokenManager, "is_valid_token", return_value=True 220 | ) as mock_is_valid, 221 | patch( 222 | "tap_github.authenticator.generate_app_access_token", 223 | return_value=("valid_token", unexpired_time), 224 | ), 225 | ): 226 | mock_response_headers = { 227 | "X-RateLimit-Limit": "5000", 228 | "X-RateLimit-Remaining": "4999", 229 | "X-RateLimit-Reset": "1372700873", 230 | "X-RateLimit-Used": "1", 231 | } 232 | 233 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 234 | token_manager.logger = MagicMock() 235 | token_manager.token_expires_at = expired_time 236 | token_manager.update_rate_limit(mock_response_headers) 237 | 238 | mock_is_valid.return_value = False 239 | assert not token_manager.has_calls_remaining() 240 | assert isinstance(token_manager.logger.warning, MagicMock) 241 | token_manager.logger.warning.assert_has_calls( 242 | [call("GitHub app token refresh failed.")], 243 | any_order=True, 244 | ) 245 | 246 | def test_has_calls_remaining_succeeds_if_token_new_and_never_used(self): 247 | unexpired_time = _now() + timedelta(days=1) 248 | with ( 249 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 250 | patch( 251 | "tap_github.authenticator.generate_app_access_token", 252 | return_value=("valid_token", unexpired_time), 253 | ), 254 | ): 255 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 256 | assert token_manager.has_calls_remaining() 257 | 258 | def test_has_calls_remaining_succeeds_if_time_and_requests_left(self): 259 | unexpired_time = _now() + timedelta(days=1) 260 | with ( 261 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 262 | patch( 263 | "tap_github.authenticator.generate_app_access_token", 264 
| return_value=("valid_token", unexpired_time), 265 | ), 266 | ): 267 | mock_response_headers = { 268 | "X-RateLimit-Limit": "5000", 269 | "X-RateLimit-Remaining": "4999", 270 | "X-RateLimit-Reset": "1372700873", 271 | "X-RateLimit-Used": "1", 272 | } 273 | 274 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 275 | token_manager.update_rate_limit(mock_response_headers) 276 | 277 | assert token_manager.has_calls_remaining() 278 | 279 | def test_has_calls_remaining_succeeds_if_time_left_and_reset_time_reached(self): 280 | unexpired_time = _now() + timedelta(days=1) 281 | with ( 282 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 283 | patch( 284 | "tap_github.authenticator.generate_app_access_token", 285 | return_value=("valid_token", unexpired_time), 286 | ), 287 | ): 288 | mock_response_headers = { 289 | "X-RateLimit-Limit": "5000", 290 | "X-RateLimit-Remaining": "1", 291 | "X-RateLimit-Reset": "1372700873", 292 | "X-RateLimit-Used": "4999", 293 | } 294 | 295 | token_manager = AppTokenManager( 296 | "12345;;key\\ncontent;;67890", rate_limit_buffer=1000 297 | ) 298 | token_manager.update_rate_limit(mock_response_headers) 299 | 300 | assert token_manager.has_calls_remaining() 301 | 302 | def test_has_calls_remaining_fails_if_time_left_and_few_calls_remaining_and_reset_time_not_reached( # noqa: E501 303 | self, 304 | ): 305 | unexpired_time = _now() + timedelta(days=1) 306 | with ( 307 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 308 | patch( 309 | "tap_github.authenticator.generate_app_access_token", 310 | return_value=("valid_token", unexpired_time), 311 | ), 312 | ): 313 | mock_response_headers = { 314 | "X-RateLimit-Limit": "5000", 315 | "X-RateLimit-Remaining": "1", 316 | "X-RateLimit-Reset": str( 317 | int((_now() + timedelta(days=100)).timestamp()) 318 | ), 319 | "X-RateLimit-Used": "4999", 320 | } 321 | 322 | token_manager = AppTokenManager( 323 | "12345;;key\\ncontent;;67890", 
@pytest.fixture
def mock_stream():
    """A minimal RESTStream stand-in exposing only what the authenticator reads."""
    fake = MagicMock(spec=RESTStream)
    fake.logger = MagicMock()
    fake.tap_name = "tap_github"
    fake.config = {"rate_limit_buffer": 5}
    return fake
auth.prepare_tokens() 384 | 385 | assert len(token_managers) == 3 386 | assert sorted({tm.token for tm in token_managers}) == ["gt7", "gt8", "gt9"] 387 | 388 | def test_env_personal_tokens_only(self, mock_stream): 389 | with ( 390 | patch.object( 391 | GitHubTokenAuthenticator, 392 | "get_env", 393 | return_value={ 394 | "GITHUB_TOKEN1": "gt1", 395 | "GITHUB_TOKENxyz": "gt2", 396 | "OTHER_TOKEN": "blah", 397 | }, 398 | ), 399 | patch.object(PersonalTokenManager, "is_valid_token", return_value=True), 400 | ): 401 | auth = GitHubTokenAuthenticator(stream=mock_stream) 402 | token_managers = auth.prepare_tokens() 403 | 404 | assert len(token_managers) == 2 405 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt2"] 406 | 407 | def test_config_app_keys(self, mock_stream): 408 | def generate_token_mock(app_id, private_key, installation_id): 409 | return (f"installationtokenfor{app_id}", MagicMock()) 410 | 411 | with ( 412 | patch.object(TokenManager, "is_valid_token", return_value=True), 413 | patch( 414 | "tap_github.authenticator.generate_app_access_token", 415 | side_effect=generate_token_mock, 416 | ), 417 | ): 418 | stream = mock_stream 419 | stream.config.update( 420 | { 421 | "auth_token": "gt5", 422 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 423 | "auth_app_keys": [ 424 | "123;;gak1;;13", 425 | "456;;gak1;;46", 426 | "789;;gak1;;79", 427 | ], 428 | } 429 | ) 430 | auth = GitHubTokenAuthenticator(stream=stream) 431 | token_managers = auth.prepare_tokens() 432 | 433 | assert len(token_managers) == 7 434 | 435 | app_token_managers = { 436 | tm for tm in token_managers if isinstance(tm, AppTokenManager) 437 | } 438 | assert len(app_token_managers) == 3 439 | 440 | app_tokens = {tm.token for tm in app_token_managers} 441 | assert app_tokens == { 442 | "installationtokenfor123", 443 | "installationtokenfor456", 444 | "installationtokenfor789", 445 | } 446 | 447 | def test_env_app_key_only(self, mock_stream): 448 | with ( 449 | patch.object( 450 | 
GitHubTokenAuthenticator, 451 | "get_env", 452 | return_value={ 453 | "GITHUB_APP_PRIVATE_KEY": "123;;key", 454 | "OTHER_TOKEN": "blah", 455 | }, 456 | ), 457 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 458 | patch( 459 | "tap_github.authenticator.generate_app_access_token", 460 | return_value=("installationtoken12345", MagicMock()), 461 | ), 462 | ): 463 | auth = GitHubTokenAuthenticator(stream=mock_stream) 464 | token_managers = auth.prepare_tokens() 465 | 466 | assert len(token_managers) == 1 467 | assert token_managers[0].token == "installationtoken12345" 468 | 469 | def test_all_token_types(self, mock_stream): 470 | # Expectations: 471 | # - the presence of additional_auth_tokens causes personal tokens in the environment to be ignored. # noqa: E501 472 | # - the other types all coexist 473 | with ( 474 | patch.object( 475 | GitHubTokenAuthenticator, 476 | "get_env", 477 | return_value={ 478 | "GITHUB_TOKEN1": "gt1", 479 | "GITHUB_TOKENxyz": "gt2", 480 | "GITHUB_APP_PRIVATE_KEY": "123;;key;;install_id", 481 | "OTHER_TOKEN": "blah", 482 | }, 483 | ), 484 | patch.object(TokenManager, "is_valid_token", return_value=True), 485 | patch( 486 | "tap_github.authenticator.generate_app_access_token", 487 | return_value=("installationtoken12345", MagicMock()), 488 | ), 489 | ): 490 | stream = mock_stream 491 | stream.config.update( 492 | { 493 | "auth_token": "gt5", 494 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 495 | } 496 | ) 497 | auth = GitHubTokenAuthenticator(stream=stream) 498 | token_managers = auth.prepare_tokens() 499 | 500 | assert len(token_managers) == 5 501 | assert sorted({tm.token for tm in token_managers}) == [ 502 | "gt5", 503 | "gt7", 504 | "gt8", 505 | "gt9", 506 | "installationtoken12345", 507 | ] 508 | 509 | def test_all_token_types_except_additional_auth_tokens(self, mock_stream): 510 | # Expectations: 511 | # - in the absence of additional_auth_tokens, all the other types can coexist 512 | with ( 513 | 
patch.object( 514 | GitHubTokenAuthenticator, 515 | "get_env", 516 | return_value={ 517 | "GITHUB_TOKEN1": "gt1", 518 | "GITHUB_TOKENxyz": "gt2", 519 | "GITHUB_APP_PRIVATE_KEY": "123;;key;;install_id", 520 | "OTHER_TOKEN": "blah", 521 | }, 522 | ), 523 | patch.object(TokenManager, "is_valid_token", return_value=True), 524 | patch( 525 | "tap_github.authenticator.generate_app_access_token", 526 | return_value=("installationtoken12345", MagicMock()), 527 | ), 528 | ): 529 | stream = mock_stream 530 | stream.config.update( 531 | { 532 | "auth_token": "gt5", 533 | } 534 | ) 535 | auth = GitHubTokenAuthenticator(stream=stream) 536 | token_managers = auth.prepare_tokens() 537 | 538 | assert len(token_managers) == 4 539 | assert sorted({tm.token for tm in token_managers}) == [ 540 | "gt1", 541 | "gt2", 542 | "gt5", 543 | "installationtoken12345", 544 | ] 545 | 546 | def test_auth_token_and_additional_auth_tokens_deduped(self, mock_stream): 547 | with ( 548 | patch.object( 549 | GitHubTokenAuthenticator, 550 | "get_env", 551 | return_value={ 552 | "GITHUB_TOKEN1": "gt1", 553 | "GITHUB_TOKENxyz": "gt2", 554 | "OTHER_TOKEN": "blah", 555 | }, 556 | ), 557 | patch.object(TokenManager, "is_valid_token", return_value=True), 558 | patch( 559 | "tap_github.authenticator.generate_app_access_token", 560 | return_value=("installationtoken12345", MagicMock()), 561 | ), 562 | ): 563 | stream = mock_stream 564 | stream.config.update( 565 | { 566 | "auth_token": "gt1", 567 | "additional_auth_tokens": ["gt1", "gt1", "gt8", "gt8", "gt9"], 568 | } 569 | ) 570 | auth = GitHubTokenAuthenticator(stream=stream) 571 | token_managers = auth.prepare_tokens() 572 | 573 | assert len(token_managers) == 3 574 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt8", "gt9"] 575 | 576 | def test_auth_token_and_env_tokens_deduped(self, mock_stream): 577 | with ( 578 | patch.object( 579 | GitHubTokenAuthenticator, 580 | "get_env", 581 | return_value={ 582 | "GITHUB_TOKEN1": "gt1", 583 | 
"GITHUB_TOKENa": "gt2", 584 | "GITHUB_TOKENxyz": "gt2", 585 | "OTHER_TOKEN": "blah", 586 | }, 587 | ), 588 | patch.object(TokenManager, "is_valid_token", return_value=True), 589 | patch( 590 | "tap_github.authenticator.generate_app_access_token", 591 | return_value=("installationtoken12345", MagicMock()), 592 | ), 593 | ): 594 | stream = mock_stream 595 | stream.config.update({"auth_token": "gt1"}) 596 | auth = GitHubTokenAuthenticator(stream=stream) 597 | token_managers = auth.prepare_tokens() 598 | 599 | assert len(token_managers) == 2 600 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt2"] 601 | 602 | def test_handle_error_if_app_key_invalid(self, mock_stream): 603 | # Confirm expected behaviour if an error is raised while setting up the app token manager: # noqa: E501 604 | # - don"t crash 605 | # - print the error as a warning 606 | # - continue with any other obtained tokens 607 | with ( 608 | patch.object( 609 | GitHubTokenAuthenticator, 610 | "get_env", 611 | return_value={"GITHUB_APP_PRIVATE_KEY": "123garbagekey"}, 612 | ), 613 | patch("tap_github.authenticator.AppTokenManager") as mock_app_manager, 614 | ): 615 | mock_app_manager.side_effect = ValueError("Invalid key format") 616 | 617 | auth = GitHubTokenAuthenticator(stream=mock_stream) 618 | auth.prepare_tokens() 619 | 620 | mock_stream.logger.warning.assert_called_with( 621 | "An error was thrown while preparing an app token: Invalid key format" 622 | ) 623 | 624 | def test_exclude_generated_app_token_if_invalid(self, mock_stream): 625 | with ( 626 | patch.object( 627 | GitHubTokenAuthenticator, 628 | "get_env", 629 | return_value={"GITHUB_APP_PRIVATE_KEY": "123;;key"}, 630 | ), 631 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 632 | patch( 633 | "tap_github.authenticator.generate_app_access_token", 634 | return_value=("installationtoken12345", MagicMock()), 635 | ), 636 | ): 637 | auth = GitHubTokenAuthenticator(stream=mock_stream) 638 | token_managers = 
auth.prepare_tokens() 639 | 640 | assert len(token_managers) == 0 641 | 642 | def test_prepare_tokens_returns_empty_if_all_tokens_invalid(self, mock_stream): 643 | with ( 644 | patch.object( 645 | GitHubTokenAuthenticator, 646 | "get_env", 647 | return_value={ 648 | "GITHUB_TOKEN1": "gt1", 649 | "GITHUB_APP_PRIVATE_KEY": "123;;key", 650 | }, 651 | ), 652 | patch.object(PersonalTokenManager, "is_valid_token", return_value=False), 653 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 654 | patch( 655 | "tap_github.authenticator.generate_app_access_token", 656 | return_value=("installationtoken12345", MagicMock()), 657 | ), 658 | ): 659 | stream = mock_stream 660 | stream.config.update( 661 | { 662 | "auth_token": "gt5", 663 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 664 | } 665 | ) 666 | auth = GitHubTokenAuthenticator(stream=stream) 667 | token_managers = auth.prepare_tokens() 668 | 669 | assert len(token_managers) == 0 670 | -------------------------------------------------------------------------------- /tap_github/tests/test_core.py: -------------------------------------------------------------------------------- 1 | """Tests standard tap features using the built-in SDK tests library.""" 2 | 3 | import logging 4 | import os 5 | from unittest import mock 6 | from unittest.mock import patch 7 | 8 | from singer_sdk.testing import get_standard_tap_tests 9 | 10 | from tap_github.tap import TapGitHub 11 | from tap_github.utils.filter_stdout import nostdout 12 | 13 | from .fixtures import ( # noqa: F401 14 | alternative_sync_chidren, 15 | organization_list_config, 16 | repo_list_config, 17 | search_config, 18 | username_list_config, 19 | ) 20 | 21 | 22 | # Run standard built-in tap tests from the SDK: 23 | def test_standard_tap_tests_for_search_mode(search_config): # noqa: F811 24 | """Run standard tap tests from the SDK.""" 25 | tests = get_standard_tap_tests(TapGitHub, config=search_config) 26 | with ( 27 | patch( 28 | 
"singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren 29 | ), 30 | nostdout(), 31 | ): 32 | for test in tests: 33 | test() 34 | 35 | 36 | def test_standard_tap_tests_for_repo_list_mode(repo_list_config): # noqa: F811 37 | """Run standard tap tests from the SDK.""" 38 | tests = get_standard_tap_tests(TapGitHub, config=repo_list_config) 39 | with ( 40 | patch( 41 | "singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren 42 | ), 43 | nostdout(), 44 | ): 45 | for test in tests: 46 | test() 47 | 48 | 49 | def test_standard_tap_tests_for_username_list_mode(username_list_config): # noqa: F811 50 | """Run standard tap tests from the SDK.""" 51 | tests = get_standard_tap_tests(TapGitHub, config=username_list_config) 52 | with nostdout(): 53 | for test in tests: 54 | test() 55 | 56 | 57 | # This token needs to have read:org access for the organization listed in fixtures.py 58 | # Default is "MeltanoLabs" 59 | ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN") 60 | 61 | 62 | @mock.patch.dict(os.environ, {"GITHUB_TOKEN": ORG_LEVEL_TOKEN or ""}) 63 | def test_standard_tap_tests_for_organization_list_mode(organization_list_config): # noqa: F811 64 | """Run standard tap tests from the SDK.""" 65 | if not ORG_LEVEL_TOKEN: 66 | logging.warning('No "ORG_LEVEL_TOKEN" found. 
# Input repository list containing deliberate errors, used to verify that the
# tap repairs what it can and drops what it cannot.
repo_list_2 = [
    "MeltanoLabs/tap-github",
    # mistype the repo name so we can check that the tap corrects it
    "MeltanoLabs/Tap-GitLab",
    # mistype the org
    "meltanolabs/target-athena",
    # a repo that does not exist at all
    # this one has no matching record below as it should be removed
    # from the list by the TempStream
    "brokenOrg/does_not_exist",
]
# the same list, but without typos, for validation
repo_list_2_corrected = [
    "MeltanoLabs/tap-github",
    "MeltanoLabs/tap-gitlab",
    "MeltanoLabs/target-athena",
]
# the github repo ids that match the repo names above
# in the same order
repo_list_2_ids = [
    365087920,
    416891176,
    361619143,
]
def run_tap_with_config(
    capsys, config_obj: dict, skip_stream: str | None, single_stream: str | None
) -> str:
    """
    Run the tap with the given config and return the captured stdout.

    ``skip_stream`` deselects one stream (meant to be a top-level one) from
    the catalog; ``single_stream`` instead deselects everything except
    ``repositories`` and the named stream.
    """
    discovery_tap = TapGitHub(config=config_obj)
    discovery_tap.run_discovery()
    catalog = Catalog.from_dict(discovery_tap.catalog_dict)
    # Reset and re-initialize with an input catalog
    if skip_stream is not None:
        cat_helpers.set_catalog_stream_selected(
            catalog=catalog,
            stream_name=skip_stream,
            selected=False,
        )
    elif single_stream is not None:
        cat_helpers.deselect_all_streams(catalog)
        cat_helpers.set_catalog_stream_selected(catalog, "repositories", selected=True)
        cat_helpers.set_catalog_stream_selected(
            catalog, stream_name=single_stream, selected=True
        )

    # drop stdout captured by earlier tests before syncing
    capsys.readouterr()
    with patch(
        "singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren
    ):
        sync_tap = TapGitHub(config=config_obj, catalog=catalog.to_dict())
        sync_tap.sync_all()
    return capsys.readouterr().out
111 | """ 112 | repo_list_config["skip_parent_streams"] = skip_parent_streams 113 | captured_out = run_tap_with_config( 114 | capsys, 115 | repo_list_config, 116 | "repositories" if skip_parent_streams else None, 117 | single_stream=None, 118 | ) 119 | # Verify we got the right number of records 120 | # one per repo in the list only if we sync the "repositories" stream, 0 if not 121 | assert captured_out.count('{"type":"RECORD","stream":"repositories"') == len( 122 | repo_list_2_ids * (not skip_parent_streams) 123 | ) 124 | # check that the tap corrects invalid case in config input 125 | assert '"repo": "Tap-GitLab"' not in captured_out 126 | assert '"org": "meltanolabs"' not in captured_out 127 | 128 | 129 | @pytest.mark.repo_list(["MeltanoLabs/tap-github"]) 130 | def test_last_state_message_is_valid(capsys, repo_list_config): # noqa: F811 131 | """ 132 | Validate that the last state message is not a temporary one and contains the 133 | expected values for a stream with overridden state partitioning keys. 134 | Run this on a single repo to avoid having to filter messages too much. 
135 | """ 136 | repo_list_config["skip_parent_streams"] = True 137 | captured_out = run_tap_with_config( 138 | capsys, repo_list_config, "repositories", single_stream=None 139 | ) 140 | # capture the messages we're interested in 141 | state_messages = re.findall(r'{"type":"STATE","value":.*}', captured_out) 142 | issue_comments_records = re.findall( 143 | r'{"type":"RECORD","stream":"issue_comments",.*}', captured_out 144 | ) 145 | assert state_messages is not None 146 | last_state_msg = state_messages[-1] 147 | 148 | # make sure we don't have a temporary state message at the very end 149 | assert "progress_markers" not in last_state_msg 150 | 151 | last_state = json.loads(last_state_msg) 152 | last_state_updated_at = isoparse( 153 | last_state["value"]["bookmarks"]["issue_comments"]["partitions"][0][ 154 | "replication_key_value" 155 | ] 156 | ) 157 | latest_updated_at = max( 158 | isoparse(json.loads(record)["record"]["updated_at"]) 159 | for record in issue_comments_records 160 | ) 161 | assert last_state_updated_at == latest_updated_at 162 | 163 | 164 | # case is incorrect on purpose, so we can check that the tap corrects it 165 | # and run the test twice, with and without syncing the `users` stream 166 | @pytest.mark.parametrize("skip_parent_streams", [False, True]) 167 | @pytest.mark.username_list(["EricBoucher", "aaRONsTeeRS"]) 168 | def test_get_a_user_in_user_usernames_mode( 169 | capsys, 170 | username_list_config, # noqa: F811 171 | skip_parent_streams, 172 | ): 173 | """ 174 | Discover the catalog, and request 2 repository records 175 | """ 176 | username_list_config["skip_parent_streams"] = skip_parent_streams 177 | captured_out = run_tap_with_config( 178 | capsys, 179 | username_list_config, 180 | "users" if skip_parent_streams else None, 181 | single_stream=None, 182 | ) 183 | # Verify we got the right number of records: 184 | # one per user in the list if we sync the root stream, 0 otherwise 185 | assert 
captured_out.count('{"type":"RECORD","stream":"users"') == len( 186 | username_list_config["user_usernames"] * (not skip_parent_streams) 187 | ) 188 | # these 2 are inequalities as number will keep changing :) 189 | assert captured_out.count('{"type":"RECORD","stream":"starred"') > 150 190 | assert captured_out.count('{"type":"RECORD","stream":"user_contributed_to"') > 25 191 | assert '{"username":"aaronsteers"' in captured_out 192 | assert '{"username":"aaRONsTeeRS"' not in captured_out 193 | assert '{"username":"EricBoucher"' not in captured_out 194 | 195 | 196 | @pytest.mark.repo_list(["torvalds/linux"]) 197 | def test_large_list_of_contributors(capsys, repo_list_config): # noqa: F811 198 | """ 199 | Check that the github error message for very large lists of contributors 200 | is handled properly (does not return any records). 201 | """ 202 | captured_out = run_tap_with_config( 203 | capsys, repo_list_config, skip_stream=None, single_stream="contributors" 204 | ) 205 | assert captured_out.count('{"type":"RECORD","stream":"contributors"') == 0 206 | 207 | 208 | def test_web_tag_parse_counter(): 209 | """ 210 | Check that the parser runs ok on various forms of counters. 211 | Used in extra_metrics stream. 212 | """ 213 | # regular int 214 | tag = BeautifulSoup( 215 | '57', 216 | "html.parser", 217 | ).span 218 | assert parse_counter(tag) == 57 219 | 220 | # 2k 221 | tag = BeautifulSoup( 222 | '2k', 223 | "html.parser", 224 | ).span 225 | assert parse_counter(tag) == 2028 226 | 227 | # 5k+. 
The real number is not available in the page, use this approx value 228 | tag = BeautifulSoup( 229 | '5k+', 230 | "html.parser", 231 | ).span 232 | assert parse_counter(tag) == 5_000 233 | -------------------------------------------------------------------------------- /tap_github/user_streams.py: -------------------------------------------------------------------------------- 1 | """User Stream types classes for tap-github.""" 2 | 3 | from __future__ import annotations 4 | 5 | import re 6 | from typing import TYPE_CHECKING, Any, ClassVar 7 | 8 | from singer_sdk import typing as th # JSON Schema typing helpers 9 | from singer_sdk.exceptions import FatalAPIError 10 | 11 | from tap_github.client import GitHubGraphqlStream, GitHubRestStream 12 | from tap_github.schema_objects import user_object 13 | 14 | if TYPE_CHECKING: 15 | from collections.abc import Iterable 16 | 17 | from singer_sdk.helpers.types import Context 18 | from singer_sdk.tap_base import Tap 19 | 20 | 21 | class UserStream(GitHubRestStream): 22 | """Defines 'User' stream.""" 23 | 24 | name = "users" 25 | replication_key = "updated_at" 26 | 27 | @property 28 | def path(self) -> str: # type: ignore 29 | """Return the API endpoint path.""" 30 | if "user_usernames" in self.config: 31 | return "/users/{username}" 32 | elif "user_ids" in self.config: 33 | return "/user/{id}" 34 | 35 | @property 36 | def partitions(self) -> list[dict] | None: 37 | """Return a list of partitions.""" 38 | if "user_usernames" in self.config: 39 | input_user_list = self.config["user_usernames"] 40 | 41 | augmented_user_list = [] 42 | # chunk requests to the graphql endpoint to avoid timeouts and other 43 | # obscure errors that the api doesn't say much about. The actual limit 44 | # seems closer to 1000, use half that to stay safe. 
45 |         chunk_size = 500
46 |         list_length = len(input_user_list)
47 |         self.logger.info(f"Filtering user list of {list_length} users")
48 |         for ndx in range(0, list_length, chunk_size):
49 |             augmented_user_list += self.get_user_ids(
50 |                 input_user_list[ndx : ndx + chunk_size]
51 |             )
52 |         self.logger.info(f"Running the tap on {len(augmented_user_list)} users")
53 |         return augmented_user_list
54 | 
55 |     elif "user_ids" in self.config:
56 |         return [{"id": user_id} for user_id in self.config["user_ids"]]
57 |     return None
58 | 
59 |     def get_child_context(self, record: dict, context: Context | None) -> dict:
60 |         return {
61 |             "username": record["login"],
62 |             "user_id": record["id"],
63 |         }
64 | 
65 |     def get_user_ids(self, user_list: list[str]) -> list[dict[str, str]]:
66 |         """Enrich the list of users with their numeric ID from GitHub.
67 | 
68 |         This helps maintain a stable id for context and bookmarks.
69 |         It uses the github graphql api to fetch the databaseId.
70 |         It also removes non-existent users and corrects casing to ensure
71 |         data is correct downstream.
72 |         """
73 | 
74 |         # use a temp handmade stream to reuse all the graphql setup of the tap
75 |         class TempStream(GitHubGraphqlStream):
76 |             name = "tempStream"
77 |             schema = th.PropertiesList(
78 |                 th.Property("id", th.StringType),
79 |                 th.Property("databaseId", th.IntegerType),
80 |             ).to_dict()
81 | 
82 |             def __init__(self, tap: Tap, user_list: list[str]) -> None:
83 |                 super().__init__(tap)
84 |                 self.user_list = user_list
85 | 
86 |             @property
87 |             def query(self) -> str:
88 |                 chunks = []
89 |                 for i, user in enumerate(self.user_list):
90 |                     # we use the `repositoryOwner` query which is the only one that
91 |                     # works on both users and orgs with graphql. REST is less picky
92 |                     # and the /user endpoint works for all types. 
93 | chunks.append( 94 | f'user{i}: repositoryOwner(login: "{user}") ' 95 | "{ login avatarUrl}" 96 | ) 97 | return "query {" + " ".join(chunks) + " rateLimit { cost } }" 98 | 99 | if len(user_list) < 1: 100 | return [] 101 | 102 | users_with_ids: list = [] 103 | temp_stream = TempStream(self._tap, list(user_list)) 104 | 105 | database_id_pattern: re.Pattern = re.compile( 106 | r"https://avatars.githubusercontent.com/u/(\d+)?.*" 107 | ) 108 | # replace manually provided org/repo values by the ones obtained 109 | # from github api. This guarantees that case is correct in the output data. 110 | # See https://github.com/MeltanoLabs/tap-github/issues/110 111 | # Also remove repos which do not exist to avoid crashing further down 112 | # the line. 113 | for record in temp_stream.request_records({}): 114 | for item in record: 115 | if item == "rateLimit": 116 | continue 117 | try: 118 | username = record[item]["login"] 119 | except TypeError: 120 | # one of the usernames returned `None`, which means it does 121 | # not exist, log some details, and move on to the next one 122 | invalid_username = user_list[int(item[4:])] 123 | self.logger.info( 124 | f"Username not found: {invalid_username} \t" 125 | "Removing it from list" 126 | ) 127 | continue 128 | # the databaseId (in graphql language) is not available on 129 | # repositoryOwner, so we parse the avatarUrl to get it :/ 130 | m = database_id_pattern.match(record[item]["avatarUrl"]) 131 | if m is not None: 132 | db_id = m.group(1) 133 | users_with_ids.append({"username": username, "user_id": db_id}) 134 | else: 135 | # If we get here, github's API is not returning what 136 | # we expected, so it's most likely a breaking change on 137 | # their end, and the tap's code needs updating 138 | raise FatalAPIError("Unexpected GitHub API error: Breaking change?") 139 | 140 | self.logger.info(f"Running the tap on {len(users_with_ids)} users") 141 | return users_with_ids 142 | 143 | def get_records(self, context: Context | None) 
-> Iterable[dict[str, Any]]: 144 | """ 145 | Override the parent method to allow skipping API calls 146 | if the stream is deselected and skip_parent_streams is True in config. 147 | This allows running the tap with fewer API calls and preserving 148 | quota when only syncing a child stream. Without this, 149 | the API call is sent but data is discarded. 150 | """ 151 | if ( 152 | not self.selected 153 | and "skip_parent_streams" in self.config 154 | and self.config["skip_parent_streams"] 155 | and context is not None 156 | ): 157 | # build a minimal mock record so that self._sync_records 158 | # can proceed with child streams 159 | # the id is fetched in `get_user_ids` above 160 | yield { 161 | "login": context["username"], 162 | "id": context["user_id"], 163 | } 164 | else: 165 | yield from super().get_records(context) 166 | 167 | schema = th.PropertiesList( 168 | th.Property("login", th.StringType), 169 | th.Property("id", th.IntegerType), 170 | th.Property("node_id", th.StringType), 171 | th.Property("avatar_url", th.StringType), 172 | th.Property("gravatar_id", th.StringType), 173 | th.Property("url", th.StringType), 174 | th.Property("html_url", th.StringType), 175 | th.Property("followers_url", th.StringType), 176 | th.Property("following_url", th.StringType), 177 | th.Property("gists_url", th.StringType), 178 | th.Property("starred_url", th.StringType), 179 | th.Property("subscriptions_url", th.StringType), 180 | th.Property("organizations_url", th.StringType), 181 | th.Property("repos_url", th.StringType), 182 | th.Property("events_url", th.StringType), 183 | th.Property("received_events_url", th.StringType), 184 | th.Property("type", th.StringType), 185 | th.Property("site_admin", th.BooleanType), 186 | th.Property("name", th.StringType), 187 | th.Property("company", th.StringType), 188 | th.Property("blog", th.StringType), 189 | th.Property("location", th.StringType), 190 | th.Property("email", th.StringType), 191 | th.Property("hireable", 
th.BooleanType), 192 | th.Property("bio", th.StringType), 193 | th.Property("twitter_username", th.StringType), 194 | th.Property("public_repos", th.IntegerType), 195 | th.Property("public_gists", th.IntegerType), 196 | th.Property("followers", th.IntegerType), 197 | th.Property("following", th.IntegerType), 198 | th.Property("updated_at", th.DateTimeType), 199 | th.Property("created_at", th.DateTimeType), 200 | ).to_dict() 201 | 202 | 203 | class StarredStream(GitHubRestStream): 204 | """Defines 'Stars' stream. Warning: this stream does NOT track star deletions.""" 205 | 206 | name = "starred" 207 | path = "/users/{username}/starred" 208 | # "repo_id" is the starred repo's id. 209 | primary_keys: ClassVar[list[str]] = ["repo_id", "username"] 210 | parent_stream_type = UserStream 211 | # TODO - change partitioning key to user_id? 212 | state_partitioning_keys: ClassVar[list[str]] = ["username"] 213 | replication_key = "starred_at" 214 | ignore_parent_replication_key = True 215 | # GitHub is missing the "since" parameter on this endpoint. 216 | use_fake_since_parameter = True 217 | 218 | @property 219 | def http_headers(self) -> dict: 220 | """Return the http headers needed. 221 | 222 | Overridden to use an endpoint which includes starred_at property: 223 | https://docs.github.com/en/rest/reference/activity#custom-media-types-for-starring 224 | """ 225 | headers = super().http_headers 226 | headers["Accept"] = "application/vnd.github.v3.star+json" 227 | return headers 228 | 229 | def post_process(self, row: dict, context: Context | None = None) -> dict: 230 | """ 231 | Add a repo_id top-level field to be used as state replication key. 
232 | """ 233 | row["repo_id"] = row["repo"]["id"] 234 | if context is not None: 235 | row["user_id"] = context["user_id"] 236 | return row 237 | 238 | schema = th.PropertiesList( 239 | # Parent Keys 240 | th.Property("username", th.StringType), 241 | th.Property("repo_id", th.IntegerType), 242 | th.Property("user_id", th.IntegerType), 243 | # Starred Repo Info 244 | th.Property("starred_at", th.DateTimeType), 245 | th.Property( 246 | "repo", 247 | th.ObjectType( 248 | th.Property("id", th.IntegerType), 249 | th.Property("node_id", th.StringType), 250 | th.Property("full_name", th.StringType), 251 | th.Property("description", th.StringType), 252 | th.Property("html_url", th.StringType), 253 | th.Property("owner", user_object), 254 | th.Property( 255 | "license", 256 | th.ObjectType( 257 | th.Property("key", th.StringType), 258 | th.Property("name", th.StringType), 259 | th.Property("url", th.StringType), 260 | th.Property("spdx_id", th.StringType), 261 | ), 262 | ), 263 | th.Property("updated_at", th.DateTimeType), 264 | th.Property("created_at", th.DateTimeType), 265 | th.Property("pushed_at", th.DateTimeType), 266 | th.Property("stargazers_count", th.IntegerType), 267 | th.Property("fork", th.BooleanType), 268 | th.Property( 269 | "topics", 270 | th.ArrayType(th.StringType), 271 | ), 272 | th.Property("visibility", th.StringType), 273 | th.Property("language", th.StringType), 274 | th.Property("forks", th.IntegerType), 275 | th.Property("watchers", th.IntegerType), 276 | th.Property("open_issues", th.IntegerType), 277 | ), 278 | ), 279 | ).to_dict() 280 | 281 | 282 | class UserContributedToStream(GitHubGraphqlStream): 283 | """Defines 'UserContributedToStream' stream.""" 284 | 285 | name = "user_contributed_to" 286 | query_jsonpath = "$.data.user.repositoriesContributedTo.nodes.[*]" 287 | primary_keys: ClassVar[list[str]] = ["username", "name_with_owner"] 288 | replication_key = None 289 | parent_stream_type = UserStream 290 | # TODO - add user_id to schema 291 | 
# TODO - change partitioning key to user_id? 292 | state_partitioning_keys: ClassVar[list[str]] = ["username"] 293 | ignore_parent_replication_key = True 294 | 295 | @property 296 | def query(self) -> str: 297 | """Return dynamic GraphQL query.""" 298 | # Graphql id is equivalent to REST node_id. To keep the tap consistent, 299 | # we rename "id" to "node_id". 300 | return """ 301 | query userContributedTo($username: String! $nextPageCursor_0: String) { 302 | user (login: $username) { 303 | repositoriesContributedTo (first: 100 after: $nextPageCursor_0 includeUserRepositories: true orderBy: {field: STARGAZERS, direction: DESC}) { 304 | pageInfo { 305 | hasNextPage_0: hasNextPage 306 | startCursor_0: startCursor 307 | endCursor_0: endCursor 308 | } 309 | nodes { 310 | node_id: id 311 | database_id: databaseId 312 | name_with_owner: nameWithOwner 313 | open_graph_image_url: openGraphImageUrl 314 | stargazer_count: stargazerCount 315 | pushed_at: pushedAt 316 | owner { 317 | node_id: id 318 | login 319 | } 320 | } 321 | } 322 | } 323 | rateLimit { 324 | cost 325 | } 326 | } 327 | """ # noqa: E501 328 | 329 | schema = th.PropertiesList( 330 | th.Property("node_id", th.StringType), 331 | th.Property("username", th.StringType), 332 | th.Property("name_with_owner", th.StringType), 333 | th.Property("open_graph_image_url", th.StringType), 334 | th.Property("stargazer_count", th.IntegerType), 335 | th.Property( 336 | "owner", 337 | th.ObjectType( 338 | th.Property("node_id", th.StringType), 339 | th.Property("login", th.StringType), 340 | ), 341 | ), 342 | ).to_dict() 343 | -------------------------------------------------------------------------------- /tap_github/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeltanoLabs/tap-github/0374f0768b1ffb2a3db0dd53591282830e553cf2/tap_github/utils/__init__.py -------------------------------------------------------------------------------- 
/tap_github/utils/filter_stdout.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | import io 5 | import re 6 | import sys 7 | from re import Pattern 8 | from typing import TYPE_CHECKING, TextIO 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Generator 12 | 13 | 14 | class FilterStdOutput: 15 | """Filter out stdout/sterr given a regex pattern.""" 16 | 17 | def __init__(self, stream: TextIO, re_pattern: str | Pattern) -> None: 18 | self.stream = stream 19 | self.pattern = ( 20 | re.compile(re_pattern) if isinstance(re_pattern, str) else re_pattern 21 | ) 22 | self.triggered = False 23 | 24 | def __getattr__(self, attr_name: str) -> object: 25 | return getattr(self.stream, attr_name) 26 | 27 | def write(self, data: str) -> None: 28 | if data == "\n" and self.triggered: 29 | self.triggered = False 30 | else: 31 | if self.pattern.search(data) is None: 32 | self.stream.write(data) 33 | self.stream.flush() 34 | else: 35 | # caught bad pattern 36 | self.triggered = True 37 | 38 | def flush(self) -> None: 39 | self.stream.flush() 40 | 41 | 42 | @contextlib.contextmanager 43 | def nostdout() -> Generator[None, None, None]: 44 | save_stdout = sys.stdout 45 | sys.stdout = io.StringIO() 46 | yield 47 | sys.stdout = save_stdout 48 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = true 3 | envlist = py3{9,10,11,12,13} 4 | 5 | [testenv] 6 | whitelist_externals = poetry 7 | commands = 8 | poetry install -v 9 | poetry run pytest 10 | --------------------------------------------------------------------------------