├── .env.template ├── .github ├── CODEOWNERS ├── dependabot.yml └── workflows │ ├── project_add.yml │ ├── release.yml │ └── test_tap.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .secrets └── .gitignore ├── .sonarcloud.properties ├── LICENSE ├── README.md ├── config-sample.json ├── config.json ├── meltano.yml ├── plugins └── loaders │ └── target-jsonl--andyh1203.lock ├── poetry.lock ├── pyproject.toml ├── tap_github ├── __init__.py ├── authenticator.py ├── client.py ├── organization_streams.py ├── repository_streams.py ├── schema_objects.py ├── scraping.py ├── streams.py ├── tap.py ├── tests │ ├── __init__.py │ ├── fixtures.py │ ├── test_authenticator.py │ ├── test_core.py │ └── test_tap.py ├── user_streams.py └── utils │ ├── __init__.py │ └── filter_stdout.py └── tox.ini /.env.template: -------------------------------------------------------------------------------- 1 | TAP_GITHUB_AUTH_TOKEN="****" 2 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global owner 2 | * @MeltanoLabs/tap-github 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: weekly 12 | day: monday 13 | versioning-strategy: increase-if-necessary 14 | groups: 15 | development-dependencies: 16 | dependency-type: development 17 | runtime-dependencies: 18 | dependency-type: production 19 | update-types: 20 | - "patch" 21 | - package-ecosystem: github-actions 22 | directory: "/" 23 | schedule: 24 | interval: weekly 25 | day: monday 26 | groups: 27 | actions: 28 | patterns: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/workflows/project_add.yml: -------------------------------------------------------------------------------- 1 | # Managed by Pulumi. Any edits to this file will be overwritten. 2 | 3 | name: Add issues and PRs to MeltanoLabs Overview Project 4 | 5 | on: 6 | issues: 7 | types: 8 | - opened 9 | - reopened 10 | - transferred 11 | pull_request: 12 | types: 13 | - opened 14 | - reopened 15 | 16 | jobs: 17 | add-to-project: 18 | name: Add issue to project 19 | runs-on: ubuntu-latest 20 | if: ${{ github.actor != 'dependabot[bot]' }} 21 | steps: 22 | - uses: actions/add-to-project@244f685bbc3b7adfa8466e08b698b5577571133e # v1.0.2 23 | with: 24 | project-url: https://github.com/orgs/MeltanoLabs/projects/3 25 | github-token: ${{ secrets.MELTYBOT_PROJECT_ADD_PAT }} 26 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish Python Package 2 | 3 | on: 4 | push: 5 | 6 | permissions: 7 | contents: write 8 | id-token: write 9 | 10 | jobs: 11 | build: 12 | name: Build wheel and sdist 13 | runs-on: ubuntu-latest 14 | outputs: 15 | version: 
${{ steps.baipp.outputs.package_version }} 16 | steps: 17 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | with: 19 | fetch-depth: 0 20 | - uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0 21 | id: baipp 22 | 23 | publish: 24 | name: Publish to PyPI 25 | runs-on: ubuntu-latest 26 | needs: [build] 27 | environment: 28 | name: pypi 29 | url: https://pypi.org/project/meltanolabs-tap-github/${{ needs.build.outputs.version }} 30 | if: startsWith(github.ref, 'refs/tags/') 31 | 32 | steps: 33 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 34 | with: 35 | name: Packages 36 | path: dist 37 | - name: Upload wheel to release 38 | uses: svenstaro/upload-release-action@04733e069f2d7f7f0b4aebc4fbdbce8613b03ccd # 2.9.0 39 | with: 40 | repo_token: ${{ secrets.GITHUB_TOKEN }} 41 | file: dist/*.whl 42 | tag: ${{ github.ref }} 43 | overwrite: true 44 | file_glob: true 45 | - name: Deploy to PyPI 46 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 47 | -------------------------------------------------------------------------------- /.github/workflows/test_tap.yml: -------------------------------------------------------------------------------- 1 | name: Test tap-github 2 | 3 | on: 4 | # Run on all pull requests and on pushes to main. 
5 | pull_request: 6 | paths: 7 | - .github/workflows/test_tap.yml 8 | - poetry.lock 9 | - pyproject.toml 10 | - 'tap_github/**' 11 | push: 12 | branches: 13 | - main 14 | paths: 15 | - .github/workflows/test_tap.yml 16 | - poetry.lock 17 | - pyproject.toml 18 | - 'tap_github/**' 19 | workflow_dispatch: 20 | schedule: 21 | # Every 6 hours 22 | - cron: "0 */6 * * *" 23 | 24 | concurrency: 25 | group: ${{ github.workflow }}-${{ github.ref }} 26 | cancel-in-progress: true 27 | 28 | env: 29 | FORCE_COLOR: 1 30 | 31 | jobs: 32 | tests: 33 | 34 | runs-on: ubuntu-latest 35 | env: 36 | GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} 37 | ORG_LEVEL_TOKEN: ${{secrets.ORG_LEVEL_TOKEN}} 38 | strategy: 39 | matrix: 40 | python-version: 41 | - "3.13" 42 | - "3.12" 43 | - "3.11" 44 | - "3.10" 45 | - "3.9" 46 | # run the matrix jobs one after the other so they can benefit from caching 47 | max-parallel: 1 48 | 49 | steps: 50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 51 | - name: Get Date 52 | id: get-date 53 | run: | 54 | echo "date=$(/bin/date -u "+%Y%m%d")" >> $GITHUB_OUTPUT 55 | shell: bash 56 | 57 | - name: Cache github API responses 58 | uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 59 | with: 60 | # must match the path in tests/__init__.py 61 | path: '.cache/api_calls_tests_cache.sqlite' 62 | # github cache expires after 1wk, and we expire the content after 24h 63 | # this key is rotated every 24h so that the code does not find a stale 64 | # file in the cache. 
See issue #119 65 | key: api-cache-v4-${{ steps.get-date.outputs.date }} 66 | 67 | - name: Install Poetry 68 | uses: snok/install-poetry@76e04a911780d5b312d89783f7b1cd627778900a # v1.4.1 69 | with: 70 | # Version of Poetry to use 71 | version: 2.1.1 72 | virtualenvs-create: true 73 | virtualenvs-in-project: true 74 | - name: Set up Python ${{ matrix.python-version }} 75 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 76 | with: 77 | python-version: ${{ matrix.python-version }} 78 | cache: poetry 79 | - name: Install dependencies 80 | run: | 81 | poetry env use ${{ matrix.python-version }} 82 | poetry install 83 | - name: Type check with mypy 84 | id: type_check 85 | continue-on-error: true 86 | run: | 87 | poetry run mypy tap_github 88 | - name: Test with pytest 89 | id: test_pytest 90 | continue-on-error: true 91 | run: | 92 | LOGLEVEL=WARNING poetry run pytest --capture=no 93 | - name: Test with pytest (run 2) 94 | id: retry_test_pytest 95 | if: steps.test_pytest.outcome=='failure' # check the step outcome, wait and retry 96 | run: | 97 | # sleep as little as possible to reduce CI run time 98 | # This assumes that REST quota is the one that caused problem 99 | # (which is most likely/often the case) 100 | target_ts=$(curl -s -H "Accept: application/vnd.github+json" -H "Authorization: Bearer $GITHUB_TOKEN" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/rate_limit | grep reset | head -n 1 | awk -F: '{ print $2 }') 101 | current_ts=$(date +%s) 102 | seconds_to_sleep=$(echo "$target_ts - $current_ts" | bc) 103 | sleep $seconds_to_sleep 104 | LOGLEVEL=WARNING poetry run pytest --capture=no 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Meltano hidden files 2 | .meltano 3 | 4 | # Test output 5 | .output 6 | 7 | # Secrets and internal config files 8 | **/.secrets/* 9 | 10 | # 
Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 104 | __pypackages__/ 105 | 106 | # Celery stuff 107 | celerybeat-schedule 108 | celerybeat.pid 109 | 110 | # SageMath parsed files 111 | *.sage.py 112 | 113 | # Environments 114 | .env 115 | .venv 116 | env/ 117 | venv/ 118 | ENV/ 119 | env.bak/ 120 | venv.bak/ 121 | 122 | # Spyder project settings 123 | .spyderproject 124 | .spyproject 125 | 126 | # Rope project settings 127 | .ropeproject 128 | 129 | # mkdocs documentation 130 | /site 131 | 132 | # mypy 133 | .mypy_cache/ 134 | .dmypy.json 135 | dmypy.json 136 | 137 | # IDE 138 | .idea/ 139 | .vscode/ 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | #Other 145 | .DS_Store 146 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: true 3 | autoupdate_schedule: monthly 4 | autoupdate_commit_msg: 'chore: pre-commit autoupdate' 5 | 6 | repos: 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v5.0.0 9 | hooks: 10 | - id: check-json 11 | exclude: "\\.vscode/.*.json" 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: end-of-file-fixer 15 | - id: trailing-whitespace 16 | 17 | 18 | - repo: https://github.com/astral-sh/ruff-pre-commit 19 | rev: v0.11.12 20 | hooks: 21 | - id: ruff 22 | args: [ --fix ] 23 | - id: ruff-format 24 | -------------------------------------------------------------------------------- /.secrets/.gitignore: -------------------------------------------------------------------------------- 1 | # IMPORTANT! This folder is hidden from git - if you need to store config files or other secrets, 2 | # make sure those are never staged for commit into your git repo. You can store them here or another 3 | # secure location. 
4 | 5 | * 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /.sonarcloud.properties: -------------------------------------------------------------------------------- 1 | sonar.python.version=3.9, 3.10, 3.11, 3.12, 3.13 2 | sonar.cpd.exclusions=**/* 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tap-github 2 | 3 | `tap-github` is a Singer tap for GitHub. 4 | 5 | Built with the [Singer SDK](https://gitlab.com/meltano/singer-sdk). 6 | 7 | ## Installation 8 | 9 | ```bash 10 | # use uv (https://docs.astral.sh/uv/) 11 | uv tool install meltanolabs-tap-github 12 | 13 | # or pipx (https://pipx.pypa.io/stable/) 14 | pipx install meltanolabs-tap-github 15 | 16 | # or Meltano 17 | meltano add extractor tap-github 18 | ``` 19 | 20 | A list of release versions is available at https://github.com/MeltanoLabs/tap-github/releases 21 | 22 | ## Configuration 23 | 24 | ### Accepted Config Options 25 | 26 | This tap accepts the following configuration options: 27 | 28 | - Required: One and only one of the following modes: 29 | 1. `repositories`: An array of strings specifying the GitHub repositories to be included. Each element of the array should be of the form `/`, e.g. `MeltanoLabs/tap-github`. 30 | 2. 
`organizations`: An array of strings containing the github organizations to be included 31 | 3. `searches`: An array of search descriptor objects with the following properties: 32 | - `name`: A human readable name for the search query 33 | - `query`: A github search string (generally the same as would come after `?q=` in the URL) 34 | 4. `user_usernames`: A list of github usernames 35 | 5. `user_ids`: A list of github user ids [int] 36 | - Highly recommended: 37 | - Personal access tokens (PATs) for authentication can be provided in 3 ways: 38 | - `auth_token` - Takes a single token. 39 | - `additional_auth_tokens` - Takes a list of tokens. Can be used together with `auth_token` or as the sole source of PATs. 40 | - Any environment variables beginning with `GITHUB_TOKEN` will be assumed to be PATs. These tokens will be used in addition to `auth_token` (if provided), but will not be used if `additional_auth_tokens` is provided. 41 | - GitHub App keys are another option for authentication, and can be used in combination with PATs if desired. App IDs and keys should be assembled into the format `:app_id:;;-----BEGIN RSA PRIVATE KEY-----\n_YOUR_P_KEY_\n-----END RSA PRIVATE KEY-----` where the key can be generated from the `Private keys` section on https://github.com/organizations/:organization_name/settings/apps/:app_name. Read more about GitHub App quotas [here](https://docs.github.com/en/enterprise-server@3.3/developers/apps/building-github-apps/rate-limits-for-github-apps#server-to-server-requests). Formatted app keys can be provided in 2 ways: 42 | - `auth_app_keys` - List of GitHub App keys in the prescribed format. 43 | - If `auth_app_keys` is not provided but there is an environment variable with the name `GITHUB_APP_PRIVATE_KEY`, it will be assumed to be an App key in the prescribed format. 
44 | - Optional: 45 | - `user_agent` 46 | - `start_date` 47 | - `metrics_log_level` 48 | - `stream_maps` 49 | - `stream_maps_config` 50 | - `stream_options`: Options which can change the behaviour of a specific stream are nested within. 51 | - `milestones`: Valid options for the `milestones` stream are nested within. 52 | - `state`: Determines which milestones will be extracted. One of `open` (default), `closed`, `all`. 53 | - `rate_limit_buffer`: A buffer to avoid consuming all query points for the auth_token at hand. Defaults to 1000. 54 | - `expiry_time_buffer`: A buffer used when determining when to refresh GitHub app tokens. Only relevant when authenticating as a GitHub app. Defaults to 10 minutes. Tokens generated by GitHub apps expire 1 hour after creation, and will be refreshed once fewer than `expiry_time_buffer` minutes remain until the anticipated expiry time. 55 | 56 | Note that modes 1-3 are `repository` modes and 4-5 are `user` modes and will not run the same set of streams. 57 | 58 | A full list of supported settings and capabilities for this tap is available by running: 59 | 60 | ```bash 61 | tap-github --about 62 | ``` 63 | 64 | ### Source Authentication and Authorization 65 | 66 | A small number of records may be pulled without an auth token. However, a Github auth token should generally be considered "required" since it gives more realistic rate limits. (See GitHub API docs for more info.) 67 | 68 | ## Usage 69 | 70 | ### API Limitation - Pagination 71 | 72 | The GitHub API is limited for some resources such as `/events`. For some resources, users might encounter the following error: 73 | 74 | ``` 75 | In order to keep the API fast for everyone, pagination is limited for this resource. Check the rel=last link relation in the Link response header to see how far back you can traverse. 76 | ``` 77 | 78 | To avoid this, the GitHub streams will exit early. I.e. when there are no more `next page` available. 
If you are fetching `/events` at the repository level, beware of leaving the tap disabled for longer than a few days or you will have gaps in your data. 79 | 80 | You can easily run `tap-github` by itself or in a pipeline using [Meltano](https://meltano.com). 81 | 82 | ### Notes regarding permissions 83 | 84 | * For the `traffic_*` streams, [you will need write access to the repository](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28). You can enable extraction for these streams by [selecting them in the catalog](https://hub.meltano.com/singer/spec/#metadata). 85 | 86 | ### Executing the Tap Directly 87 | 88 | ```bash 89 | tap-github --version 90 | tap-github --help 91 | tap-github --config CONFIG --discover > ./catalog.json 92 | ``` 93 | 94 | ## Contributing 95 | This project uses parent-child streams. Learn more about them [here](https://gitlab.com/meltano/sdk/-/blob/main/docs/parent_streams.md). 96 | 97 | ### Initialize your Development Environment 98 | 99 | ```bash 100 | pipx install poetry 101 | poetry install 102 | ``` 103 | 104 | ### Create and Run Tests 105 | 106 | Create tests within the `tap_github/tests` subfolder and 107 | then run: 108 | 109 | ```bash 110 | poetry run pytest 111 | ``` 112 | 113 | You can also test the `tap-github` CLI interface directly using `poetry run`: 114 | 115 | ```bash 116 | poetry run tap-github --help 117 | ``` 118 | 119 | ### Testing with [Meltano](https://meltano.com) 120 | 121 | _**Note:** This tap will work in any Singer environment and does not require Meltano. 122 | Examples here are for convenience and to streamline end-to-end orchestration scenarios._ 123 | 124 | Your project comes with a custom `meltano.yml` project file already created. Open the `meltano.yml` and follow any _"TODO"_ items listed in 125 | the file. 
126 | 127 | Next, install Meltano (if you haven't already) and any needed plugins: 128 | 129 | ```bash 130 | # Install meltano 131 | pipx install meltano 132 | # Initialize meltano within this directory 133 | cd tap-github 134 | meltano install 135 | ``` 136 | 137 | Now you can test and orchestrate using Meltano: 138 | 139 | ```bash 140 | # Test invocation: 141 | meltano invoke tap-github --version 142 | # OR run a test `elt` pipeline: 143 | meltano elt tap-github target-jsonl 144 | ``` 145 | 146 | One-liner to recreate output directory, run elt, and write out state file: 147 | 148 | ```bash 149 | # Update this when you want a fresh state file: 150 | TESTJOB=testjob1 151 | 152 | # Run everything in one line 153 | mkdir -p .output && meltano elt tap-github target-jsonl --job_id $TESTJOB && meltano elt tap-github target-jsonl --job_id $TESTJOB --dump=state > .output/state.json 154 | ``` 155 | 156 | ### Singer SDK Dev Guide 157 | 158 | See the [dev guide](../../docs/dev_guide.md) for more instructions on how to use the Singer SDK to 159 | develop your own taps and targets. 
160 | -------------------------------------------------------------------------------- /config-sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "searches": [ 3 | { 4 | "name": "test_search", 5 | "query": "target-athena+fork:only" 6 | } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "repositories": ["indeedeng/proctor"], 3 | "start_date": "2022-05-16" 4 | } 5 | -------------------------------------------------------------------------------- /meltano.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | send_anonymous_usage_stats: false 3 | project_id: 96584f7b-a36c-46e0-b41a-7f9074293137 4 | venv: 5 | backend: uv 6 | plugins: 7 | extractors: 8 | - name: tap-github 9 | namespace: tap_github 10 | pip_url: -e . 11 | capabilities: 12 | - state 13 | - catalog 14 | - discover 15 | settings: 16 | - name: user_agent 17 | kind: string 18 | - name: metrics_log_level 19 | kind: string 20 | - name: auth_token 21 | kind: password 22 | - name: additional_auth_tokens 23 | kind: array 24 | - name: auth_app_keys 25 | kind: array 26 | - name: rate_limit_buffer 27 | kind: integer 28 | - name: expiry_time_buffer 29 | kind: integer 30 | - name: searches 31 | kind: array 32 | - name: organizations 33 | kind: array 34 | - name: repositories 35 | kind: array 36 | - name: user_usernames 37 | kind: array 38 | - name: user_ids 39 | kind: array 40 | - name: stream_options.milestones.state 41 | kind: options 42 | options: 43 | - label: Open 44 | value: open 45 | - label: Closed 46 | value: closed 47 | - label: All 48 | value: all 49 | - name: start_date 50 | kind: date_iso8601 51 | value: '2010-01-01T00:00:00Z' 52 | - name: stream_maps 53 | kind: object 54 | - name: stream_map_config 55 | kind: object 56 | select: 57 | - '*.*' 58 | 
loaders: 59 | - name: target-jsonl 60 | variant: andyh1203 61 | pip_url: target-jsonl 62 | config: 63 | destination_path: .output 64 | do_timestamp_file: true 65 | -------------------------------------------------------------------------------- /plugins/loaders/target-jsonl--andyh1203.lock: -------------------------------------------------------------------------------- 1 | { 2 | "plugin_type": "loaders", 3 | "name": "target-jsonl", 4 | "namespace": "target_jsonl", 5 | "variant": "andyh1203", 6 | "label": "JSON Lines (JSONL)", 7 | "docs": "https://hub.meltano.com/loaders/target-jsonl--andyh1203", 8 | "repo": "https://github.com/andyh1203/target-jsonl", 9 | "pip_url": "target-jsonl", 10 | "description": "JSONL loader", 11 | "logo_url": "https://hub.meltano.com/assets/logos/loaders/jsonl.png", 12 | "settings": [ 13 | { 14 | "name": "destination_path", 15 | "kind": "string", 16 | "value": "output", 17 | "label": "Destination Path", 18 | "description": "Sets the destination path the JSONL files are written to, relative\nto the project root.\n\nThe directory needs to exist already, it will not be created\nautomatically.\n\nTo write JSONL files to the project root, set an empty string (`\"\"`).\n" 19 | }, 20 | { 21 | "name": "do_timestamp_file", 22 | "kind": "boolean", 23 | "value": false, 24 | "label": "Include Timestamp in File Names", 25 | "description": "Specifies if the files should get timestamped.\n\nBy default, the resulting file will not have a timestamp in the file name (i.e. `exchange_rate.jsonl`).\n\nIf this option gets set to `true`, the resulting file will have a timestamp associated with it (i.e. 
`exchange_rate-{timestamp}.jsonl`).\n" 26 | }, 27 | { 28 | "name": "custom_name", 29 | "kind": "string", 30 | "label": "Custom File Name Override", 31 | "description": "Specifies a custom name for the filename, instead of the stream name.\n\nThe file name will be `{custom_name}-{timestamp}.jsonl`, if `do_timestamp_file` is `true`.\nOtherwise the file name will be `{custom_name}.jsonl`.\n\nIf custom name is not provided, the stream name will be used.\n" 32 | } 33 | ] 34 | } 35 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "meltanolabs-tap-github" 3 | version = "0.0.0" 4 | description = "Singer tap for GitHub, built with the Singer SDK." 5 | authors = ["Meltano and Meltano Community "] 6 | maintainers = [ 7 | "Meltano and Meltano Community ", 8 | "Edgar Ramírez-Mondragón ", 9 | ] 10 | homepage = "https://github.com/MeltanoLabs/tap-github" 11 | repository = "https://github.com/MeltanoLabs/tap-github" 12 | license = "Apache-2.0" 13 | keywords = ["Meltano", "Singer", "Meltano SDK", "Singer SDK", "ELT", "GitHub"] 14 | readme = "README.md" 15 | classifiers = [ 16 | "Intended Audience :: Developers", 17 | "License :: OSI Approved :: Apache Software License", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Programming Language :: Python :: 3.13", 24 | "Programming Language :: Python :: Implementation :: CPython", 25 | "Typing :: Typed", 26 | ] 27 | packages = [ 28 | { include = "tap_github", format = ["sdist", "wheel"] } 29 | ] 30 | 31 | [tool.poetry.urls] 32 | "Issue Tracker" = "https://github.com/MeltanoLabs/tap-github/issues" 33 | 34 | [tool.poetry.dependencies] 35 | beautifulsoup4 = "~=4.13.3" 36 | cryptography = { version = 
"~=45.0.2", python = ">3.9.0,<3.9.1 || >3.9.1" } 37 | nested-lookup = "~=0.2.25" 38 | PyJWT = "2.10.1" 39 | python = ">=3.9" 40 | python-dateutil = "~=2.9" 41 | requests = "~=2.32.3" 42 | # For local SDK dev: 43 | # singer-sdk = {path = "../singer-sdk", develop = true} 44 | singer-sdk = "~=0.46.0" 45 | 46 | [tool.poetry.group.dev.dependencies] 47 | mypy = ">=1.15.0" 48 | pytest = ">=7.3.1" 49 | requests-cache = ">=1.0.1" 50 | types-beautifulsoup4 = ">=4.12.0" 51 | types-python-dateutil = "~=2.9.0" 52 | types-requests = ">=2.30.0" 53 | types-simplejson = "~=3.20.0" 54 | 55 | [tool.poetry-dynamic-versioning] 56 | enable = true 57 | 58 | [[tool.mypy.overrides]] 59 | module = [ 60 | "backoff", 61 | "nested_lookup", 62 | ] 63 | ignore_missing_imports = true 64 | 65 | [build-system] 66 | requires = [ 67 | "poetry-core==2.1.1", 68 | "poetry-dynamic-versioning==1.8.2", 69 | 70 | ] 71 | build-backend = "poetry_dynamic_versioning.backend" 72 | 73 | [tool.poetry.scripts] 74 | # CLI declaration 75 | tap-github = 'tap_github.tap:cli' 76 | 77 | [tool.pytest.ini_options] 78 | markers = [ 79 | "repo_list: mark a test as using a list of repos in config", 80 | "username_list: mark a test as using a list of usernames in config", 81 | ] 82 | 83 | [tool.ruff] 84 | target-version = "py39" 85 | 86 | [tool.ruff.lint] 87 | ignore = [] 88 | select = [ 89 | "F", # Pyflakes 90 | "E", # pycodestyle (errors) 91 | "W", # pycodestyle (warnings) 92 | "I", # isort 93 | "N", # pep8-naming 94 | "UP", # pyupgrade 95 | "YTT", # flake8-2020 96 | "ANN", # flake8-annotations 97 | "B", # flake8-bugbear 98 | "A", # flake8-builtins 99 | "C4", # flake8-comprehensions 100 | "DTZ", # flake8-datetimez 101 | "FA", # flake8-future-annotations 102 | "SIM", # flake8-simplify 103 | "TC", # flake8-type-checking 104 | "PERF", # Perflint 105 | "FURB", # refurb 106 | "RUF", # Ruff-specific rules 107 | ] 108 | 109 | [tool.ruff.lint.per-file-ignores] 110 | "tap_github/tests/*" = ["ANN"] 111 | 
-------------------------------------------------------------------------------- /tap_github/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeltanoLabs/tap-github/0374f0768b1ffb2a3db0dd53591282830e553cf2/tap_github/__init__.py -------------------------------------------------------------------------------- /tap_github/authenticator.py: -------------------------------------------------------------------------------- 1 | """Classes to assist in authenticating to the GitHub API.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import time 7 | from copy import deepcopy 8 | from datetime import datetime, timedelta, timezone 9 | from os import environ 10 | from random import choice, shuffle 11 | from typing import TYPE_CHECKING, Any 12 | 13 | import jwt 14 | import requests 15 | from singer_sdk.authenticators import APIAuthenticatorBase 16 | 17 | if TYPE_CHECKING: 18 | from singer_sdk.streams import RESTStream 19 | 20 | 21 | class TokenManager: 22 | """A class to store a token's attributes and state. 23 | This parent class should not be used directly, use a subclass instead. 24 | """ 25 | 26 | DEFAULT_RATE_LIMIT = 5000 27 | # The DEFAULT_RATE_LIMIT_BUFFER buffer serves two purposes: 28 | # - keep some leeway and rotate tokens before erroring out on rate limit. 29 | # - not consume all available calls when we rare using an org or user token. 
30 | DEFAULT_RATE_LIMIT_BUFFER = 1000 31 | 32 | def __init__( 33 | self, 34 | token: str | None, 35 | rate_limit_buffer: int | None = None, 36 | logger: Any | None = None, # noqa: ANN401 37 | ) -> None: 38 | """Init TokenManager info.""" 39 | self.token = token 40 | self.logger = logger 41 | self.rate_limit = self.DEFAULT_RATE_LIMIT 42 | self.rate_limit_remaining = self.DEFAULT_RATE_LIMIT 43 | self.rate_limit_reset: datetime | None = None 44 | self.rate_limit_used = 0 45 | self.rate_limit_buffer = ( 46 | rate_limit_buffer 47 | if rate_limit_buffer is not None 48 | else self.DEFAULT_RATE_LIMIT_BUFFER 49 | ) 50 | 51 | def update_rate_limit(self, response_headers: Any) -> None: # noqa: ANN401 52 | self.rate_limit = int(response_headers["X-RateLimit-Limit"]) 53 | self.rate_limit_remaining = int(response_headers["X-RateLimit-Remaining"]) 54 | self.rate_limit_reset = datetime.fromtimestamp( 55 | int(response_headers["X-RateLimit-Reset"]), 56 | tz=timezone.utc, 57 | ) 58 | self.rate_limit_used = int(response_headers["X-RateLimit-Used"]) 59 | 60 | def is_valid_token(self) -> bool: 61 | """Try making a request with the current token. If the request succeeds return True, else False.""" # noqa: E501 62 | if not self.token: 63 | return False 64 | 65 | try: 66 | response = requests.get( 67 | url="https://api.github.com/rate_limit", 68 | headers={ 69 | "Authorization": f"token {self.token}", 70 | }, 71 | ) 72 | response.raise_for_status() 73 | return True 74 | except requests.exceptions.HTTPError: 75 | msg = ( 76 | f"A token could not be validated. " 77 | f"{response.status_code} Client Error: " 78 | f"{response.content!s} (Reason: {response.reason})" 79 | ) 80 | if self.logger is not None: 81 | self.logger.warning(msg) 82 | return False 83 | 84 | def has_calls_remaining(self) -> bool: 85 | """Check if a token has capacity to make more calls. 86 | 87 | Returns: 88 | True if the token is valid and has enough api calls remaining. 
89 | """ 90 | if self.rate_limit_reset is None: 91 | return True 92 | return self.rate_limit_used <= ( 93 | self.rate_limit - self.rate_limit_buffer 94 | ) or self.rate_limit_reset <= datetime.now(tz=timezone.utc) 95 | 96 | 97 | class PersonalTokenManager(TokenManager): 98 | """A class to store token rate limiting information.""" 99 | 100 | def __init__( 101 | self, 102 | token: str, 103 | rate_limit_buffer: int | None = None, 104 | **kwargs, # noqa: ANN003 105 | ) -> None: 106 | """Init PersonalTokenRateLimit info.""" 107 | super().__init__(token, rate_limit_buffer=rate_limit_buffer, **kwargs) 108 | 109 | 110 | def generate_jwt_token( 111 | github_app_id: str, 112 | github_private_key: str, 113 | expiration_time: int = 600, 114 | algorithm: str = "RS256", 115 | ) -> str: 116 | actual_time = int(time.time()) 117 | 118 | payload = { 119 | "iat": actual_time, 120 | "exp": actual_time + expiration_time, 121 | "iss": github_app_id, 122 | } 123 | token = jwt.encode(payload, github_private_key, algorithm=algorithm) 124 | 125 | if isinstance(token, bytes): 126 | token = token.decode("utf-8") 127 | 128 | return token 129 | 130 | 131 | def generate_app_access_token( 132 | github_app_id: str, 133 | github_private_key: str, 134 | github_installation_id: str | None = None, 135 | ) -> tuple[str, datetime]: 136 | produced_at = datetime.now(tz=timezone.utc) 137 | jwt_token = generate_jwt_token(github_app_id, github_private_key) 138 | 139 | headers = {"Authorization": f"Bearer {jwt_token}"} 140 | 141 | if github_installation_id is None: 142 | list_installations_resp = requests.get( 143 | url="https://api.github.com/app/installations", headers=headers 144 | ) 145 | list_installations_resp.raise_for_status() 146 | list_installations = list_installations_resp.json() 147 | 148 | if len(list_installations) == 0: 149 | raise Exception(f"No installations found for app {github_app_id}.") 150 | 151 | github_installation_id = choice(list_installations)["id"] 152 | 153 | url = 
f"https://api.github.com/app/installations/{github_installation_id}/access_tokens" 154 | resp = requests.post(url, headers=headers) 155 | 156 | if resp.status_code != 201: 157 | resp.raise_for_status() 158 | 159 | expires_at = produced_at + timedelta(hours=1) 160 | return resp.json()["token"], expires_at 161 | 162 | 163 | class AppTokenManager(TokenManager): 164 | """A class to store an app token's attributes and state, and handle token refreshing""" # noqa: E501 165 | 166 | DEFAULT_RATE_LIMIT = 15000 167 | DEFAULT_EXPIRY_BUFFER_MINS = 10 168 | 169 | def __init__( 170 | self, 171 | env_key: str, 172 | rate_limit_buffer: int | None = None, 173 | expiry_time_buffer: int | None = None, 174 | **kwargs, # noqa: ANN003 175 | ) -> None: 176 | if rate_limit_buffer is None: 177 | rate_limit_buffer = self.DEFAULT_RATE_LIMIT_BUFFER 178 | super().__init__(None, rate_limit_buffer=rate_limit_buffer, **kwargs) 179 | 180 | parts = env_key.split(";;") 181 | self.github_app_id = parts[0] 182 | self.github_private_key = (parts[1:2] or [""])[0].replace("\\n", "\n") 183 | self.github_installation_id: str | None = parts[2] if len(parts) >= 3 else None 184 | 185 | if expiry_time_buffer is None: 186 | expiry_time_buffer = self.DEFAULT_EXPIRY_BUFFER_MINS 187 | self.expiry_time_buffer = expiry_time_buffer 188 | 189 | self.token_expires_at: datetime | None = None 190 | self.claim_token() 191 | 192 | def claim_token(self) -> None: 193 | """Updates the TokenManager's token and token_expires_at attributes. 194 | 195 | The outcome will be _either_ that self.token is updated to a newly claimed valid token and 196 | self.token_expires_at is updated to the anticipated expiry time (erring on the side of an early estimate) 197 | _or_ self.token and self.token_expires_at are both set to None. 
198 | """ # noqa: E501 199 | self.token = None 200 | self.token_expires_at = None 201 | 202 | # Make sure we have the details we need 203 | if not self.github_app_id or not self.github_private_key: 204 | raise ValueError( 205 | "GITHUB_APP_PRIVATE_KEY could not be parsed. The expected format is " 206 | '":app_id:;;-----BEGIN RSA PRIVATE KEY-----\\n_YOUR_P_KEY_\\n-----END RSA PRIVATE KEY-----"' # noqa: E501 207 | ) 208 | 209 | self.token, self.token_expires_at = generate_app_access_token( 210 | self.github_app_id, self.github_private_key, self.github_installation_id 211 | ) 212 | 213 | # Check if the token isn't valid. If not, overwrite it with None 214 | if not self.is_valid_token(): 215 | if self.logger: 216 | self.logger.warning( 217 | "An app token was generated but could not be validated." 218 | ) 219 | self.token = None 220 | self.token_expires_at = None 221 | 222 | def has_calls_remaining(self) -> bool: 223 | """Check if a token has capacity to make more calls. 224 | 225 | Returns: 226 | True if the token is valid and has enough api calls remaining. 
227 | """ 228 | if self.token_expires_at is not None: 229 | close_to_expiry = datetime.now( 230 | tz=timezone.utc 231 | ) > self.token_expires_at - timedelta(minutes=self.expiry_time_buffer) 232 | 233 | if close_to_expiry: 234 | self.claim_token() 235 | if self.token is None: 236 | if self.logger: 237 | self.logger.warning("GitHub app token refresh failed.") 238 | return False 239 | else: 240 | if self.logger: 241 | self.logger.info("GitHub app token refresh succeeded.") 242 | 243 | return super().has_calls_remaining() 244 | 245 | 246 | class GitHubTokenAuthenticator(APIAuthenticatorBase): 247 | """Base class for offloading API auth.""" 248 | 249 | @staticmethod 250 | def get_env(): # noqa: ANN205 251 | return dict(environ) 252 | 253 | def prepare_tokens(self) -> list[TokenManager]: 254 | """Prep GitHub tokens""" 255 | 256 | env_dict = self.get_env() 257 | rate_limit_buffer = self._config.get("rate_limit_buffer", None) 258 | expiry_time_buffer = self._config.get("expiry_time_buffer", None) 259 | 260 | personal_tokens: set[str] = set() 261 | if "auth_token" in self._config: 262 | personal_tokens.add(self._config["auth_token"]) 263 | if "additional_auth_tokens" in self._config: 264 | personal_tokens = personal_tokens.union( 265 | self._config["additional_auth_tokens"] 266 | ) 267 | else: 268 | # Accept multiple tokens using environment variables GITHUB_TOKEN* 269 | env_tokens = { 270 | value 271 | for key, value in env_dict.items() 272 | if key.startswith("GITHUB_TOKEN") 273 | } 274 | if len(env_tokens) > 0: 275 | self.logger.info( 276 | f"Found {len(env_tokens)} 'GITHUB_TOKEN' environment variables for authentication." 
# noqa: E501 277 | ) 278 | personal_tokens = personal_tokens.union(env_tokens) 279 | 280 | personal_token_managers: list[TokenManager] = [] 281 | for token in personal_tokens: 282 | token_manager = PersonalTokenManager( 283 | token, rate_limit_buffer=rate_limit_buffer, logger=self.logger 284 | ) 285 | if token_manager.is_valid_token(): 286 | personal_token_managers.append(token_manager) 287 | else: 288 | logging.warning("A token was dismissed.") 289 | 290 | # Parse App level private keys and generate tokens 291 | # To simplify settings, we use a single env-key formatted as follows: 292 | # "{app_id};;{-----BEGIN RSA PRIVATE KEY-----\n_YOUR_PRIVATE_KEY_\n-----END RSA PRIVATE KEY-----}" # noqa: E501 293 | 294 | app_keys: set[str] = set() 295 | if "auth_app_keys" in self._config: 296 | app_keys = app_keys.union(self._config["auth_app_keys"]) 297 | self.logger.info( 298 | f"Provided {len(app_keys)} app keys via config for authentication." 299 | ) 300 | elif "GITHUB_APP_PRIVATE_KEY" in env_dict: 301 | app_keys.add(env_dict["GITHUB_APP_PRIVATE_KEY"]) 302 | self.logger.info( 303 | "Found 1 app key via environment variable for authentication." 304 | ) 305 | 306 | app_token_managers: list[TokenManager] = [] 307 | for app_key in app_keys: 308 | try: 309 | app_token_manager = AppTokenManager( 310 | app_key, 311 | rate_limit_buffer=rate_limit_buffer, 312 | expiry_time_buffer=expiry_time_buffer, 313 | logger=self.logger, 314 | ) 315 | if app_token_manager.is_valid_token(): 316 | app_token_managers.append(app_token_manager) 317 | except ValueError as e: # noqa: PERF203 318 | self.logger.warning( 319 | f"An error was thrown while preparing an app token: {e}" 320 | ) 321 | 322 | self.logger.info( 323 | f"Tap will run with {len(personal_token_managers)} personal auth tokens " 324 | f"and {len(app_token_managers)} app keys." 325 | ) 326 | return personal_token_managers + app_token_managers 327 | 328 | def __init__(self, stream: RESTStream) -> None: 329 | """Init authenticator. 
330 | 331 | Args: 332 | stream: A stream for a RESTful endpoint. 333 | """ 334 | super().__init__(stream=stream) 335 | self.logger: logging.Logger = stream.logger 336 | self.tap_name: str = stream.tap_name 337 | self._config: dict[str, Any] = dict(stream.config) 338 | self.token_managers = self.prepare_tokens() 339 | self.active_token: TokenManager | None = ( 340 | choice(self.token_managers) if self.token_managers else None 341 | ) 342 | 343 | def get_next_auth_token(self) -> None: 344 | current_token = self.active_token.token if self.active_token else "" 345 | token_managers = deepcopy(self.token_managers) 346 | shuffle(token_managers) 347 | for token_manager in token_managers: 348 | if ( 349 | token_manager.has_calls_remaining() 350 | and current_token != token_manager.token 351 | ): 352 | self.active_token = token_manager 353 | self.logger.info("Switching to fresh auth token") 354 | return 355 | 356 | raise RuntimeError( 357 | "All GitHub tokens have hit their rate limit. Stopping here." 358 | ) 359 | 360 | def update_rate_limit( 361 | self, response_headers: requests.models.CaseInsensitiveDict 362 | ) -> None: 363 | # If no token or only one token is available, return early. 364 | if len(self.token_managers) <= 1 or self.active_token is None: 365 | return 366 | 367 | self.active_token.update_rate_limit(response_headers) 368 | 369 | def authenticate_request( 370 | self, 371 | request: requests.PreparedRequest, 372 | ) -> requests.PreparedRequest: 373 | if self.active_token: 374 | # Make sure that our token is still valid or update it. 375 | if not self.active_token.has_calls_remaining(): 376 | self.get_next_auth_token() 377 | request.headers["Authorization"] = f"token {self.active_token.token}" 378 | else: 379 | self.logger.info( 380 | "No auth token detected. " 381 | "For higher rate limits, please specify `auth_token` in config." 
382 | ) 383 | return request 384 | -------------------------------------------------------------------------------- /tap_github/client.py: -------------------------------------------------------------------------------- 1 | """REST client handling, including GitHubStream base class.""" 2 | 3 | from __future__ import annotations 4 | 5 | import email.utils 6 | import inspect 7 | import random 8 | import time 9 | from typing import TYPE_CHECKING, Any, ClassVar, cast 10 | from urllib.parse import parse_qs, urlparse 11 | 12 | from dateutil.parser import parse 13 | from nested_lookup import nested_lookup 14 | from singer_sdk.exceptions import FatalAPIError, RetriableAPIError 15 | from singer_sdk.helpers.jsonpath import extract_jsonpath 16 | from singer_sdk.streams import GraphQLStream, RESTStream 17 | 18 | from tap_github.authenticator import GitHubTokenAuthenticator 19 | 20 | if TYPE_CHECKING: 21 | from collections.abc import Iterable 22 | from types import FrameType 23 | 24 | import requests 25 | from backoff.types import Details 26 | from singer_sdk.helpers.types import Context 27 | 28 | EMPTY_REPO_ERROR_STATUS = 409 29 | 30 | 31 | class GitHubRestStream(RESTStream): 32 | """GitHub Rest stream class.""" 33 | 34 | MAX_PER_PAGE = 100 # GitHub's limit is 100. 35 | MAX_RESULTS_LIMIT: int | None = None 36 | DEFAULT_API_BASE_URL = "https://api.github.com" 37 | LOG_REQUEST_METRIC_URLS = True 38 | 39 | # GitHub is missing the "since" parameter on a few endpoints 40 | # set this parameter to True if your stream needs to navigate data in descending order # noqa: E501 41 | # and try to exit early on its own. 42 | # This only has effect on streams whose `replication_key` is `updated_at`. 
43 | use_fake_since_parameter = False 44 | 45 | _authenticator: GitHubTokenAuthenticator | None = None 46 | 47 | @property 48 | def authenticator(self) -> GitHubTokenAuthenticator: 49 | if self._authenticator is None: 50 | self._authenticator = GitHubTokenAuthenticator(stream=self) 51 | return self._authenticator 52 | 53 | @property 54 | def url_base(self) -> str: 55 | return self.config.get("api_url_base", self.DEFAULT_API_BASE_URL) 56 | 57 | primary_keys: ClassVar[list[str]] = ["id"] 58 | replication_key: str | None = None 59 | tolerated_http_errors: ClassVar[list[int]] = [] 60 | 61 | @property 62 | def http_headers(self) -> dict[str, str]: 63 | """Return the http headers needed.""" 64 | headers = {"Accept": "application/vnd.github.v3+json"} 65 | headers["User-Agent"] = cast("str", self.config.get("user_agent", "tap-github")) 66 | return headers 67 | 68 | def get_next_page_token( 69 | self, 70 | response: requests.Response, 71 | previous_token: Any | None, # noqa: ANN401 72 | ) -> Any | None: # noqa: ANN401 73 | """Return a token for identifying next page or None if no more pages.""" 74 | if ( 75 | previous_token 76 | and self.MAX_RESULTS_LIMIT 77 | and ( 78 | cast("int", previous_token) * self.MAX_PER_PAGE 79 | >= self.MAX_RESULTS_LIMIT 80 | ) 81 | ): 82 | return None 83 | 84 | # Leverage header links returned by the GitHub API. 85 | if "next" not in response.links: 86 | return None 87 | 88 | resp_json = response.json() 89 | results = resp_json if isinstance(resp_json, list) else resp_json.get("items") 90 | 91 | # Exit early if the response has no items. ? Maybe duplicative the "next" link check. # noqa: E501 92 | if not results: 93 | return None 94 | 95 | # Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do not support # noqa: E501 96 | # the "since" parameter out of the box. So we use a workaround here to exit early. 
# noqa: E501 97 | # For such streams, we sort by descending dates (most recent first), and paginate # noqa: E501 98 | # "back in time" until we reach records before our "fake_since" parameter. 99 | if self.replication_key and self.use_fake_since_parameter: 100 | request_parameters = parse_qs(str(urlparse(response.request.url).query)) 101 | # parse_qs interprets "+" as a space, revert this to keep an aware datetime 102 | try: 103 | since = ( 104 | request_parameters["fake_since"][0].replace(" ", "+") 105 | if "fake_since" in request_parameters 106 | else "" 107 | ) 108 | except IndexError: 109 | return None 110 | 111 | direction = ( 112 | request_parameters["direction"][0] 113 | if "direction" in request_parameters 114 | else None 115 | ) 116 | 117 | # commit_timestamp is a constructed key which does not exist in the raw response # noqa: E501 118 | replication_date = ( 119 | results[-1][self.replication_key] 120 | if self.replication_key != "commit_timestamp" 121 | else results[-1]["commit"]["committer"]["date"] 122 | ) 123 | # exit early if the replication_date is before our since parameter 124 | if ( 125 | since 126 | and direction == "desc" 127 | and (parse(replication_date) < parse(since)) 128 | ): 129 | return None 130 | 131 | # Use header links returned by the GitHub API. 
132 | parsed_url = urlparse(response.links["next"]["url"]) 133 | captured_page_value_list = parse_qs(parsed_url.query).get("page") 134 | next_page_string = ( 135 | captured_page_value_list[0] if captured_page_value_list else None 136 | ) 137 | if next_page_string and next_page_string.isdigit(): 138 | return int(next_page_string) 139 | 140 | return (previous_token or 1) + 1 141 | 142 | def get_url_params( 143 | self, 144 | context: Context | None, 145 | next_page_token: Any | None, # noqa: ANN401 146 | ) -> dict[str, Any]: 147 | """Return a dictionary of values to be used in URL parameterization.""" 148 | params: dict = {"per_page": self.MAX_PER_PAGE} 149 | if next_page_token: 150 | params["page"] = next_page_token 151 | 152 | if self.replication_key == "updated_at": 153 | params["sort"] = "updated" 154 | params["direction"] = "desc" if self.use_fake_since_parameter else "asc" 155 | 156 | # Unfortunately the /starred, /stargazers (starred_at) and /events (created_at) endpoints do not support # noqa: E501 157 | # the "since" parameter out of the box. But we use a workaround in 'get_next_page_token'. # noqa: E501 158 | elif self.replication_key in ["starred_at", "created_at"]: 159 | params["sort"] = "created" 160 | params["direction"] = "desc" 161 | 162 | # Warning: /commits endpoint accept "since" but results are ordered by descending commit_timestamp # noqa: E501 163 | elif self.replication_key == "commit_timestamp": 164 | params["direction"] = "desc" 165 | 166 | elif self.replication_key: 167 | self.logger.warning( 168 | f"The replication key '{self.replication_key}' is not fully supported by this client yet." 
# noqa: E501 169 | ) 170 | 171 | since = self.get_starting_timestamp(context) 172 | since_key = "since" if not self.use_fake_since_parameter else "fake_since" 173 | if self.replication_key and since: 174 | params[since_key] = since.isoformat(sep="T") 175 | # Leverage conditional requests to save API quotas 176 | # https://github.community/t/how-does-if-modified-since-work/139627 177 | self.http_headers["If-modified-since"] = email.utils.format_datetime(since) 178 | return params 179 | 180 | def validate_response(self, response: requests.Response) -> None: 181 | """Validate HTTP response. 182 | 183 | In case an error is tolerated, continue without raising it. 184 | 185 | In case an error is deemed transient and can be safely retried, then this 186 | method should raise an :class:`singer_sdk.exceptions.RetriableAPIError`. 187 | 188 | Args: 189 | response: A `requests.Response`_ object. 190 | 191 | Raises: 192 | FatalAPIError: If the request is not retriable. 193 | RetriableAPIError: If the request is retriable. 194 | 195 | .. _requests.Response: 196 | https://docs.python-requests.org/en/latest/api/#requests.Response 197 | """ 198 | full_path = urlparse(response.url).path 199 | if response.status_code in ( 200 | [*self.tolerated_http_errors, EMPTY_REPO_ERROR_STATUS] 201 | ): 202 | msg = ( 203 | f"{response.status_code} Tolerated Status Code " 204 | f"(Reason: {response.reason}) for path: {full_path}" 205 | ) 206 | self.logger.info(msg) 207 | return 208 | 209 | if 400 <= response.status_code < 500: 210 | msg = ( 211 | f"{response.status_code} Client Error: " 212 | f"{response.content!s} (Reason: {response.reason}) for path: {full_path}" # noqa: E501 213 | ) 214 | # Retry on rate limiting 215 | if ( 216 | response.status_code == 403 217 | and "rate limit exceeded" in str(response.content).lower() 218 | ): 219 | # Update token 220 | self.authenticator.get_next_auth_token() 221 | # Raise an error to force a retry with the new token. 
222 | raise RetriableAPIError(msg, response) 223 | 224 | # Retry on secondary rate limit 225 | if ( 226 | response.status_code == 403 227 | and "secondary rate limit" in str(response.content).lower() 228 | ): 229 | # Wait about a minute and retry 230 | time.sleep(60 + 30 * random.random()) 231 | raise RetriableAPIError(msg, response) 232 | 233 | # The GitHub API randomly returns 401 Unauthorized errors, so we try again. 234 | if ( 235 | response.status_code == 401 236 | # if the token is invalid, we are also told about it 237 | and "bad credentials" not in str(response.content).lower() 238 | ): 239 | raise RetriableAPIError(msg, response) 240 | 241 | # all other errors are fatal 242 | # Note: The API returns a 404 "Not Found" if trying to read a repo 243 | # for which the token is not allowed access. 244 | raise FatalAPIError(msg) 245 | 246 | elif 500 <= response.status_code < 600: 247 | msg = ( 248 | f"{response.status_code} Server Error: " 249 | f"{response.content!s} (Reason: {response.reason}) for path: {full_path}" # noqa: E501 250 | ) 251 | raise RetriableAPIError(msg, response) 252 | 253 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 254 | """Parse the response and return an iterator of result rows.""" 255 | # TODO - Split into handle_reponse and parse_response. 256 | if response.status_code in ( 257 | [*self.tolerated_http_errors, EMPTY_REPO_ERROR_STATUS] 258 | ): 259 | return 260 | 261 | # Update token rate limit info and loop through tokens if needed. 
262 | self.authenticator.update_rate_limit(response.headers) 263 | 264 | resp_json = response.json() 265 | 266 | if isinstance(resp_json, list): 267 | results = resp_json 268 | elif resp_json.get("items") is not None: 269 | results = resp_json.get("items") 270 | else: 271 | results = [resp_json] 272 | 273 | yield from results 274 | 275 | def post_process(self, row: dict, context: Context | None = None) -> dict: 276 | """Add `repo_id` by default to all streams.""" 277 | if context is not None and "repo_id" in context: 278 | row["repo_id"] = context["repo_id"] 279 | return row 280 | 281 | def backoff_handler(self, details: Details) -> None: 282 | """Handle retriable error by swapping auth token.""" 283 | self.logger.info("Retrying request with different token") 284 | # use python introspection to obtain the error object 285 | # FIXME: replace this once https://github.com/litl/backoff/issues/158 286 | # is fixed 287 | exc = cast( 288 | "FrameType", 289 | cast("FrameType", cast("FrameType", inspect.currentframe()).f_back).f_back, 290 | ).f_locals["e"] 291 | if ( 292 | exc.response is not None 293 | and exc.response.status_code == 403 294 | and "rate limit exceeded" in str(exc.response.content) 295 | ): 296 | # we hit a rate limit, rotate token 297 | prepared_request = details["args"][0] 298 | self.authenticator.get_next_auth_token() 299 | prepared_request.headers.update(self.authenticator.auth_headers or {}) 300 | 301 | def calculate_sync_cost( 302 | self, 303 | request: requests.PreparedRequest, 304 | response: requests.Response, 305 | context: Context | None, 306 | ) -> dict[str, int]: 307 | """Return the cost of the last REST API call.""" 308 | return {"rest": 1, "graphql": 0, "search": 0} 309 | 310 | 311 | class GitHubDiffStream(GitHubRestStream): 312 | """Base class for GitHub diff streams.""" 313 | 314 | # Known Github API errors for diff requests 315 | tolerated_http_errors: ClassVar[list[int]] = [404, 406, 422, 502] 316 | 317 | @property 318 | def 
http_headers(self) -> dict: 319 | """Return the http headers needed for diff requests.""" 320 | headers = super().http_headers 321 | headers["Accept"] = "application/vnd.github.v3.diff" 322 | return headers 323 | 324 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 325 | """Parse the response to yield the diff text instead of an object 326 | and prevent buffer overflow.""" 327 | if response.status_code != 200: 328 | contents = response.json() 329 | self.logger.info( 330 | "Skipping %s due to %d error: %s", 331 | self.name.replace("_", " "), 332 | response.status_code, 333 | contents["message"], 334 | ) 335 | yield { 336 | "success": False, 337 | "error_message": contents["message"], 338 | } 339 | return 340 | 341 | if content_length_str := response.headers.get("Content-Length"): 342 | content_length = int(content_length_str) 343 | max_size = 41_943_040 # 40 MiB 344 | if content_length > max_size: 345 | self.logger.info( 346 | "Skipping %s. The diff size (%.2f MiB) exceeded the maximum" 347 | " size limit of 40 MiB.", 348 | self.name.replace("_", " "), 349 | content_length / 1024 / 1024, 350 | ) 351 | yield { 352 | "success": False, 353 | "error_message": "Diff exceeded the maximum size limit of 40 MiB.", 354 | } 355 | return 356 | 357 | yield {"diff": response.text, "success": True} 358 | 359 | 360 | class GitHubGraphqlStream(GraphQLStream, GitHubRestStream): 361 | """GitHub Graphql stream class.""" 362 | 363 | @property 364 | def url_base(self) -> str: 365 | return f"{self.config.get('api_url_base', self.DEFAULT_API_BASE_URL)}/graphql" 366 | 367 | # the jsonpath under which to fetch the list of records from the graphql response 368 | query_jsonpath: str = "$.data.[*]" 369 | 370 | def parse_response(self, response: requests.Response) -> Iterable[dict]: 371 | """Parse the response and return an iterator of result rows. 372 | 373 | Args: 374 | response: A raw `requests.Response`_ object. 
375 | 376 | Yields: 377 | One item for every item found in the response. 378 | 379 | .. _requests.Response: 380 | https://docs.python-requests.org/en/latest/api/#requests.Response 381 | """ 382 | resp_json = response.json() 383 | yield from extract_jsonpath(self.query_jsonpath, input=resp_json) 384 | 385 | def get_next_page_token( 386 | self, 387 | response: requests.Response, 388 | previous_token: Any | None, # noqa: ANN401 389 | ) -> Any | None: # noqa: ANN401 390 | """ 391 | Return a dict of cursors for identifying next page or None if no more pages. 392 | 393 | Note - pagination requires the Graphql query to have nextPageCursor_X parameters 394 | with the assosciated hasNextPage_X, startCursor_X and endCursor_X. 395 | 396 | X should be an integer between 0 and 9, increasing with query depth. 397 | 398 | Warning - we recommend to avoid using deep (nested) pagination. 399 | """ 400 | 401 | resp_json = response.json() 402 | 403 | # Find if results contains "hasNextPage_X" flags and if any are True. 404 | # If so, set nextPageCursor_X to endCursor_X for X max. 405 | 406 | next_page_results = nested_lookup( 407 | key="hasNextPage_", 408 | document=resp_json, 409 | wild=True, 410 | with_keys=True, 411 | ) 412 | 413 | has_next_page_indices: list[int] = [] 414 | # Iterate over all the items and filter items with hasNextPage = True. 415 | for key, value in next_page_results.items(): 416 | # Check if key is even then add pair to new dictionary 417 | if any(value): 418 | pagination_index = int(str(key).split("_")[1]) 419 | has_next_page_indices.append(pagination_index) 420 | 421 | # Check if any "hasNextPage" is True. Otherwise, exit early. 422 | if not len(has_next_page_indices) > 0: 423 | return None 424 | 425 | # Get deepest pagination item 426 | max_pagination_index = max(has_next_page_indices) 427 | 428 | # We leverage previous_token to remember the pagination cursors 429 | # for indices below max_pagination_index. 
430 | next_page_cursors: dict[str, str] = {} 431 | for key, value in (previous_token or {}).items(): 432 | # Only keep pagination info for indices below max_pagination_index. 433 | pagination_index = int(str(key).split("_")[1]) 434 | if pagination_index < max_pagination_index: 435 | next_page_cursors[key] = value 436 | 437 | # Get the pagination cursor to update and increment it. 438 | next_page_end_cursor_results = nested_lookup( 439 | key=f"endCursor_{max_pagination_index}", 440 | document=resp_json, 441 | ) 442 | 443 | next_page_key = f"nextPageCursor_{max_pagination_index}" 444 | next_page_cursor = next( 445 | cursor for cursor in next_page_end_cursor_results if cursor is not None 446 | ) 447 | next_page_cursors[next_page_key] = next_page_cursor 448 | 449 | return next_page_cursors 450 | 451 | def get_url_params( 452 | self, 453 | context: Context | None, 454 | next_page_token: Any | None, # noqa: ANN401 455 | ) -> dict[str, Any]: 456 | """Return a dictionary of values to be used in URL parameterization.""" 457 | params = dict(context) if context else {} 458 | params["per_page"] = self.MAX_PER_PAGE 459 | if next_page_token: 460 | params.update(next_page_token) 461 | 462 | since = self.get_starting_timestamp(context) 463 | if self.replication_key and since: 464 | params["since"] = since.isoformat(sep="T") 465 | 466 | return params 467 | 468 | def calculate_sync_cost( 469 | self, 470 | request: requests.PreparedRequest, 471 | response: requests.Response, 472 | context: Context | None, 473 | ) -> dict[str, int]: 474 | """Return the cost of the last graphql API call.""" 475 | costgen = extract_jsonpath("$.data.rateLimit.cost", input=response.json()) 476 | # calculate_sync_cost is called before the main response parsing. 477 | # In some cases, the tap crashes here before we have been able to 478 | # properly analyze where the error comes from, so we ignore these 479 | # costs to allow figuring out what happened downstream, by setting 480 | # them to 0. 
481 | cost = next(costgen, 0) 482 | return {"rest": 0, "graphql": int(cost), "search": 0} 483 | 484 | def validate_response(self, response: requests.Response) -> None: 485 | """Validate HTTP response. 486 | 487 | The graphql spec is a bit confusing around response codes 488 | (https://github.com/graphql/graphql-over-http/blob/main/spec/GraphQLOverHTTP.md#response) 489 | Github's API is a bit of a free adaptation of standards, so we 490 | choose fail immediately on error here, so that something is logged 491 | at the very minimum. 492 | 493 | Args: 494 | response: A `requests.Response`_ object. 495 | 496 | Raises: 497 | FatalAPIError: If the request is not retriable. 498 | RetriableAPIError: If the request is retriable. 499 | """ 500 | super().validate_response(response) 501 | rj = response.json() 502 | if "errors" in rj: 503 | msg = rj["errors"] 504 | raise FatalAPIError(f"Graphql error: {msg}", response) 505 | -------------------------------------------------------------------------------- /tap_github/organization_streams.py: -------------------------------------------------------------------------------- 1 | """User Stream types classes for tap-github.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Any, ClassVar 6 | 7 | from singer_sdk import typing as th # JSON Schema typing helpers 8 | 9 | from tap_github.client import GitHubRestStream 10 | 11 | if TYPE_CHECKING: 12 | from collections.abc import Iterable 13 | 14 | from singer_sdk.helpers.types import Context 15 | 16 | 17 | class OrganizationStream(GitHubRestStream): 18 | """Defines a GitHub Organization Stream. 
class OrganizationMembersStream(GitHubRestStream):
    """One record per member of each configured organization.

    API Reference: https://docs.github.com/en/rest/orgs/members?apiVersion=2022-11-28#list-organization-members
    """

    name = "organization_members"
    primary_keys: ClassVar[list[str]] = ["id"]
    path = "/orgs/{org}/members"
    # Full-table child stream: parent replication state is irrelevant here.
    ignore_parent_replication_key = True
    parent_stream_type = OrganizationStream
    state_partitioning_keys: ClassVar[list[str]] = ["org"]
    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        # Rest
        th.Property("login", th.StringType),
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("avatar_url", th.StringType),
        th.Property("gravatar_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("type", th.StringType),
        th.Property("site_admin", th.BooleanType),
    ).to_dict()


class TeamsStream(GitHubRestStream):
    """One record per team of each configured organization.

    API Reference: https://docs.github.com/en/rest/reference/teams#list-teams
    """

    name = "teams"
    primary_keys: ClassVar[list[str]] = ["id"]
    path = "/orgs/{org}/teams"
    ignore_parent_replication_key = True
    parent_stream_type = OrganizationStream
    state_partitioning_keys: ClassVar[list[str]] = ["org"]

    def get_child_context(self, record: dict, context: Context | None) -> dict:
        """Merge the team slug into the parent (org) context for child streams."""
        new_context = {"team_slug": record["slug"]}
        if context:
            return {
                **context,
                **new_context,
            }
        return new_context

    schema = th.PropertiesList(
        # Parent Keys
        th.Property("org", th.StringType),
        # Rest
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("name", th.StringType),
        th.Property("slug", th.StringType),
        th.Property("description", th.StringType),
        th.Property("privacy", th.StringType),
        th.Property("permission", th.StringType),
        th.Property("members_url", th.StringType),
        th.Property("repositories_url", th.StringType),
        # Nested parent team, present for child teams only.
        th.Property(
            "parent",
            th.ObjectType(
                th.Property("id", th.IntegerType),
                th.Property("node_id", th.StringType),
                th.Property("url", th.StringType),
                th.Property("html_url", th.StringType),
                th.Property("name", th.StringType),
                th.Property("slug", th.StringType),
                th.Property("description", th.StringType),
                th.Property("privacy", th.StringType),
                th.Property("permission", th.StringType),
                th.Property("members_url", th.StringType),
                th.Property("repositories_url", th.StringType),
            ),
        ),
    ).to_dict()


class TeamMembersStream(GitHubRestStream):
    """One record per member of each team.

    API Reference: https://docs.github.com/en/rest/reference/teams#list-team-members
    """

    name = "team_members"
    primary_keys: ClassVar[list[str]] = ["id", "team_slug"]
    path = "/orgs/{org}/teams/{team_slug}/members"
    ignore_parent_replication_key = True
    parent_stream_type = TeamsStream
    state_partitioning_keys: ClassVar[list[str]] = ["team_slug", "org"]

    def get_child_context(self, record: dict, context: Context | None) -> dict:
        """Merge the member login into the parent (org/team) context."""
        new_context = {"username": record["login"]}
        if context:
            return {
                **context,
                **new_context,
            }
        return new_context

    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        th.Property("team_slug", th.StringType),
        # Rest
        th.Property("login", th.StringType),
        th.Property("id", th.IntegerType),
        th.Property("node_id", th.StringType),
        th.Property("avatar_url", th.StringType),
        th.Property("gravatar_id", th.StringType),
        th.Property("url", th.StringType),
        th.Property("html_url", th.StringType),
        th.Property("type", th.StringType),
        th.Property("site_admin", th.BooleanType),
    ).to_dict()


class TeamRolesStream(GitHubRestStream):
    """One record per (team, user) membership, with the member's role.

    API Reference: https://docs.github.com/en/rest/reference/teams#get-team-membership-for-a-user
    """

    name = "team_roles"
    path = "/orgs/{org}/teams/{team_slug}/memberships/{username}"
    ignore_parent_replication_key = True
    primary_keys: ClassVar[list[str]] = ["url"]
    parent_stream_type = TeamMembersStream
    state_partitioning_keys: ClassVar[list[str]] = ["username", "team_slug", "org"]

    schema = th.PropertiesList(
        # Parent keys
        th.Property("org", th.StringType),
        th.Property("team_slug", th.StringType),
        th.Property("username", th.StringType),
        # Rest
        th.Property("url", th.StringType),
        th.Property("role", th.StringType),
        th.Property("state", th.StringType),
    ).to_dict()
--------------------------------------------------------------------------------
/tap_github/schema_objects.py:
--------------------------------------------------------------------------------
"""Reusable schema objects for tap-github.

Below are a few common patterns in the github API
factored out as reusable objects. They help in making the
schema more readable and error-free.
"""

from singer_sdk import typing as th  # JSON Schema typing helpers
6 | """ 7 | 8 | from singer_sdk import typing as th # JSON Schema typing helpers 9 | 10 | # This user object is common throughout the API results 11 | user_object = th.ObjectType( 12 | th.Property("login", th.StringType), 13 | th.Property("id", th.IntegerType), 14 | th.Property("node_id", th.StringType), 15 | th.Property("avatar_url", th.StringType), 16 | th.Property("gravatar_id", th.StringType), 17 | th.Property("html_url", th.StringType), 18 | th.Property("type", th.StringType), 19 | th.Property("site_admin", th.BooleanType), 20 | ) 21 | 22 | # some objects are shared between issues and pull requests 23 | label_object = th.ObjectType( 24 | th.Property("id", th.IntegerType), 25 | th.Property("node_id", th.StringType), 26 | th.Property("url", th.StringType), 27 | th.Property("name", th.StringType), 28 | th.Property("description", th.StringType), 29 | th.Property("color", th.StringType), 30 | th.Property("default", th.BooleanType), 31 | ) 32 | 33 | milestone_object = th.ObjectType( 34 | th.Property("html_url", th.StringType), 35 | th.Property("node_id", th.StringType), 36 | th.Property("id", th.IntegerType), 37 | th.Property("number", th.IntegerType), 38 | th.Property("state", th.StringType), 39 | th.Property("title", th.StringType), 40 | th.Property("description", th.StringType), 41 | th.Property("creator", user_object), 42 | th.Property("open_issues", th.IntegerType), 43 | th.Property("closed_issues", th.IntegerType), 44 | th.Property("created_at", th.DateTimeType), 45 | th.Property("updated_at", th.DateTimeType), 46 | th.Property("closed_at", th.DateTimeType), 47 | th.Property("due_on", th.DateTimeType), 48 | ) 49 | 50 | reactions_object = th.ObjectType( 51 | th.Property("url", th.StringType), 52 | th.Property("total_count", th.IntegerType), 53 | th.Property("plus_one", th.IntegerType), 54 | th.Property("minus_one", th.IntegerType), 55 | th.Property("laugh", th.IntegerType), 56 | th.Property("hooray", th.IntegerType), 57 | th.Property("confused", 
th.IntegerType), 58 | th.Property("heart", th.IntegerType), 59 | th.Property("rocket", th.IntegerType), 60 | th.Property("eyes", th.IntegerType), 61 | ) 62 | 63 | files_object = th.ObjectType( 64 | th.Property("sha", th.StringType), 65 | th.Property("filename", th.StringType), 66 | th.Property("status", th.StringType), 67 | th.Property("additions", th.IntegerType), 68 | th.Property("deletions", th.IntegerType), 69 | th.Property("changes", th.IntegerType), 70 | th.Property("blob_url", th.StringType), 71 | th.Property("raw_url", th.StringType), 72 | th.Property("contents_url", th.StringType), 73 | th.Property("patch", th.StringType), 74 | th.Property("previous_filename", th.StringType), 75 | ) 76 | -------------------------------------------------------------------------------- /tap_github/scraping.py: -------------------------------------------------------------------------------- 1 | """Utility functions for scraping https://github.com 2 | 3 | Inspired by https://github.com/dogsheep/github-to-sqlite/pull/70 4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | import logging 9 | import re 10 | import time 11 | from datetime import datetime, timezone 12 | from typing import TYPE_CHECKING, Any, cast 13 | from urllib.parse import urlparse 14 | 15 | import requests 16 | 17 | if TYPE_CHECKING: 18 | from collections.abc import Iterable 19 | 20 | from bs4 import NavigableString, Tag 21 | 22 | used_by_regex = re.compile(" {3}Used by ") 23 | contributors_regex = re.compile(" {3}Contributors ") 24 | 25 | 26 | def scrape_dependents( 27 | response: requests.Response, logger: logging.Logger | None = None 28 | ) -> Iterable[dict[str, Any]]: 29 | from bs4 import BeautifulSoup 30 | 31 | logger = logger or logging.getLogger("scraping") 32 | 33 | soup = BeautifulSoup(response.content, "html.parser") 34 | # Navigate through Package toggle if present 35 | base_url = urlparse(response.url).hostname or "github.com" 36 | options = soup.find_all("a", class_="select-menu-item") 37 | 
links = [link["href"] for link in options] if len(options) > 0 else [response.url] 38 | 39 | logger.debug(links) 40 | 41 | for link in links: 42 | yield from _scrape_dependents(f"https://{base_url}/{link}", logger) 43 | 44 | 45 | def _scrape_dependents(url: str, logger: logging.Logger) -> Iterable[dict[str, Any]]: 46 | # Optional dependency: 47 | from bs4 import BeautifulSoup 48 | 49 | s = requests.Session() 50 | 51 | while url: 52 | logger.debug(url) 53 | response = s.get(url) 54 | soup = BeautifulSoup(response.content, "html.parser") 55 | 56 | repo_names = [ 57 | (a["href"] if not isinstance(a["href"], list) else a["href"][0]).lstrip("/") 58 | for a in soup.select("a[data-hovercard-type=repository]") 59 | ] 60 | stars = [ 61 | int(s.next_sibling.strip()) 62 | for s in soup.find_all("svg", {"class": "octicon octicon-star"}) 63 | ] 64 | forks = [ 65 | int(s.next_sibling.strip()) 66 | for s in soup.find_all("svg", {"class": "octicon octicon-repo-forked"}) 67 | ] 68 | 69 | if not len(repo_names) == len(stars) == len(forks): 70 | raise IndexError( 71 | "Could not find star and fork info. Maybe the GitHub page format has changed?" # noqa: E501 72 | ) 73 | 74 | repos = [ 75 | {"name_with_owner": name, "stars": s, "forks": f} 76 | for name, s, f in zip(repo_names, stars, forks) 77 | ] 78 | 79 | logger.debug(repos) 80 | 81 | yield from repos 82 | 83 | # next page? 84 | try: 85 | next_link: Tag = soup.select(".paginate-container")[0].find_all( 86 | "a", text="Next" 87 | )[0] 88 | except IndexError: 89 | break 90 | if next_link is not None: 91 | href = next_link["href"] 92 | url = str(href if not isinstance(href, list) else href[0]) 93 | time.sleep(1) 94 | else: 95 | url = "" 96 | 97 | 98 | def parse_counter(tag: Tag | NavigableString | None) -> int: 99 | """ 100 | Extract a count of [issues|PR|contributors...] from an HTML tag. 101 | For very high numbers, we only get an approximate value as github 102 | does not provide the actual number. 
103 | """ 104 | if not tag: 105 | return 0 106 | try: 107 | if tag == "\n": 108 | return 0 109 | title = tag["title"] # type: ignore 110 | if isinstance(title, str): 111 | title_string = cast("str", title) 112 | else: 113 | title_string = cast("str", title[0]) 114 | return int(title_string.strip().replace(",", "").replace("+", "")) 115 | except (KeyError, ValueError) as e: 116 | raise IndexError( 117 | f"Could not parse counter {tag}. Maybe the GitHub page format has changed?" 118 | ) from e 119 | 120 | 121 | def scrape_metrics( 122 | response: requests.Response, logger: logging.Logger | None = None 123 | ) -> Iterable[dict[str, Any]]: 124 | from bs4 import BeautifulSoup 125 | 126 | logger = logger or logging.getLogger("scraping") 127 | 128 | soup = BeautifulSoup(response.content, "html.parser") 129 | 130 | try: 131 | issues = parse_counter(soup.find("span", id="issues-repo-tab-count")) 132 | prs = parse_counter(soup.find("span", id="pull-requests-repo-tab-count")) 133 | except IndexError as e: 134 | # These two items should exist. We raise an error if we could not find them. 135 | raise IndexError( 136 | "Could not find issues or prs info. Maybe the GitHub page format has changed?" # noqa: E501 137 | ) from e 138 | 139 | dependents_node = soup.find(string=used_by_regex) 140 | # verify that we didn't hit some random text in the page. 
--------------------------------------------------------------------------------
/tap_github/streams.py:
--------------------------------------------------------------------------------
from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING

from tap_github.organization_streams import (
    OrganizationMembersStream,
    OrganizationStream,
    TeamMembersStream,
    TeamRolesStream,
    TeamsStream,
)
from tap_github.repository_streams import (
    AnonymousContributorsStream,
    AssigneesStream,
    BranchesStream,
    CollaboratorsStream,
    CommitCommentsStream,
    CommitDiffsStream,
    CommitsStream,
    CommunityProfileStream,
    ContributorsStream,
    CustomPropertiesStream,
    DependenciesStream,
    DependentsStream,
    DeploymentsStream,
    DeploymentStatusesStream,
    EventsStream,
    ExtraMetricsStream,
    IssueCommentsStream,
    IssueEventsStream,
    IssuesStream,
    LabelsStream,
    LanguagesStream,
    MilestonesStream,
    PullRequestCommitDiffsStream,
    PullRequestCommitsStream,
    PullRequestDiffsStream,
    PullRequestsStream,
    ReadmeHtmlStream,
    ReadmeStream,
    ReleasesStream,
    RepositoryStream,
    ReviewCommentsStream,
    ReviewsStream,
    StargazersGraphqlStream,
    StargazersStream,
    StatsContributorsStream,
    TagsStream,
    TrafficClonesStream,
    TrafficPageViewsStream,
    TrafficReferralPathsStream,
    TrafficReferrersStream,
    WorkflowRunJobsStream,
    WorkflowRunsStream,
    WorkflowsStream,
)
from tap_github.user_streams import StarredStream, UserContributedToStream, UserStream

if TYPE_CHECKING:
    from singer_sdk.streams.core import Stream


class Streams(Enum):
    """
    Represents all streams our tap supports, and which queries (by username, by organization, etc.) you can use.
    """  # noqa: E501

    # Each enum member's value is a (valid_queries, streams) tuple,
    # unpacked by __init__ below.
    valid_queries: set[str]
    streams: list[type[Stream]]

    def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None:
        self.valid_queries = valid_queries
        self.streams = streams

    REPOSITORY = (
        {"repositories", "organizations", "searches"},
        [
            AnonymousContributorsStream,
            AssigneesStream,
            BranchesStream,
            CollaboratorsStream,
            CommitCommentsStream,
            CommitsStream,
            CommitDiffsStream,
            CommunityProfileStream,
            ContributorsStream,
            DependenciesStream,
            DependentsStream,
            DeploymentsStream,
            DeploymentStatusesStream,
            EventsStream,
            IssueCommentsStream,
            IssueEventsStream,
            IssuesStream,
            LabelsStream,
            LanguagesStream,
            MilestonesStream,
            PullRequestCommitsStream,
            PullRequestCommitDiffsStream,
            PullRequestDiffsStream,
            PullRequestsStream,
            ReadmeHtmlStream,
            ReadmeStream,
            ReleasesStream,
            ExtraMetricsStream,
            RepositoryStream,
            ReviewCommentsStream,
            ReviewsStream,
            StargazersGraphqlStream,
            StargazersStream,
            StatsContributorsStream,
            TagsStream,
            TrafficClonesStream,
            TrafficPageViewsStream,
            TrafficReferralPathsStream,
            TrafficReferrersStream,
            WorkflowRunJobsStream,
            WorkflowRunsStream,
            WorkflowsStream,
        ],
    )
    USERS = (
        {"user_usernames", "user_ids"},
        [
            StarredStream,
            UserContributedToStream,
            UserStream,
        ],
    )
    ORGANIZATIONS = (
        {"organizations"},
        [
            CustomPropertiesStream,
            OrganizationStream,
            OrganizationMembersStream,
            TeamMembersStream,
            TeamRolesStream,
            TeamsStream,
        ],
    )

    @classmethod
    def all_valid_queries(cls) -> set[str]:
        # Union of every member's valid query keys.
        return set.union(*[stream.valid_queries for stream in Streams])
--------------------------------------------------------------------------------
/tap_github/tap.py:
--------------------------------------------------------------------------------
"""GitHub tap class."""

from __future__ import annotations

import logging
import os

from singer_sdk import Stream, Tap
from singer_sdk import typing as th  # JSON schema typing helpers
from singer_sdk.helpers._classproperty import classproperty

from tap_github.streams import Streams


class TapGitHub(Tap):
    """Singer tap for the GitHub API."""

    name = "tap-github"
    package_name = "meltanolabs-tap-github"

    @classproperty
    def logger(cls) -> logging.Logger:  # noqa: N805
        """Get logger.

        Returns:
            Logger with local LOGLEVEL. LOGLEVEL from env takes priority.
        """

        LOGLEVEL = os.environ.get("LOGLEVEL", "INFO").upper()  # noqa: N806
        # NOTE(review): `assert` is stripped when Python runs with -O;
        # consider raising ValueError for invalid LOGLEVEL instead.
        assert LOGLEVEL in logging._levelToName.values(), (
            f"Invalid LOGLEVEL configuration: {LOGLEVEL}"
        )
        logger = logging.getLogger(cls.name)
        logger.setLevel(LOGLEVEL)
        return logger

    # JSON-schema for tap settings, surfaced via --about/--capabilities.
    config_jsonschema = th.PropertiesList(
        th.Property(
            "user_agent",
            th.StringType,
            description="User agent to use for API requests.",
        ),
        th.Property("metrics_log_level", th.StringType),
        # Authentication options
        th.Property(
            "auth_token",
            th.StringType,
            description="GitHub token to authenticate with.",
        ),
        th.Property(
            "additional_auth_tokens",
            th.ArrayType(th.StringType),
            description="List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits.",  # noqa: E501
        ),
        th.Property(
            "auth_app_keys",
            th.ArrayType(th.StringType),
            description=(
                "List of GitHub App credentials to authenticate with. Each credential "
                "can be constructed by combining an App ID and App private key into "
                "the format `:app_id:;;-----BEGIN RSA PRIVATE KEY-----\n_YOUR_P_KEY_\n-----END RSA PRIVATE KEY-----`."  # noqa: E501
            ),
        ),
        th.Property(
            "rate_limit_buffer",
            th.IntegerType,
            description="Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000.",  # noqa: E501
        ),
        th.Property(
            "expiry_time_buffer",
            th.IntegerType,
            description=(
                "When authenticating as a GitHub App, this buffer controls how many "
                "minutes before expiry the GitHub app tokens will be refreshed. "
                "Defaults to 10 minutes."
            ),
        ),
        th.Property(
            "searches",
            th.ArrayType(
                th.ObjectType(
                    th.Property("name", th.StringType, required=True),
                    th.Property("query", th.StringType, required=True),
                )
            ),
            description=(
                "An array of search descriptor objects with the following properties:\n"
                '"name" - a human readable name for the search query.\n'
                '"query" - a github search string (generally the same as would come after ?q= in the URL)"'  # noqa: E501
            ),
        ),
        th.Property("organizations", th.ArrayType(th.StringType)),
        th.Property("repositories", th.ArrayType(th.StringType)),
        th.Property("user_usernames", th.ArrayType(th.StringType)),
        th.Property("user_ids", th.ArrayType(th.StringType)),
        th.Property(
            "start_date",
            th.DateTimeType,
            description="Start date for incremental sync.",
        ),
        th.Property("stream_maps", th.ObjectType()),
        th.Property("stream_map_config", th.ObjectType()),
        th.Property(
            "skip_parent_streams",
            th.BooleanType,
            description=(
                "Set to true to skip API calls for the parent "
                "streams (such as repositories) if it is not selected but children are"
            ),
        ),
        th.Property(
            "stream_options",
            th.ObjectType(
                th.Property(
                    "milestones",
                    th.ObjectType(
                        th.Property(
                            "state",
                            th.StringType,
                            description=(
                                "Configures which states are of interest. "
                                "Must be one of [open, closed, all], defaults to open."
                            ),
                            default="open",
                            allowed_values=["open", "closed", "all"],
                        ),
                        additional_properties=False,
                    ),
                    description="Options specific to the 'milestones' stream.",
                ),
                additional_properties=False,
            ),
            description="Options which change the behaviour of a specific stream.",
        ),
    ).to_dict()

    def discover_streams(self) -> list[Stream]:
        """Return a list of discovered streams for each query."""

        # If the config is empty, assume we are running --help or --capabilities.
        if (
            self.config
            and len(Streams.all_valid_queries().intersection(self.config)) != 1
        ):
            raise ValueError(
                "This tap requires one and only one of the following path options: "
                f"{Streams.all_valid_queries()}."
            )
        streams = []
        for stream_type in Streams:
            # Instantiate a stream group only when one of its query keys is configured
            # (or when no config is present at all, e.g. during discovery).
            if (not self.config) or len(
                stream_type.valid_queries.intersection(self.config)
            ) > 0:
                streams += [
                    StreamClass(tap=self) for StreamClass in stream_type.streams
                ]

        if not streams:
            raise ValueError("No valid streams found.")
        return streams


# CLI Execution:

cli = TapGitHub.cli
--------------------------------------------------------------------------------
/tap_github/tests/__init__.py:
--------------------------------------------------------------------------------
"""Test suite for tap-github."""

import requests_cache

# Setup caching for all api calls done through `requests` in order to limit
# rate limiting problems with github.
# Use the sqlite backend as it's the default option and seems to be best supported.
# To clear the cache, just delete the sqlite db file at api_calls_tests_cache.sqlite
# in the root of this repository
8 | # To clear the cache, just delete the sqlite db file at api_calls_tests_cache.sqlite 9 | # in the root of this repository 10 | requests_cache.install_cache( 11 | ".cache/api_calls_tests_cache", 12 | backend="sqlite", 13 | # make sure that API keys don't end up being cached 14 | # Also ignore user-agent so that various versions of request 15 | # can share the cache 16 | ignored_parameters=["Authorization", "User-Agent", "If-modified-since"], 17 | # tell requests_cache to check headers for the above parameter 18 | match_headers=True, 19 | # expire the cache after 24h (86400 seconds) 20 | expire_after=24 * 60 * 60, 21 | # make sure graphql calls get cached as well 22 | allowable_methods=["GET", "POST"], 23 | ) 24 | -------------------------------------------------------------------------------- /tap_github/tests/fixtures.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | import logging 5 | import os 6 | import sys 7 | from typing import TYPE_CHECKING 8 | 9 | import pytest 10 | 11 | from ..utils.filter_stdout import FilterStdOutput 12 | 13 | if TYPE_CHECKING: 14 | from singer_sdk.helpers.types import Context 15 | 16 | # Filter out singer output during tests 17 | sys.stdout = FilterStdOutput(sys.stdout, r'{"type": ') # type: ignore 18 | 19 | 20 | @pytest.fixture 21 | def search_config(): 22 | return { 23 | "metrics_log_level": "warning", 24 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 25 | "searches": [ 26 | { 27 | "name": "tap_something", 28 | "query": "tap-+language:Python", 29 | } 30 | ], 31 | } 32 | 33 | 34 | @pytest.fixture 35 | def repo_list_config(request): 36 | """ 37 | Get a default list of repos or pass your own by decorating your test with 38 | @pytest.mark.repo_list(['org1/repo1', 'org2/repo2']) 39 | """ 40 | marker = request.node.get_closest_marker("repo_list") 41 | if marker is None: 42 | repo_list = 
["MeltanoLabs/tap-github", "mapswipe/mapswipe"] 43 | else: 44 | repo_list = marker.args[0] 45 | 46 | return { 47 | "metrics_log_level": "warning", 48 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 49 | "repositories": repo_list, 50 | "rate_limit_buffer": 100, 51 | } 52 | 53 | 54 | @pytest.fixture 55 | def username_list_config(request): 56 | """ 57 | Get a default list of usernames or pass your own by decorating your test with 58 | @pytest.mark.username_list(['ericboucher', 'aaronsteers']) 59 | """ 60 | marker = request.node.get_closest_marker("username_list") 61 | username_list = ["ericboucher", "aaronsteers"] if marker is None else marker.args[0] 62 | 63 | return { 64 | "metrics_log_level": "warning", 65 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 66 | "user_usernames": username_list, 67 | "rate_limit_buffer": 100, 68 | } 69 | 70 | 71 | @pytest.fixture 72 | def user_id_list_config(request): 73 | """ 74 | Get a default list of usernames or pass your own by decorating your test with 75 | @pytest.mark.user_id_list(['ericboucher', 'aaronsteers']) 76 | """ 77 | marker = request.node.get_closest_marker("user_id_list") 78 | user_id_list = [1, 2] if marker is None else marker.args[0] 79 | 80 | return { 81 | "metrics_log_level": "warning", 82 | "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"), 83 | "user_ids": user_id_list, 84 | "rate_limit_buffer": 100, 85 | } 86 | 87 | 88 | @pytest.fixture 89 | def organization_list_config(request): 90 | """ 91 | Get a default list of organizations or pass your own by decorating your test with 92 | @pytest.mark.organization_list(['MeltanoLabs', 'oviohub']) 93 | """ 94 | marker = request.node.get_closest_marker("organization_list") 95 | 96 | organization_list = ["MeltanoLabs"] if marker is None else marker.args[0] 97 | 98 | return { 99 | "metrics_log_level": "warning", 100 | "start_date": 
def alternative_sync_chidren(
    self,
    child_context: Context,
    no_sync: bool = True,
) -> None:
    """Override for Stream._sync_children used in tests.

    Lets the collaborators stream sync with an org:write scoped token taken
    from the "ORG_LEVEL_TOKEN" environment variable; every other child stream
    follows the SDK's default selection logic.

    Args:
        self: the parent Stream instance (this function is patched in).
        child_context: partition context passed to each child sync.
        no_sync: when True (default), skip the collaborators stream entirely.
    """
    for child_stream in self.child_streams:
        # Use org:write access level credentials for collaborators stream
        if child_stream.name in ["collaborators"]:
            ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN")  # noqa: N806
            # TODO - Fix collaborators tests, likely by mocking API responses
            # directly. Currently we have to bypass them as they are failing
            # frequently.
            if not ORG_LEVEL_TOKEN or no_sync:
                logging.warning(
                    'No "ORG_LEVEL_TOKEN" found. Skipping collaborators stream sync.'
                )
                continue
            saved_github_token = os.environ.get("GITHUB_TOKEN")
            os.environ["GITHUB_TOKEN"] = ORG_LEVEL_TOKEN
            try:
                child_stream.sync(context=child_context)
            finally:
                # Bug fix: restore GITHUB_TOKEN exactly as it was, even if the
                # sync raises.  The previous code wrote "" when the variable
                # had been unset, leaving an empty token behind for later
                # tests; now the variable is removed instead.
                if saved_github_token is None:
                    os.environ.pop("GITHUB_TOKEN", None)
                else:
                    os.environ["GITHUB_TOKEN"] = saved_github_token
            continue

        # default behavior:
        if child_stream.selected or child_stream.has_selected_descendents:
            child_stream.sync(context=child_context)
datetime.now(tz=timezone.utc) 19 | 20 | 21 | class TestTokenManager: 22 | def test_default_rate_limits(self): 23 | token_manager = TokenManager("mytoken", rate_limit_buffer=700) 24 | 25 | assert token_manager.rate_limit == 5000 26 | assert token_manager.rate_limit_remaining == 5000 27 | assert token_manager.rate_limit_reset is None 28 | assert token_manager.rate_limit_used == 0 29 | assert token_manager.rate_limit_buffer == 700 30 | 31 | token_manager_2 = TokenManager("mytoken") 32 | assert token_manager_2.rate_limit_buffer == 1000 33 | 34 | def test_update_rate_limit(self): 35 | mock_response_headers = { 36 | "X-RateLimit-Limit": "5000", 37 | "X-RateLimit-Remaining": "4999", 38 | "X-RateLimit-Reset": "1372700873", 39 | "X-RateLimit-Used": "1", 40 | } 41 | 42 | token_manager = TokenManager("mytoken") 43 | token_manager.update_rate_limit(mock_response_headers) 44 | 45 | assert token_manager.rate_limit == 5000 46 | assert token_manager.rate_limit_remaining == 4999 47 | assert token_manager.rate_limit_reset == datetime( 48 | 2013, 49 | 7, 50 | 1, 51 | 17, 52 | 47, 53 | 53, 54 | tzinfo=timezone.utc, 55 | ) 56 | assert token_manager.rate_limit_used == 1 57 | 58 | def test_is_valid_token_successful(self): 59 | with patch("requests.get") as mock_get: 60 | mock_response = mock_get.return_value 61 | mock_response.raise_for_status.return_value = None 62 | 63 | token_manager = TokenManager("validtoken") 64 | 65 | assert token_manager.is_valid_token() 66 | mock_get.assert_called_once_with( 67 | url="https://api.github.com/rate_limit", 68 | headers={"Authorization": "token validtoken"}, 69 | ) 70 | 71 | def test_is_valid_token_failure(self): 72 | with patch("requests.get") as mock_get: 73 | # Setup for a failed request 74 | mock_response = mock_get.return_value 75 | mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError() 76 | mock_response.status_code = 401 77 | mock_response.content = b"Unauthorized Access" 78 | mock_response.reason = "Unauthorized" 79 | 
80 | token_manager = TokenManager("invalidtoken") 81 | token_manager.logger = MagicMock() 82 | 83 | assert not token_manager.is_valid_token() 84 | token_manager.logger.warning.assert_called_once() 85 | assert "401" in token_manager.logger.warning.call_args[0][0] 86 | 87 | def test_has_calls_remaining_succeeds_if_token_never_used(self): 88 | token_manager = TokenManager("mytoken") 89 | assert token_manager.has_calls_remaining() 90 | 91 | def test_has_calls_remaining_succeeds_if_lots_remaining(self): 92 | mock_response_headers = { 93 | "X-RateLimit-Limit": "5000", 94 | "X-RateLimit-Remaining": "4999", 95 | "X-RateLimit-Reset": "1372700873", 96 | "X-RateLimit-Used": "1", 97 | } 98 | 99 | token_manager = TokenManager("mytoken") 100 | token_manager.update_rate_limit(mock_response_headers) 101 | 102 | assert token_manager.has_calls_remaining() 103 | 104 | def test_has_calls_remaining_succeeds_if_reset_time_reached(self): 105 | mock_response_headers = { 106 | "X-RateLimit-Limit": "5000", 107 | "X-RateLimit-Remaining": "1", 108 | "X-RateLimit-Reset": "1372700873", 109 | "X-RateLimit-Used": "4999", 110 | } 111 | 112 | token_manager = TokenManager("mytoken", rate_limit_buffer=1000) 113 | token_manager.update_rate_limit(mock_response_headers) 114 | 115 | assert token_manager.has_calls_remaining() 116 | 117 | def test_has_calls_remaining_fails_if_few_calls_remaining_and_reset_time_not_reached( # noqa: E501 118 | self, 119 | ): 120 | mock_response_headers = { 121 | "X-RateLimit-Limit": "5000", 122 | "X-RateLimit-Remaining": "1", 123 | "X-RateLimit-Reset": str(int((_now() + timedelta(days=100)).timestamp())), 124 | "X-RateLimit-Used": "4999", 125 | } 126 | 127 | token_manager = TokenManager("mytoken", rate_limit_buffer=1000) 128 | token_manager.update_rate_limit(mock_response_headers) 129 | 130 | assert not token_manager.has_calls_remaining() 131 | 132 | 133 | class TestAppTokenManager: 134 | def test_initialization_with_3_part_env_key(self): 135 | with 
patch.object(AppTokenManager, "claim_token", return_value=None): 136 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 137 | assert token_manager.github_app_id == "12345" 138 | assert token_manager.github_private_key == "key\ncontent" 139 | assert token_manager.github_installation_id == "67890" 140 | 141 | def test_initialization_with_2_part_env_key(self): 142 | with patch.object(AppTokenManager, "claim_token", return_value=None): 143 | token_manager = AppTokenManager("12345;;key\\ncontent") 144 | assert token_manager.github_app_id == "12345" 145 | assert token_manager.github_private_key == "key\ncontent" 146 | assert token_manager.github_installation_id is None 147 | 148 | def test_initialization_with_malformed_env_key(self): 149 | expected_error_expression = re.escape( 150 | "GITHUB_APP_PRIVATE_KEY could not be parsed. The expected format is " 151 | '":app_id:;;-----BEGIN RSA PRIVATE KEY-----\\n_YOUR_P_KEY_\\n-----END RSA PRIVATE KEY-----"' # noqa: E501 152 | ) 153 | with pytest.raises(ValueError, match=expected_error_expression): 154 | AppTokenManager("12345key\\ncontent") 155 | 156 | def test_generate_token_with_invalid_credentials(self): 157 | with ( 158 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 159 | patch( 160 | "tap_github.authenticator.generate_app_access_token", 161 | return_value=("some_token", MagicMock()), 162 | ), 163 | ): 164 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 165 | assert token_manager.token is None 166 | assert token_manager.token_expires_at is None 167 | 168 | def test_successful_token_generation(self): 169 | token_time = MagicMock() 170 | with ( 171 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 172 | patch( 173 | "tap_github.authenticator.generate_app_access_token", 174 | return_value=("valid_token", token_time), 175 | ), 176 | ): 177 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 178 | token_manager.claim_token() 179 | assert 
token_manager.token == "valid_token" 180 | assert token_manager.token_expires_at == token_time 181 | 182 | def test_has_calls_remaining_regenerates_a_token_if_close_to_expiry(self): 183 | unexpired_time = _now() + timedelta(days=1) 184 | expired_time = _now() - timedelta(days=1) 185 | with ( 186 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 187 | patch( 188 | "tap_github.authenticator.generate_app_access_token", 189 | return_value=("valid_token", unexpired_time), 190 | ), 191 | ): 192 | mock_response_headers = { 193 | "X-RateLimit-Limit": "5000", 194 | "X-RateLimit-Remaining": "4999", 195 | "X-RateLimit-Reset": "1372700873", 196 | "X-RateLimit-Used": "1", 197 | } 198 | 199 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 200 | token_manager.logger = MagicMock() 201 | token_manager.token_expires_at = expired_time 202 | token_manager.update_rate_limit(mock_response_headers) 203 | 204 | assert token_manager.has_calls_remaining() 205 | # calling has_calls_remaining() will trigger the token generation function to be called again, # noqa: E501 206 | # so token_expires_at should have been reset back to the mocked unexpired_time # noqa: E501 207 | assert token_manager.token_expires_at == unexpired_time 208 | token_manager.logger.info.assert_called_once() 209 | assert ( 210 | "GitHub app token refresh succeeded." 
211 | in token_manager.logger.info.call_args[0][0] 212 | ) 213 | 214 | def test_has_calls_remaining_logs_warning_if_token_regeneration_fails(self): 215 | unexpired_time = _now() + timedelta(days=1) 216 | expired_time = _now() - timedelta(days=1) 217 | with ( 218 | patch.object( 219 | AppTokenManager, "is_valid_token", return_value=True 220 | ) as mock_is_valid, 221 | patch( 222 | "tap_github.authenticator.generate_app_access_token", 223 | return_value=("valid_token", unexpired_time), 224 | ), 225 | ): 226 | mock_response_headers = { 227 | "X-RateLimit-Limit": "5000", 228 | "X-RateLimit-Remaining": "4999", 229 | "X-RateLimit-Reset": "1372700873", 230 | "X-RateLimit-Used": "1", 231 | } 232 | 233 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 234 | token_manager.logger = MagicMock() 235 | token_manager.token_expires_at = expired_time 236 | token_manager.update_rate_limit(mock_response_headers) 237 | 238 | mock_is_valid.return_value = False 239 | assert not token_manager.has_calls_remaining() 240 | assert isinstance(token_manager.logger.warning, MagicMock) 241 | token_manager.logger.warning.assert_has_calls( 242 | [call("GitHub app token refresh failed.")], 243 | any_order=True, 244 | ) 245 | 246 | def test_has_calls_remaining_succeeds_if_token_new_and_never_used(self): 247 | unexpired_time = _now() + timedelta(days=1) 248 | with ( 249 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 250 | patch( 251 | "tap_github.authenticator.generate_app_access_token", 252 | return_value=("valid_token", unexpired_time), 253 | ), 254 | ): 255 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 256 | assert token_manager.has_calls_remaining() 257 | 258 | def test_has_calls_remaining_succeeds_if_time_and_requests_left(self): 259 | unexpired_time = _now() + timedelta(days=1) 260 | with ( 261 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 262 | patch( 263 | "tap_github.authenticator.generate_app_access_token", 264 
| return_value=("valid_token", unexpired_time), 265 | ), 266 | ): 267 | mock_response_headers = { 268 | "X-RateLimit-Limit": "5000", 269 | "X-RateLimit-Remaining": "4999", 270 | "X-RateLimit-Reset": "1372700873", 271 | "X-RateLimit-Used": "1", 272 | } 273 | 274 | token_manager = AppTokenManager("12345;;key\\ncontent;;67890") 275 | token_manager.update_rate_limit(mock_response_headers) 276 | 277 | assert token_manager.has_calls_remaining() 278 | 279 | def test_has_calls_remaining_succeeds_if_time_left_and_reset_time_reached(self): 280 | unexpired_time = _now() + timedelta(days=1) 281 | with ( 282 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 283 | patch( 284 | "tap_github.authenticator.generate_app_access_token", 285 | return_value=("valid_token", unexpired_time), 286 | ), 287 | ): 288 | mock_response_headers = { 289 | "X-RateLimit-Limit": "5000", 290 | "X-RateLimit-Remaining": "1", 291 | "X-RateLimit-Reset": "1372700873", 292 | "X-RateLimit-Used": "4999", 293 | } 294 | 295 | token_manager = AppTokenManager( 296 | "12345;;key\\ncontent;;67890", rate_limit_buffer=1000 297 | ) 298 | token_manager.update_rate_limit(mock_response_headers) 299 | 300 | assert token_manager.has_calls_remaining() 301 | 302 | def test_has_calls_remaining_fails_if_time_left_and_few_calls_remaining_and_reset_time_not_reached( # noqa: E501 303 | self, 304 | ): 305 | unexpired_time = _now() + timedelta(days=1) 306 | with ( 307 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 308 | patch( 309 | "tap_github.authenticator.generate_app_access_token", 310 | return_value=("valid_token", unexpired_time), 311 | ), 312 | ): 313 | mock_response_headers = { 314 | "X-RateLimit-Limit": "5000", 315 | "X-RateLimit-Remaining": "1", 316 | "X-RateLimit-Reset": str( 317 | int((_now() + timedelta(days=100)).timestamp()) 318 | ), 319 | "X-RateLimit-Used": "4999", 320 | } 321 | 322 | token_manager = AppTokenManager( 323 | "12345;;key\\ncontent;;67890", 
@pytest.fixture
def mock_stream():
    """A minimal RESTStream stand-in exposing only what the authenticator reads."""
    fake = MagicMock(spec=RESTStream)
    fake.logger = MagicMock()
    fake.tap_name = "tap_github"
    fake.config = {"rate_limit_buffer": 5}
    return fake
auth.prepare_tokens() 384 | 385 | assert len(token_managers) == 3 386 | assert sorted({tm.token for tm in token_managers}) == ["gt7", "gt8", "gt9"] 387 | 388 | def test_env_personal_tokens_only(self, mock_stream): 389 | with ( 390 | patch.object( 391 | GitHubTokenAuthenticator, 392 | "get_env", 393 | return_value={ 394 | "GITHUB_TOKEN1": "gt1", 395 | "GITHUB_TOKENxyz": "gt2", 396 | "OTHER_TOKEN": "blah", 397 | }, 398 | ), 399 | patch.object(PersonalTokenManager, "is_valid_token", return_value=True), 400 | ): 401 | auth = GitHubTokenAuthenticator(stream=mock_stream) 402 | token_managers = auth.prepare_tokens() 403 | 404 | assert len(token_managers) == 2 405 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt2"] 406 | 407 | def test_config_app_keys(self, mock_stream): 408 | def generate_token_mock(app_id, private_key, installation_id): 409 | return (f"installationtokenfor{app_id}", MagicMock()) 410 | 411 | with ( 412 | patch.object(TokenManager, "is_valid_token", return_value=True), 413 | patch( 414 | "tap_github.authenticator.generate_app_access_token", 415 | side_effect=generate_token_mock, 416 | ), 417 | ): 418 | stream = mock_stream 419 | stream.config.update( 420 | { 421 | "auth_token": "gt5", 422 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 423 | "auth_app_keys": [ 424 | "123;;gak1;;13", 425 | "456;;gak1;;46", 426 | "789;;gak1;;79", 427 | ], 428 | } 429 | ) 430 | auth = GitHubTokenAuthenticator(stream=stream) 431 | token_managers = auth.prepare_tokens() 432 | 433 | assert len(token_managers) == 7 434 | 435 | app_token_managers = { 436 | tm for tm in token_managers if isinstance(tm, AppTokenManager) 437 | } 438 | assert len(app_token_managers) == 3 439 | 440 | app_tokens = {tm.token for tm in app_token_managers} 441 | assert app_tokens == { 442 | "installationtokenfor123", 443 | "installationtokenfor456", 444 | "installationtokenfor789", 445 | } 446 | 447 | def test_env_app_key_only(self, mock_stream): 448 | with ( 449 | patch.object( 450 | 
GitHubTokenAuthenticator, 451 | "get_env", 452 | return_value={ 453 | "GITHUB_APP_PRIVATE_KEY": "123;;key", 454 | "OTHER_TOKEN": "blah", 455 | }, 456 | ), 457 | patch.object(AppTokenManager, "is_valid_token", return_value=True), 458 | patch( 459 | "tap_github.authenticator.generate_app_access_token", 460 | return_value=("installationtoken12345", MagicMock()), 461 | ), 462 | ): 463 | auth = GitHubTokenAuthenticator(stream=mock_stream) 464 | token_managers = auth.prepare_tokens() 465 | 466 | assert len(token_managers) == 1 467 | assert token_managers[0].token == "installationtoken12345" 468 | 469 | def test_all_token_types(self, mock_stream): 470 | # Expectations: 471 | # - the presence of additional_auth_tokens causes personal tokens in the environment to be ignored. # noqa: E501 472 | # - the other types all coexist 473 | with ( 474 | patch.object( 475 | GitHubTokenAuthenticator, 476 | "get_env", 477 | return_value={ 478 | "GITHUB_TOKEN1": "gt1", 479 | "GITHUB_TOKENxyz": "gt2", 480 | "GITHUB_APP_PRIVATE_KEY": "123;;key;;install_id", 481 | "OTHER_TOKEN": "blah", 482 | }, 483 | ), 484 | patch.object(TokenManager, "is_valid_token", return_value=True), 485 | patch( 486 | "tap_github.authenticator.generate_app_access_token", 487 | return_value=("installationtoken12345", MagicMock()), 488 | ), 489 | ): 490 | stream = mock_stream 491 | stream.config.update( 492 | { 493 | "auth_token": "gt5", 494 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 495 | } 496 | ) 497 | auth = GitHubTokenAuthenticator(stream=stream) 498 | token_managers = auth.prepare_tokens() 499 | 500 | assert len(token_managers) == 5 501 | assert sorted({tm.token for tm in token_managers}) == [ 502 | "gt5", 503 | "gt7", 504 | "gt8", 505 | "gt9", 506 | "installationtoken12345", 507 | ] 508 | 509 | def test_all_token_types_except_additional_auth_tokens(self, mock_stream): 510 | # Expectations: 511 | # - in the absence of additional_auth_tokens, all the other types can coexist 512 | with ( 513 | 
patch.object( 514 | GitHubTokenAuthenticator, 515 | "get_env", 516 | return_value={ 517 | "GITHUB_TOKEN1": "gt1", 518 | "GITHUB_TOKENxyz": "gt2", 519 | "GITHUB_APP_PRIVATE_KEY": "123;;key;;install_id", 520 | "OTHER_TOKEN": "blah", 521 | }, 522 | ), 523 | patch.object(TokenManager, "is_valid_token", return_value=True), 524 | patch( 525 | "tap_github.authenticator.generate_app_access_token", 526 | return_value=("installationtoken12345", MagicMock()), 527 | ), 528 | ): 529 | stream = mock_stream 530 | stream.config.update( 531 | { 532 | "auth_token": "gt5", 533 | } 534 | ) 535 | auth = GitHubTokenAuthenticator(stream=stream) 536 | token_managers = auth.prepare_tokens() 537 | 538 | assert len(token_managers) == 4 539 | assert sorted({tm.token for tm in token_managers}) == [ 540 | "gt1", 541 | "gt2", 542 | "gt5", 543 | "installationtoken12345", 544 | ] 545 | 546 | def test_auth_token_and_additional_auth_tokens_deduped(self, mock_stream): 547 | with ( 548 | patch.object( 549 | GitHubTokenAuthenticator, 550 | "get_env", 551 | return_value={ 552 | "GITHUB_TOKEN1": "gt1", 553 | "GITHUB_TOKENxyz": "gt2", 554 | "OTHER_TOKEN": "blah", 555 | }, 556 | ), 557 | patch.object(TokenManager, "is_valid_token", return_value=True), 558 | patch( 559 | "tap_github.authenticator.generate_app_access_token", 560 | return_value=("installationtoken12345", MagicMock()), 561 | ), 562 | ): 563 | stream = mock_stream 564 | stream.config.update( 565 | { 566 | "auth_token": "gt1", 567 | "additional_auth_tokens": ["gt1", "gt1", "gt8", "gt8", "gt9"], 568 | } 569 | ) 570 | auth = GitHubTokenAuthenticator(stream=stream) 571 | token_managers = auth.prepare_tokens() 572 | 573 | assert len(token_managers) == 3 574 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt8", "gt9"] 575 | 576 | def test_auth_token_and_env_tokens_deduped(self, mock_stream): 577 | with ( 578 | patch.object( 579 | GitHubTokenAuthenticator, 580 | "get_env", 581 | return_value={ 582 | "GITHUB_TOKEN1": "gt1", 583 | 
"GITHUB_TOKENa": "gt2", 584 | "GITHUB_TOKENxyz": "gt2", 585 | "OTHER_TOKEN": "blah", 586 | }, 587 | ), 588 | patch.object(TokenManager, "is_valid_token", return_value=True), 589 | patch( 590 | "tap_github.authenticator.generate_app_access_token", 591 | return_value=("installationtoken12345", MagicMock()), 592 | ), 593 | ): 594 | stream = mock_stream 595 | stream.config.update({"auth_token": "gt1"}) 596 | auth = GitHubTokenAuthenticator(stream=stream) 597 | token_managers = auth.prepare_tokens() 598 | 599 | assert len(token_managers) == 2 600 | assert sorted({tm.token for tm in token_managers}) == ["gt1", "gt2"] 601 | 602 | def test_handle_error_if_app_key_invalid(self, mock_stream): 603 | # Confirm expected behaviour if an error is raised while setting up the app token manager: # noqa: E501 604 | # - don"t crash 605 | # - print the error as a warning 606 | # - continue with any other obtained tokens 607 | with ( 608 | patch.object( 609 | GitHubTokenAuthenticator, 610 | "get_env", 611 | return_value={"GITHUB_APP_PRIVATE_KEY": "123garbagekey"}, 612 | ), 613 | patch("tap_github.authenticator.AppTokenManager") as mock_app_manager, 614 | ): 615 | mock_app_manager.side_effect = ValueError("Invalid key format") 616 | 617 | auth = GitHubTokenAuthenticator(stream=mock_stream) 618 | auth.prepare_tokens() 619 | 620 | mock_stream.logger.warning.assert_called_with( 621 | "An error was thrown while preparing an app token: Invalid key format" 622 | ) 623 | 624 | def test_exclude_generated_app_token_if_invalid(self, mock_stream): 625 | with ( 626 | patch.object( 627 | GitHubTokenAuthenticator, 628 | "get_env", 629 | return_value={"GITHUB_APP_PRIVATE_KEY": "123;;key"}, 630 | ), 631 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 632 | patch( 633 | "tap_github.authenticator.generate_app_access_token", 634 | return_value=("installationtoken12345", MagicMock()), 635 | ), 636 | ): 637 | auth = GitHubTokenAuthenticator(stream=mock_stream) 638 | token_managers = 
auth.prepare_tokens() 639 | 640 | assert len(token_managers) == 0 641 | 642 | def test_prepare_tokens_returns_empty_if_all_tokens_invalid(self, mock_stream): 643 | with ( 644 | patch.object( 645 | GitHubTokenAuthenticator, 646 | "get_env", 647 | return_value={ 648 | "GITHUB_TOKEN1": "gt1", 649 | "GITHUB_APP_PRIVATE_KEY": "123;;key", 650 | }, 651 | ), 652 | patch.object(PersonalTokenManager, "is_valid_token", return_value=False), 653 | patch.object(AppTokenManager, "is_valid_token", return_value=False), 654 | patch( 655 | "tap_github.authenticator.generate_app_access_token", 656 | return_value=("installationtoken12345", MagicMock()), 657 | ), 658 | ): 659 | stream = mock_stream 660 | stream.config.update( 661 | { 662 | "auth_token": "gt5", 663 | "additional_auth_tokens": ["gt7", "gt8", "gt9"], 664 | } 665 | ) 666 | auth = GitHubTokenAuthenticator(stream=stream) 667 | token_managers = auth.prepare_tokens() 668 | 669 | assert len(token_managers) == 0 670 | -------------------------------------------------------------------------------- /tap_github/tests/test_core.py: -------------------------------------------------------------------------------- 1 | """Tests standard tap features using the built-in SDK tests library.""" 2 | 3 | import logging 4 | import os 5 | from unittest import mock 6 | from unittest.mock import patch 7 | 8 | from singer_sdk.testing import get_standard_tap_tests 9 | 10 | from tap_github.tap import TapGitHub 11 | from tap_github.utils.filter_stdout import nostdout 12 | 13 | from .fixtures import ( # noqa: F401 14 | alternative_sync_chidren, 15 | organization_list_config, 16 | repo_list_config, 17 | search_config, 18 | username_list_config, 19 | ) 20 | 21 | 22 | # Run standard built-in tap tests from the SDK: 23 | def test_standard_tap_tests_for_search_mode(search_config): # noqa: F811 24 | """Run standard tap tests from the SDK.""" 25 | tests = get_standard_tap_tests(TapGitHub, config=search_config) 26 | with ( 27 | patch( 28 | 
"singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren 29 | ), 30 | nostdout(), 31 | ): 32 | for test in tests: 33 | test() 34 | 35 | 36 | def test_standard_tap_tests_for_repo_list_mode(repo_list_config): # noqa: F811 37 | """Run standard tap tests from the SDK.""" 38 | tests = get_standard_tap_tests(TapGitHub, config=repo_list_config) 39 | with ( 40 | patch( 41 | "singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren 42 | ), 43 | nostdout(), 44 | ): 45 | for test in tests: 46 | test() 47 | 48 | 49 | def test_standard_tap_tests_for_username_list_mode(username_list_config): # noqa: F811 50 | """Run standard tap tests from the SDK.""" 51 | tests = get_standard_tap_tests(TapGitHub, config=username_list_config) 52 | with nostdout(): 53 | for test in tests: 54 | test() 55 | 56 | 57 | # This token needs to have read:org access for the organization listed in fixtures.py 58 | # Default is "MeltanoLabs" 59 | ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN") 60 | 61 | 62 | @mock.patch.dict(os.environ, {"GITHUB_TOKEN": ORG_LEVEL_TOKEN or ""}) 63 | def test_standard_tap_tests_for_organization_list_mode(organization_list_config): # noqa: F811 64 | """Run standard tap tests from the SDK.""" 65 | if not ORG_LEVEL_TOKEN: 66 | logging.warning('No "ORG_LEVEL_TOKEN" found. 
# Input repository list containing deliberate errors, used to verify that the
# tap repairs what it can and drops what it cannot.
repo_list_2 = [
    "MeltanoLabs/tap-github",
    # mistype the repo name so we can check that the tap corrects it
    "MeltanoLabs/Tap-GitLab",
    # mistype the org
    "meltanolabs/target-athena",
    # a repo that does not exist at all
    # this one has no matching record below as it should be removed
    # from the list by the TempStream
    "brokenOrg/does_not_exist",
]
# the same list, but without typos, for validation
repo_list_2_corrected = [
    "MeltanoLabs/tap-github",
    "MeltanoLabs/tap-gitlab",
    "MeltanoLabs/target-athena",
]
# the github repo ids that match the repo names above
# in the same order
repo_list_2_ids = [
    365087920,
    416891176,
    361619143,
]
def run_tap_with_config(
    capsys, config_obj: dict, skip_stream: str | None, single_stream: str | None
) -> str:
    """
    Run the tap with the given config and return the captured stdout.

    ``skip_stream`` deselects one stream (meant to be a top-level one) from
    the catalog; ``single_stream`` instead deselects everything except
    ``repositories`` and the named stream.
    """
    discovery_tap = TapGitHub(config=config_obj)
    discovery_tap.run_discovery()
    catalog = Catalog.from_dict(discovery_tap.catalog_dict)
    # Reset and re-initialize with an input catalog
    if skip_stream is not None:
        cat_helpers.set_catalog_stream_selected(
            catalog=catalog,
            stream_name=skip_stream,
            selected=False,
        )
    elif single_stream is not None:
        cat_helpers.deselect_all_streams(catalog)
        cat_helpers.set_catalog_stream_selected(catalog, "repositories", selected=True)
        cat_helpers.set_catalog_stream_selected(
            catalog, stream_name=single_stream, selected=True
        )

    # drop stdout captured by earlier tests before syncing
    capsys.readouterr()
    with patch(
        "singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren
    ):
        sync_tap = TapGitHub(config=config_obj, catalog=catalog.to_dict())
        sync_tap.sync_all()
    return capsys.readouterr().out
111 | """ 112 | repo_list_config["skip_parent_streams"] = skip_parent_streams 113 | captured_out = run_tap_with_config( 114 | capsys, 115 | repo_list_config, 116 | "repositories" if skip_parent_streams else None, 117 | single_stream=None, 118 | ) 119 | # Verify we got the right number of records 120 | # one per repo in the list only if we sync the "repositories" stream, 0 if not 121 | assert captured_out.count('{"type":"RECORD","stream":"repositories"') == len( 122 | repo_list_2_ids * (not skip_parent_streams) 123 | ) 124 | # check that the tap corrects invalid case in config input 125 | assert '"repo": "Tap-GitLab"' not in captured_out 126 | assert '"org": "meltanolabs"' not in captured_out 127 | 128 | 129 | @pytest.mark.repo_list(["MeltanoLabs/tap-github"]) 130 | def test_last_state_message_is_valid(capsys, repo_list_config): # noqa: F811 131 | """ 132 | Validate that the last state message is not a temporary one and contains the 133 | expected values for a stream with overridden state partitioning keys. 134 | Run this on a single repo to avoid having to filter messages too much. 
135 | """ 136 | repo_list_config["skip_parent_streams"] = True 137 | captured_out = run_tap_with_config( 138 | capsys, repo_list_config, "repositories", single_stream=None 139 | ) 140 | # capture the messages we're interested in 141 | state_messages = re.findall(r'{"type":"STATE","value":.*}', captured_out) 142 | issue_comments_records = re.findall( 143 | r'{"type":"RECORD","stream":"issue_comments",.*}', captured_out 144 | ) 145 | assert state_messages is not None 146 | last_state_msg = state_messages[-1] 147 | 148 | # make sure we don't have a temporary state message at the very end 149 | assert "progress_markers" not in last_state_msg 150 | 151 | last_state = json.loads(last_state_msg) 152 | last_state_updated_at = isoparse( 153 | last_state["value"]["bookmarks"]["issue_comments"]["partitions"][0][ 154 | "replication_key_value" 155 | ] 156 | ) 157 | latest_updated_at = max( 158 | isoparse(json.loads(record)["record"]["updated_at"]) 159 | for record in issue_comments_records 160 | ) 161 | assert last_state_updated_at == latest_updated_at 162 | 163 | 164 | # case is incorrect on purpose, so we can check that the tap corrects it 165 | # and run the test twice, with and without syncing the `users` stream 166 | @pytest.mark.parametrize("skip_parent_streams", [False, True]) 167 | @pytest.mark.username_list(["EricBoucher", "aaRONsTeeRS"]) 168 | def test_get_a_user_in_user_usernames_mode( 169 | capsys, 170 | username_list_config, # noqa: F811 171 | skip_parent_streams, 172 | ): 173 | """ 174 | Discover the catalog, and request 2 repository records 175 | """ 176 | username_list_config["skip_parent_streams"] = skip_parent_streams 177 | captured_out = run_tap_with_config( 178 | capsys, 179 | username_list_config, 180 | "users" if skip_parent_streams else None, 181 | single_stream=None, 182 | ) 183 | # Verify we got the right number of records: 184 | # one per user in the list if we sync the root stream, 0 otherwise 185 | assert 
captured_out.count('{"type":"RECORD","stream":"users"') == len( 186 | username_list_config["user_usernames"] * (not skip_parent_streams) 187 | ) 188 | # these 2 are inequalities as number will keep changing :) 189 | assert captured_out.count('{"type":"RECORD","stream":"starred"') > 150 190 | assert captured_out.count('{"type":"RECORD","stream":"user_contributed_to"') > 25 191 | assert '{"username":"aaronsteers"' in captured_out 192 | assert '{"username":"aaRONsTeeRS"' not in captured_out 193 | assert '{"username":"EricBoucher"' not in captured_out 194 | 195 | 196 | @pytest.mark.repo_list(["torvalds/linux"]) 197 | def test_large_list_of_contributors(capsys, repo_list_config): # noqa: F811 198 | """ 199 | Check that the github error message for very large lists of contributors 200 | is handled properly (does not return any records). 201 | """ 202 | captured_out = run_tap_with_config( 203 | capsys, repo_list_config, skip_stream=None, single_stream="contributors" 204 | ) 205 | assert captured_out.count('{"type":"RECORD","stream":"contributors"') == 0 206 | 207 | 208 | def test_web_tag_parse_counter(): 209 | """ 210 | Check that the parser runs ok on various forms of counters. 211 | Used in extra_metrics stream. 212 | """ 213 | # regular int 214 | tag = BeautifulSoup( 215 | '57', 216 | "html.parser", 217 | ).span 218 | assert parse_counter(tag) == 57 219 | 220 | # 2k 221 | tag = BeautifulSoup( 222 | '2k', 223 | "html.parser", 224 | ).span 225 | assert parse_counter(tag) == 2028 226 | 227 | # 5k+. 
The real number is not available in the page, use this approx value 228 | tag = BeautifulSoup( 229 | '5k+', 230 | "html.parser", 231 | ).span 232 | assert parse_counter(tag) == 5_000 233 | -------------------------------------------------------------------------------- /tap_github/user_streams.py: -------------------------------------------------------------------------------- 1 | """User Stream types classes for tap-github.""" 2 | 3 | from __future__ import annotations 4 | 5 | import re 6 | from typing import TYPE_CHECKING, Any, ClassVar 7 | 8 | from singer_sdk import typing as th # JSON Schema typing helpers 9 | from singer_sdk.exceptions import FatalAPIError 10 | 11 | from tap_github.client import GitHubGraphqlStream, GitHubRestStream 12 | from tap_github.schema_objects import user_object 13 | 14 | if TYPE_CHECKING: 15 | from collections.abc import Iterable 16 | 17 | from singer_sdk.helpers.types import Context 18 | from singer_sdk.tap_base import Tap 19 | 20 | 21 | class UserStream(GitHubRestStream): 22 | """Defines 'User' stream.""" 23 | 24 | name = "users" 25 | replication_key = "updated_at" 26 | 27 | @property 28 | def path(self) -> str: # type: ignore 29 | """Return the API endpoint path.""" 30 | if "user_usernames" in self.config: 31 | return "/users/{username}" 32 | elif "user_ids" in self.config: 33 | return "/user/{id}" 34 | 35 | @property 36 | def partitions(self) -> list[dict] | None: 37 | """Return a list of partitions.""" 38 | if "user_usernames" in self.config: 39 | input_user_list = self.config["user_usernames"] 40 | 41 | augmented_user_list = [] 42 | # chunk requests to the graphql endpoint to avoid timeouts and other 43 | # obscure errors that the api doesn't say much about. The actual limit 44 | # seems closer to 1000, use half that to stay safe. 
45 |         chunk_size = 500
46 |         list_length = len(input_user_list)
47 |         self.logger.info(f"Filtering user list of {list_length} users")
48 |         for ndx in range(0, list_length, chunk_size):
49 |             augmented_user_list += self.get_user_ids(
50 |                 input_user_list[ndx : ndx + chunk_size]
51 |             )
52 |         self.logger.info(f"Running the tap on {len(augmented_user_list)} users")
53 |         return augmented_user_list
54 | 
55 |     elif "user_ids" in self.config:
56 |         return [{"id": user_id} for user_id in self.config["user_ids"]]
57 |     return None
58 | 
59 |     def get_child_context(self, record: dict, context: Context | None) -> dict:
60 |         return {
61 |             "username": record["login"],
62 |             "user_id": record["id"],
63 |         }
64 | 
65 |     def get_user_ids(self, user_list: list[str]) -> list[dict[str, str]]:
66 |         """Enrich the list of users with their numeric ID from GitHub.
67 | 
68 |         This helps maintain a stable id for context and bookmarks.
69 |         It uses the github graphql api to fetch the databaseId.
70 |         It also removes non-existent users and corrects casing to ensure
71 |         data is correct downstream.
72 |         """
73 | 
74 |         # use a temp handmade stream to reuse all the graphql setup of the tap
75 |         class TempStream(GitHubGraphqlStream):
76 |             name = "tempStream"
77 |             schema = th.PropertiesList(
78 |                 th.Property("id", th.StringType),
79 |                 th.Property("databaseId", th.IntegerType),
80 |             ).to_dict()
81 | 
82 |             def __init__(self, tap: Tap, user_list: list[str]) -> None:
83 |                 super().__init__(tap)
84 |                 self.user_list = user_list
85 | 
86 |             @property
87 |             def query(self) -> str:
88 |                 chunks = []
89 |                 for i, user in enumerate(self.user_list):
90 |                     # we use the `repositoryOwner` query which is the only one that
91 |                     # works on both users and orgs with graphql. REST is less picky
92 |                     # and the /user endpoint works for all types. 
93 | chunks.append( 94 | f'user{i}: repositoryOwner(login: "{user}") ' 95 | "{ login avatarUrl}" 96 | ) 97 | return "query {" + " ".join(chunks) + " rateLimit { cost } }" 98 | 99 | if len(user_list) < 1: 100 | return [] 101 | 102 | users_with_ids: list = [] 103 | temp_stream = TempStream(self._tap, list(user_list)) 104 | 105 | database_id_pattern: re.Pattern = re.compile( 106 | r"https://avatars.githubusercontent.com/u/(\d+)?.*" 107 | ) 108 | # replace manually provided org/repo values by the ones obtained 109 | # from github api. This guarantees that case is correct in the output data. 110 | # See https://github.com/MeltanoLabs/tap-github/issues/110 111 | # Also remove repos which do not exist to avoid crashing further down 112 | # the line. 113 | for record in temp_stream.request_records({}): 114 | for item in record: 115 | if item == "rateLimit": 116 | continue 117 | try: 118 | username = record[item]["login"] 119 | except TypeError: 120 | # one of the usernames returned `None`, which means it does 121 | # not exist, log some details, and move on to the next one 122 | invalid_username = user_list[int(item[4:])] 123 | self.logger.info( 124 | f"Username not found: {invalid_username} \t" 125 | "Removing it from list" 126 | ) 127 | continue 128 | # the databaseId (in graphql language) is not available on 129 | # repositoryOwner, so we parse the avatarUrl to get it :/ 130 | m = database_id_pattern.match(record[item]["avatarUrl"]) 131 | if m is not None: 132 | db_id = m.group(1) 133 | users_with_ids.append({"username": username, "user_id": db_id}) 134 | else: 135 | # If we get here, github's API is not returning what 136 | # we expected, so it's most likely a breaking change on 137 | # their end, and the tap's code needs updating 138 | raise FatalAPIError("Unexpected GitHub API error: Breaking change?") 139 | 140 | self.logger.info(f"Running the tap on {len(users_with_ids)} users") 141 | return users_with_ids 142 | 143 | def get_records(self, context: Context | None) 
-> Iterable[dict[str, Any]]: 144 | """ 145 | Override the parent method to allow skipping API calls 146 | if the stream is deselected and skip_parent_streams is True in config. 147 | This allows running the tap with fewer API calls and preserving 148 | quota when only syncing a child stream. Without this, 149 | the API call is sent but data is discarded. 150 | """ 151 | if ( 152 | not self.selected 153 | and "skip_parent_streams" in self.config 154 | and self.config["skip_parent_streams"] 155 | and context is not None 156 | ): 157 | # build a minimal mock record so that self._sync_records 158 | # can proceed with child streams 159 | # the id is fetched in `get_user_ids` above 160 | yield { 161 | "login": context["username"], 162 | "id": context["user_id"], 163 | } 164 | else: 165 | yield from super().get_records(context) 166 | 167 | schema = th.PropertiesList( 168 | th.Property("login", th.StringType), 169 | th.Property("id", th.IntegerType), 170 | th.Property("node_id", th.StringType), 171 | th.Property("avatar_url", th.StringType), 172 | th.Property("gravatar_id", th.StringType), 173 | th.Property("url", th.StringType), 174 | th.Property("html_url", th.StringType), 175 | th.Property("followers_url", th.StringType), 176 | th.Property("following_url", th.StringType), 177 | th.Property("gists_url", th.StringType), 178 | th.Property("starred_url", th.StringType), 179 | th.Property("subscriptions_url", th.StringType), 180 | th.Property("organizations_url", th.StringType), 181 | th.Property("repos_url", th.StringType), 182 | th.Property("events_url", th.StringType), 183 | th.Property("received_events_url", th.StringType), 184 | th.Property("type", th.StringType), 185 | th.Property("site_admin", th.BooleanType), 186 | th.Property("name", th.StringType), 187 | th.Property("company", th.StringType), 188 | th.Property("blog", th.StringType), 189 | th.Property("location", th.StringType), 190 | th.Property("email", th.StringType), 191 | th.Property("hireable", 
th.BooleanType), 192 | th.Property("bio", th.StringType), 193 | th.Property("twitter_username", th.StringType), 194 | th.Property("public_repos", th.IntegerType), 195 | th.Property("public_gists", th.IntegerType), 196 | th.Property("followers", th.IntegerType), 197 | th.Property("following", th.IntegerType), 198 | th.Property("updated_at", th.DateTimeType), 199 | th.Property("created_at", th.DateTimeType), 200 | ).to_dict() 201 | 202 | 203 | class StarredStream(GitHubRestStream): 204 | """Defines 'Stars' stream. Warning: this stream does NOT track star deletions.""" 205 | 206 | name = "starred" 207 | path = "/users/{username}/starred" 208 | # "repo_id" is the starred repo's id. 209 | primary_keys: ClassVar[list[str]] = ["repo_id", "username"] 210 | parent_stream_type = UserStream 211 | # TODO - change partitioning key to user_id? 212 | state_partitioning_keys: ClassVar[list[str]] = ["username"] 213 | replication_key = "starred_at" 214 | ignore_parent_replication_key = True 215 | # GitHub is missing the "since" parameter on this endpoint. 216 | use_fake_since_parameter = True 217 | 218 | @property 219 | def http_headers(self) -> dict: 220 | """Return the http headers needed. 221 | 222 | Overridden to use an endpoint which includes starred_at property: 223 | https://docs.github.com/en/rest/reference/activity#custom-media-types-for-starring 224 | """ 225 | headers = super().http_headers 226 | headers["Accept"] = "application/vnd.github.v3.star+json" 227 | return headers 228 | 229 | def post_process(self, row: dict, context: Context | None = None) -> dict: 230 | """ 231 | Add a repo_id top-level field to be used as state replication key. 
232 | """ 233 | row["repo_id"] = row["repo"]["id"] 234 | if context is not None: 235 | row["user_id"] = context["user_id"] 236 | return row 237 | 238 | schema = th.PropertiesList( 239 | # Parent Keys 240 | th.Property("username", th.StringType), 241 | th.Property("repo_id", th.IntegerType), 242 | th.Property("user_id", th.IntegerType), 243 | # Starred Repo Info 244 | th.Property("starred_at", th.DateTimeType), 245 | th.Property( 246 | "repo", 247 | th.ObjectType( 248 | th.Property("id", th.IntegerType), 249 | th.Property("node_id", th.StringType), 250 | th.Property("full_name", th.StringType), 251 | th.Property("description", th.StringType), 252 | th.Property("html_url", th.StringType), 253 | th.Property("owner", user_object), 254 | th.Property( 255 | "license", 256 | th.ObjectType( 257 | th.Property("key", th.StringType), 258 | th.Property("name", th.StringType), 259 | th.Property("url", th.StringType), 260 | th.Property("spdx_id", th.StringType), 261 | ), 262 | ), 263 | th.Property("updated_at", th.DateTimeType), 264 | th.Property("created_at", th.DateTimeType), 265 | th.Property("pushed_at", th.DateTimeType), 266 | th.Property("stargazers_count", th.IntegerType), 267 | th.Property("fork", th.BooleanType), 268 | th.Property( 269 | "topics", 270 | th.ArrayType(th.StringType), 271 | ), 272 | th.Property("visibility", th.StringType), 273 | th.Property("language", th.StringType), 274 | th.Property("forks", th.IntegerType), 275 | th.Property("watchers", th.IntegerType), 276 | th.Property("open_issues", th.IntegerType), 277 | ), 278 | ), 279 | ).to_dict() 280 | 281 | 282 | class UserContributedToStream(GitHubGraphqlStream): 283 | """Defines 'UserContributedToStream' stream.""" 284 | 285 | name = "user_contributed_to" 286 | query_jsonpath = "$.data.user.repositoriesContributedTo.nodes.[*]" 287 | primary_keys: ClassVar[list[str]] = ["username", "name_with_owner"] 288 | replication_key = None 289 | parent_stream_type = UserStream 290 | # TODO - add user_id to schema 291 | 
# TODO - change partitioning key to user_id? 292 | state_partitioning_keys: ClassVar[list[str]] = ["username"] 293 | ignore_parent_replication_key = True 294 | 295 | @property 296 | def query(self) -> str: 297 | """Return dynamic GraphQL query.""" 298 | # Graphql id is equivalent to REST node_id. To keep the tap consistent, 299 | # we rename "id" to "node_id". 300 | return """ 301 | query userContributedTo($username: String! $nextPageCursor_0: String) { 302 | user (login: $username) { 303 | repositoriesContributedTo (first: 100 after: $nextPageCursor_0 includeUserRepositories: true orderBy: {field: STARGAZERS, direction: DESC}) { 304 | pageInfo { 305 | hasNextPage_0: hasNextPage 306 | startCursor_0: startCursor 307 | endCursor_0: endCursor 308 | } 309 | nodes { 310 | node_id: id 311 | database_id: databaseId 312 | name_with_owner: nameWithOwner 313 | open_graph_image_url: openGraphImageUrl 314 | stargazer_count: stargazerCount 315 | pushed_at: pushedAt 316 | owner { 317 | node_id: id 318 | login 319 | } 320 | } 321 | } 322 | } 323 | rateLimit { 324 | cost 325 | } 326 | } 327 | """ # noqa: E501 328 | 329 | schema = th.PropertiesList( 330 | th.Property("node_id", th.StringType), 331 | th.Property("username", th.StringType), 332 | th.Property("name_with_owner", th.StringType), 333 | th.Property("open_graph_image_url", th.StringType), 334 | th.Property("stargazer_count", th.IntegerType), 335 | th.Property( 336 | "owner", 337 | th.ObjectType( 338 | th.Property("node_id", th.StringType), 339 | th.Property("login", th.StringType), 340 | ), 341 | ), 342 | ).to_dict() 343 | -------------------------------------------------------------------------------- /tap_github/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeltanoLabs/tap-github/0374f0768b1ffb2a3db0dd53591282830e553cf2/tap_github/utils/__init__.py -------------------------------------------------------------------------------- 
/tap_github/utils/filter_stdout.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import contextlib 4 | import io 5 | import re 6 | import sys 7 | from re import Pattern 8 | from typing import TYPE_CHECKING, TextIO 9 | 10 | if TYPE_CHECKING: 11 | from collections.abc import Generator 12 | 13 | 14 | class FilterStdOutput: 15 | """Filter out stdout/sterr given a regex pattern.""" 16 | 17 | def __init__(self, stream: TextIO, re_pattern: str | Pattern) -> None: 18 | self.stream = stream 19 | self.pattern = ( 20 | re.compile(re_pattern) if isinstance(re_pattern, str) else re_pattern 21 | ) 22 | self.triggered = False 23 | 24 | def __getattr__(self, attr_name: str) -> object: 25 | return getattr(self.stream, attr_name) 26 | 27 | def write(self, data: str) -> None: 28 | if data == "\n" and self.triggered: 29 | self.triggered = False 30 | else: 31 | if self.pattern.search(data) is None: 32 | self.stream.write(data) 33 | self.stream.flush() 34 | else: 35 | # caught bad pattern 36 | self.triggered = True 37 | 38 | def flush(self) -> None: 39 | self.stream.flush() 40 | 41 | 42 | @contextlib.contextmanager 43 | def nostdout() -> Generator[None, None, None]: 44 | save_stdout = sys.stdout 45 | sys.stdout = io.StringIO() 46 | yield 47 | sys.stdout = save_stdout 48 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build = true 3 | envlist = py3{9,10,11,12,13} 4 | 5 | [testenv] 6 | whitelist_externals = poetry 7 | commands = 8 | poetry install -v 9 | poetry run pytest 10 | --------------------------------------------------------------------------------