├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
    ├── CNAME
    ├── code_structure.md
    ├── features.md
    ├── gitxray_buzz.md
    ├── images
    │   ├── console_gitxray.png
    │   ├── html_report_gitxray.png
    │   └── logo_gitxray.png
    ├── index.md
    ├── installing.md
    ├── show_love.md
    └── vulnerable_workflows.md
├── mkdocs.yml
├── pyproject.toml
├── requirements.txt
└── src
    └── gitxray
        ├── gitxray.py
        ├── include
            ├── __init__.py
            ├── gh_api.py
            ├── gh_public_events.py
            ├── gh_reactions.py
            ├── gh_time.py
            ├── gx_arg_parser.py
            ├── gx_context.py
            ├── gx_definitions.py
            ├── gx_output.py
            ├── gx_ugly_openpgp_parser.py
            ├── gx_ugly_ssh_parser.py
            └── html_report
            │   ├── __init__.py
            │   ├── template_contributor.html
            │   ├── template_highlights.html
            │   ├── template_main.html
            │   ├── template_non_contributor.html
            │   ├── template_repository.html
            │   └── template_table.html
        └── xrays
            ├── __init__.py
            ├── association_xray.py
            ├── contributors_xray.py
            ├── repository_xray.py
            └── workflows_xray.py


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 | permissions:
 7 |   contents: write
 8 | jobs:
 9 |   docs:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v3
13 |       - uses: actions/setup-python@v4
14 |         with:
15 |           python-version: 3.x
16 |       - uses: actions/cache@v4
17 |         with:
18 |           key: ${{ github.ref }}
19 |           path: .cache
20 |       - run: pip install mkdocs-material
21 |       - run: mkdocs gh-deploy --force
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | ## Release v1.0.17.4 (April 27th, 2025)
  4 | * Removed external links to Bootstrap CSS, JavaScript, and the Gitxray logo; now all assets (styles, scripts, images) are embedded so HTML reports are fully self-contained.
  5 | * Merged a PR by satoridev01 which prevents Gitxray from printing stars, watchers, created, updated, contributors and anonymous if they were not found, as opposed to stating "0".
  6 | 
  7 | ## Release v1.0.17.3 (March 2nd, 2025)
  8 | * Added handling of Blocked repositories. The GitHub API presents a different error than for not-found repos, and includes a reason; which we now include in our reports. Thanks to @satoridev01 for reporting the behavior.
  9 | 
 10 | ## Release v1.0.17.2 (February 16th, 2025)
 11 | * Added a Highlighted findings section to the HTML report. Special thanks to Viktor and Darin from DevOps Paradox for their feedback.
 12 | * Added to every table in the HTML report the ability to collapse and expand.
 13 | * Merged the finding on similar repository names with the finding on the repository being the most starred.
 14 | * Improved the introduction section of the HTML report to guide users with sample use-cases for Gitxray.
 15 | 
 16 | ## Release v1.0.17.1 (January 30th, 2025)
 17 | * Fixed parsing of legacy workflows which include an empty path attribute.
 18 | * Fixed parsing of broken armored PGP keys which appear to be accepted by GitHub when being added to a user's profile.
 19 | 
 20 | ## Release v1.0.17 (January 26th, 2025)
 21 | * Added a new "--shush" parameter which turns "shushable" mode on, discarding any progress output from stdout.
 22 | * Added a new finding under the "personal" category which tells if the contributor has enabled "Available for hire" in their profile (docs describe it here: https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-user-account-settings/about-available-for-hire)
 23 | * Added a "WARNING" label/prefix on a couple of Workflow findings which deserve an extra highlight.
 24 | * Turned gh_api into a class named GitHubRESTAPI which stores a reference to gx_output.
 25 | * Added a new stdout method in gx_output to act as a proxy for print() calls, discarding "shushable" output.
 26 | 
 27 | ## Release v1.0.16.5 (January 18th, 2025)
 28 | * Fixed an error case (an unhandled exception) that showed up when scanning repositories with a very large list of contributors (e.g. torvalds/linux, or MicrosoftDocs/azure-docs), which leads to GitHub REST APIs responding in an undocumented manner, stating that: "The history or contributor list is too large to list contributors for this repository via the API".
 29 | 
 30 | ## Release v1.0.16.4 (October 30th, 2024)
 31 | * Fixed an error case that should be fixed in gh_api.py eventually: GitHub returning unexpected error responses when querying for certain releases while being unauthenticated. Thank you once again @fearcito for your input and testing.
 32 | 
 33 | ## Release v1.0.16.3 (October 28th, 2024)
 34 | * Only showing "updated at" for comments if the created_at and updated_at field values differ. This helps place focus on updated comments which could potentially reveal a contributor trying to hide a past comment. GitHub is kind to show an Edit history for said comments as a menu option next to the comment itself.
 35 | 
 36 | ## Release v1.0.16.2 (October 25th, 2024)
 37 | * Added validation against Null values for fields "author" and "uploader" in Releases and Assets. Special thanks to @fearcito for reporting the issue.
 38 | 
 39 | ## Release v1.0.16.1 (October 22nd, 2024)
 40 | * Fixed a typo in a call to r_log() which led to an uhandled exception when scanning repositories with self-hosted runners. Special thanks to @farnaboldi for reporting the issue.
 41 | 
 42 | ## Release v1.0.16 (October 18th, 2024)
 43 | * Added a brand new HTML output format/report by default, making results a lot easier to navigate! Custom search bar instead of relying on DataTables which can be super slow for large HTML files. We're now also groupping results by Category across all contributors and highlighting results which contain a WARNING keyword.
 44 | * Added certain association results to Contributor results, not all to prevent extra noise.
 45 | * Added the ability to specify a directory for output instead of a file, gitxray creating the filename for you.
 46 | * Removed the concept of 'Verbose' results, merging them with the non-verbose categories.
 47 | * Removed the need for repositories and organizations to start with https://github.com (Thanks to @mattaereal for pointing that out!)
 48 | 
 49 | ## Release v1.0.15 (September 20th, 2024)
 50 | 
 51 | * Added searching for similar repository names in GitHub, Warning if another repository with the same name and better reputation is found.
 52 | * Added commit time analysis, grouping commit hours per contributor and calculating the percentage of commits at each hour. This feature provides insights into contributors' activity patterns and helps identify potential anomalies.
 53 | * Added new Workflows X-Ray module which contains all Workflow-related logic. Moved in some of the logic that was under the Repository x-Ray.
 54 | * Added counts of Workflow Runs to identify when Workflow Runs were DELETED, which may have been the result of an attacker erasing their tracks, or legitimate cleanup.
 55 | * Added a series of basic Workflow security checks which might be an indicator of a vulnerable Workflow.
 56 | * Added to the Workflows X-Ray the ability to print, for each workflow, how many times it was executed by non-contributors as well as contributors.
 57 | * Added to the Workflows X-Ray the ability to parse and print any secret names used in a Workflow.
 58 | * Added a display of Progress % for time consuming queries and a time estimate in seconds-left prior to resuming execution.
 59 | * Added ability to SKIP heavy querying live by handling CTRL+C, which means we've also removed any caps or limits recently introduced.
 60 | * Fixed parsing of dict-formatted results coming from the REST API so that we keep the last key and not the second one.
 61 | * Fixed a few exceptions which arise by hitting CTRL+C and skipping or breaking API calls
 62 | 
 63 | ## Release v1.0.14 (September 1st, 2024)
 64 | 
 65 | * Added a new check on workflow runs for accounts which are NOT contributors, presenting a WARNING on screen. This could help identify hack attempts via Workflow runs.
 66 | * Added a new check on releases to identify accounts which create releases/upload assets and are NOT contributors, also WARNING on screen.
 67 | * Added pulling and analysis of Comments for Commits, Issues and Pull Requests.
 68 | * Added messages to point out when comments get updated (Edited) after a day of being created.
 69 | * Added parsing of reactions for comments in Commits, Issues and Pulls. We're printing the comment that had the most Positive, Neutral and Negative reactions in Commits, Issues and PRs.
 70 | * Added support capped to 5000 Workflow runs for analyzing past workflow runs in a repository. Runs can go very high in the, for example, 50k, which is why we cap.
 71 | * Added a limit of 5000 Artifacts inspection to prevent the analysis from being too expensive in really big repositories.
 72 | * Added support to get repository labels, pointing out specifically those which are custom.
 73 | * Added to the repository summary the printing of stargazers and watchers count even if 0, as it talks about reputation.
 74 | * Added code to fetch environment protection rules; but it is commented out because it is seldom used.
 75 | * Added to contributors_xray.py, a message to the user on how to use the filtering function in order to filter results for non-contributors.
 76 | * Added to gx_context.py, two (2) helper methods, isContributor and areContributors which iterate and check logins against the list of cached repo contributors.
 77 | * Added to the UNRELIABLE ACTIVITY message a clarification that the mismatch may be due to a rebased repository.
 78 | * Added count of Pull Requests to the output line showing the PR link for a contributor.
 79 | * Changed the way we refer to account results in gx_output.py - Instead of stating Contributors we're going to say accounts, as we may have non-contributor results.
 80 | * Moved multiple results that were under the "urls" category to the corresponding category instead (eg. commit urls to a commit category). Makes it easier to navigate visually.
 81 | * Fixed a visual typo (extra space) when printing 'starred' public events in verbose mode.
 82 | * Fixed querying of environments for exceptional repository-cases where the API returns a 404 not found in json format instead of an empty list of results.
 83 | * Fixed gh_api code for limiting results count in pagination when the API returns a dict with total_results followed by a list.
 84 | * Fixed identifying unreliable dates in commits mismatching account creation dates. Now only checking against 'author', and not checking against 'committer'.
 85 | 
 86 | ## Release v1.0.13 (August 19th, 2024)
 87 | 
 88 | * Added the ability to identify unreliable commits containing unreliable dates in a repository. This could be the case when, for example, a contributor's account creation date is newer than a contributor's commit date in the repository. In certain cases, those can be attempts at faking historic activity by malicious actors, or it could also mean that an account was deleted and the same handle re-registered by someone else (https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-your-personal-account/deleting-your-personal-account), among other possibilities. These warnings will show under the "commits" category. Gitxray will present a Warning stating that Unreliable historic activity was detected. 
 89 | * Moved the summary of signed vs. unsigned commits to a new "commits" category.
 90 | * Added support for Tags and Artifacts; a full list can be printed by turning on Verbose mode (-v). Tags are only returned stripped/lightweight by the GitHub API (at least when listing them all), sad - Nonetheless we've included code to support collecting data on taggers; should at some point GH begin returning full tag data.
 91 | * Added Stargazers count to the profiling output of a repository.
 92 | * Added "WARNING" labels to results which may require special attention (eg for fake profiles, updated release assets, ...) Makes it easy to use "-f warning" when running gitxra if you only want to focus on warnings.
 93 | * Added listing branches under repository results, specifically pointing out which ones are unprotected vs. protected branches.
 94 | * Replicating in Contributor-specific results a series of Releases-related messages that were only displayed under Repository results.
 95 | * Added collecting and printing available Environments (https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-deployments/managing-environments-for-deployment).
 96 | * Reduced temporary output lines when executing checks against a Repository.
 97 | * Added CHANGELOG.md file to track changes.
 98 | 
 99 | ## Release v1.0.12 (August 7th, 2024)
100 | 
101 | * Fixed parsing of PGP armored keys by adding support for a "Charset" field.
102 | 
103 | ## Release v1.0.11 (August 6th, 2024)
104 | 
105 | * First public release of Gitxray
106 | * More information available at: https://kulkansecurity.github.io/gitxray/
107 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include gitxray/include/html_report *.html
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Welcome to Gitxray 
  2 | Gitxray (short for Git X-Ray) is a multifaceted security tool designed for use on GitHub repositories. It can serve many purposes, including OSINT and Forensics. `gitxray` leverages public GitHub REST APIs to gather information that would otherwise be very time-consuming to obtain manually. Additionally, it seeks out information in unconventional places.
  3 | 
  4 | [![Build Workflows](https://github.com/kulkansecurity/gitxray/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/kulkansecurity/gitxray) [![Latest Version in PIP](https://img.shields.io/pypi/v/gitxray.svg)](https://pypi.org/project/gitxray) [![Python Versions](https://img.shields.io/pypi/pyversions/gitxray.svg)](https://pypi.org/project/gitxray) [![License](https://img.shields.io/pypi/l/gitxray.svg)](https://github.com/kulkansecurity/gitxray/blob/main/LICENSE) 
  5 | --- 
  6 | ![Gitxray Sample HTML Report](https://kulkansecurity.github.io/gitxray/images/html_report_gitxray.png?ts=42 "Gitxray Sample HTML Report")
  7 | <div style="clear: both;"></div>
  8 | 
  9 | # Use cases
 10 | Gitxray can be used to, for example:
 11 | 
 12 | - Find sensitive information in contributor profiles disclosed by accident within, for example, Armored PGP Keys, or Key Names.
 13 | 
 14 | - Identify threat actors in a Repository. You may spot co-owned or shared accounts, as well as inspect public events to spot fake Stargazers.
 15 | 
 16 | - Identify fake or infected Repositories. It can detect tampered commit dates as well as, for example, Release assets updated post-release.
 17 | 
 18 | - Forensics use-cases, such as filtering results by date in order to check what else happened on the day of an incident.
 19 | 
 20 | - And a lot more! Run a full X-Ray in to collect a ton of data.
 21 | 
 22 | ` gitxray -r https://github.com/some-org/some-repository`
 23 | 
 24 | - If you rather use text output, you may want to filter output with filters:
 25 | 
 26 | ` gitxray -r https://github.com/some-org/some-repository -f user_input -outformat text`
 27 | 
 28 | ` gitxray -r https://github.com/some-org/some-repository -f keys,association,starred -outformat text`
 29 | 
 30 | ` gitxray -r https://github.com/some-org/some-repository -f warning -outformat text`
 31 | 
 32 | ` gitxray -r https://github.com/some-org/some-repository -f 2024-09-01 -outformat text`
 33 | 
 34 | Please refer to the Documentation for additional use-cases and introductory information.
 35 | 
 36 | # Documentation
 37 | - [https://kulkansecurity.github.io/gitxray/](https://kulkansecurity.github.io/gitxray/)
 38 | - [https://www.gitxray.com/](https://www.gitxray.com/)
 39 | 
 40 | # Creating an Access Token to increase Rate Limits
 41 | 
 42 | Gitxray gracefully handles Rate Limits and can work out of the box without a GitHub API token, but you'll likely hit RateLimits pretty fast (A small to medium-size repository with 10+ Contributors could take hours to complete while it waits for RateLimits to reset) This is detailed by GitHub in their [documentation here](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users). 
 43 | 
 44 | [Creating a simple read-only token scoped to PUBLIC repositories](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token) will however help you increase those restrictions considerably. If you're not in a hurry or can leave gitxray running you'll be able to use its full capacity, as it pauses execution while waiting for the limits to lift.
 45 | 
 46 | You may then load the token safely by using (prevents the token from being displayed on screen or getting logged in your shell history):
 47 | 
 48 | ```bash
 49 | read -rs GH_ACCESS_TOKEN
 50 | export
 51 | ```
 52 | 
 53 | # Installing, Updating, and running Gitxray
 54 | 
 55 | gitxray was written with no use of external package dependencies other than the `requests` library.
 56 | 
 57 | ## PyPI (PIP) Way
 58 | 
 59 | `gitxray` is on PyPI and can be installed and updated with:
 60 | 
 61 | ```bash
 62 | pip install gitxray --upgrade
 63 | ```
 64 | 
 65 | Once installed, simply run gitxray from your command line by typing:
 66 | ```bash
 67 | gitxray -h
 68 | ```
 69 | 
 70 | ## Run your first full X-Ray
 71 | ```bash
 72 | gitxray -o https://github.com/kulkansecurity
 73 | ```
 74 | 
 75 | ![Gitxray Console](https://kulkansecurity.github.io/gitxray/images/console_gitxray.png "Gitxray Console") 
 76 | <div style="clear: both;"></div>
 77 | 
 78 | ## Installing from source
 79 | 
 80 | You may also run `gitxray` directly by cloning or downloading its GitHub repository and running.
 81 | 
 82 | ```bash
 83 | python3 -m pip install -r requirements.txt
 84 | cd src/
 85 | python3 -m gitxray.gitxray
 86 | ```
 87 | 
 88 | ## Command Line Arguments
 89 | 
 90 | ### Required Arguments
 91 | 
 92 | One of the following must be specified:
 93 | 
 94 | * `-r, --repository [URL]` - Specify a single repository to check. The URL may optionally begin with `https://github.com/`. **Example**: `--repository https://github.com/example/repo`
 95 | 
 96 | * `-rf, --repositories-file [FILEPATH]` - Provide a file path containing a list of repositories, each on a new line. The file must exist. **Example**: `--repositories-file ./list_of_repos.txt`
 97 | 
 98 | * `-o, --organization [URL]` - Specify an organization to check all repositories under that organization. The URL may optionally begin with `https://github.com/`. **Example**: `--organization https://github.com/exampleOrg`
 99 | 
100 | ### Optional Arguments
101 | 
102 | You'll find these optional but very handy in common gitxray usage.
103 | 
104 | - `-l, --list` - List contributors if a repository is specified or list repositories if an organization is specified. Useful for further focusing on specific entities. **Example**: `--list`
105 | 
106 | - `-c, --contributor [USERNAMES]` - A comma-separated list of GitHub usernames to focus on within the specified repository or organization. **Example**: `--contributor user1,user2`
107 | 
108 | - `-f, --filters [KEYWORDS]` - Comma-separated keywords to filter the results by, such as 'user_input', 'association', or 'mac'. **Example**: `--filters user_input,association,mac`
109 | 
110 | #### Output and Formats
111 | 
112 | - `-out, --outfile [FILEPATH]` - Specify the file path for the output log. Cannot be a directory. **Example**: `--outfile ./output.log`
113 | 
114 | - `-outformat, --output-format [FORMAT]` - Set the format for the log file. Supported formats are `html`, `text` and `json`. Default is `html`. **Example**: `--output-format json`
115 | 
116 | #### Shush output
117 | 
118 | - `--shush` - Makes Gitxray a bit more quiet by not displaying progress-related output. **Example**: `--shush`
119 | 
120 | #### Debug mode
121 | 
122 | - `--debug` - Enable Debug mode for a detailed and extensive output. **Example**: `--debug`
123 |   
124 | # Terms of Use
125 | 
126 | The user is solely responsible for ensuring that this tool is used in compliance with applicable laws and regulations, including obtaining proper authorization for repository scanning and the distribution of any results generated. Unauthorized use or sharing of results may violate local, national, or international laws.
127 | 
128 | 


--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | gitxray.com
2 | 


--------------------------------------------------------------------------------
/docs/code_structure.md:
--------------------------------------------------------------------------------
 1 | # Code structure
 2 | 
 3 | A few pointers on how the Gitxray codebase is structured:
 4 | 
 5 | * `gitxray.py` - The main script, creates a gx_context, gx_output and calls X-Ray modules.
 6 | 
 7 | The include directory has files split in two naming conventions:
 8 | 
 9 | * Suffix: `gh_` - Files that handle GitHub API responses or talk to the GitHub API
10 | * Suffix: `gx_` - Files with more Gitxray-specific logic
11 | 
12 | Some of the supporting files in the include directory:
13 | 
14 | * `gx_context.py` - Holds a context of data that is shared across different X-Ray modules and through out execution.
15 | * `gx_output.py` - Takes care of any console printing, as well as text and json output.
16 | 
17 | For parsing SSH and PGP signatures, we wrote our own code and placed it in:
18 | 
19 | * `gx_ugly_openpgp_parser.py` - Parses Armors and BLOBs based on RFC4880
20 | * `gx_ugly_ssh_parser.py` - Parses (if you can call that Parsing) SSH Armors and BLOBs
21 | 
22 | Finally, last but not least important, the X-Rays under the xrays directory:
23 | 
24 | * `contributors_xray.py` - Handles all Contributor-related data and decides what to log.
25 | * `repository_xray.py` - Handles all Repository-related data and decides what to log.
26 | * `workflows_xray.py` - Handles all Workflow-related analysis and decides what to log.
27 | * `associations_xray.py` - Analyzes and reports all associations carried from prior X-Ray modules.
28 | 


--------------------------------------------------------------------------------
/docs/features.md:
--------------------------------------------------------------------------------
  1 | # Features &#128171;
  2 | 
  3 | Because of the amount of data it analyzes, `gitxray` can be a bit overwhelming at first. Let's look at a few examples of potential awesome findings which can better explain why you're here and why `gitxray` is awesome &hearts;. 
  4 | 
  5 | ## A user-friendly HTML report &#x1F4CA;
  6 | 
  7 | `gitxray` now offers a default output format of `html`, creating a [Bootstrap](https://www.getbootstrap.com)-backed HTML report which offers easy navigation through Repository, Contributor and non-Contributor results.<div style="clear: both;"></div> ![Gitxray HTML Report](images/html_report_gitxray.png "HTML Report Gitxray")<div style="clear: both;"></div>
  8 | 
  9 | ## Unintended disclosures in Contributor profiles &#128064;
 10 | 
 11 | `gitxray` reports under a `user_input` category any user-supplied data that repository Contributors may have exposed via their GitHub accounts inadevertently. This is normally the case of PGP and SSH key name fields, which unfortunately are used by Users to record hostnames, computer models, password locations (e.g. in 1Password), or even the _password itself_ to a given key (which we all know might be the same password used elsewhere). To make things more interesting, `gitxray` also identifies any "excess" data found before, or after, PGP Armored keys published in a User's GitHub account. Wondering what that data normally is? Erroneous copy/pastes from the command line while exporting in ASCII/Armored format their keys. And what might that contain? Most of the times, a shell prompt revealing a local username, a hostname and a directory path. May I remind you all of this data is Public-facing.
 12 | 
 13 | You may focus specifically on these types of findings by filtering results with:
 14 | ```py
 15 | gitxray -o https://github.com/SampleOrg -f user_input
 16 | ```
 17 | or, for a specific repository:
 18 | ``` py
 19 | gitxray -r https://github.com/SampleOrg/SampleRepo -f user_input
 20 | ```
 21 | 
 22 | ## Spotting shared, co-owned or fake Contributors &#128123;
 23 | 
 24 | Open source projects are under attack, with malicious actors hiding in plain sight. GitHub has [released a Security alert](https://github.blog/security/vulnerability-research/security-alert-social-engineering-campaign-targets-technology-industry-employees/) describing one of potentially many modus-operandi adopted by Threat actors. So why not panic (a bit) and see if there's anything you could do to help protect the repositories you care about?
 25 | 
 26 | `gitxray` reports under the `association` category information that could help identify cases of suspicious activity or identity. By fingerprinting Keys added to a profile, as well as those historically used to sign a commit, and by looking at, for example, key and account creation times, it becomes possible to cross-reference the data and link _(hence 'association')_ the behavior to 2 or more accounts.
 27 | 
 28 | You can focus specifically on association findings by filtering for `association` with:
 29 | 
 30 | ``` 
 31 | gitxray -o https://github.com/SampleOrg -f association 
 32 | ```
 33 | or targetting a specific Repository with:
 34 | ```
 35 | gitxray -r https://github.com/SampleOrg/SampleRepo -f association
 36 | ```
 37 | 
 38 | ### Important 
 39 | 
 40 | Associations MUST NOT be directly and blindly used to report fake or shadow accounts. They are automatic observations from a piece of well-intended code. Do NOT treat association results as findings directly. We must protect open-source projects by first and foremost respecting open-source developers. Ensure that any actions taken are thoughtful and based on solid evidence, not just automated associations. 
 41 | 
 42 | ## Duplicate Repository Name Check &#128737;
 43 | 
 44 | `gitxray` will always check and detect duplicate repository names across different organizations and user profiles. This helps identify potential cloned or fake repositories. `gitxray` compares your target repository against others with the same name and the highest star count, ensuring you are engaging with the most popular (and likely legitimate) one.
 45 | 
 46 | ## Forensics: What happened on the day of an incident? &#128269;
 47 | 
 48 | Because `gitxray` collects data from multiple sources of activity including Commits, Comments, Workflow Runs, Issues, Deployments and more; and because `gitxray` shows activity in a standarized YYYY-MM-DD format, it becomes possible to use Filtering in order to place focus on specific activity happening at a specific point in time.
 49 | 
 50 | For example, by running `gitxray` with the following arguments, only results from that specific date are returned. You may place focus on a day, or even a month:
 51 | 
 52 | ``` 
 53 | gitxray -r https://github.com/SampleOrg/SampleRepo -f 2024-08 -outformat text
 54 | gitxray -r https://github.com/SampleOrg/SampleRepo -f 2024-09-01 -outformat text
 55 | ```
 56 | 
 57 | An outformat of type `text` can help in this specific use-case more than the defaul `html` report.
 58 | 
 59 | ## Analyzing Commit Hours to Identify Anomalies &#128347;
 60 | 
 61 | `gitxray` provides a summary of contributor commit hours, allowing you to profile contributor activity and detect potential anomalies. This feature helps you understand typical patterns and flag unusual behavior for further investigation.
 62 | 
 63 | ## Untrustworthy Repositories and Activity &#127988;
 64 | 
 65 | `gitxray` can be used to protect yourself, your team and your customers from fake Repositories and suspicious activity. For example, by analyzing commit dates and comparing them to the account creation timestamp of contributors, `gitxray` can flag inconsistencies that may indicate:
 66 | 
 67 | * Tampered Commits: Some repositories may be manipulated to appear older than they actually are, such as claiming that all commits date back six years when the repository was only created a week ago. This tactic is often used to increase perceived value. Unfortunately, altering commit dates is relatively easy in Git, and GitHub is no exception. More on this [here](https://www.reddit.com/r/git/comments/ympce5/is_it_possible_to_change_commit_date/).
 68 | 
 69 | * Reclaimed Usernames: Trusted contributors might have had their accounts deleted and then re-registered by malicious actors. GitHub allows a username to be re-released after 90 days, making this a possible attack vector. Learn more about GitHub’s account deletion policy [here](https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-personal-account-on-github/managing-your-personal-account/deleting-your-personal-account#deleting-your-personal-account).
 70 | 
 71 | It is possible to focus on unreliable historic activity by filtering for Warning keywords:
 72 | 
 73 | ``` 
 74 | gitxray -o https://github.com/SampleOrg -f warning
 75 | ```
 76 | 
 77 | ## X-Raying GitHub Workflows &#9881; 
 78 | 
 79 | The Workflows X-Ray module is executed upon identifying existing Workflows or Actions. `gitxray` provides in-depth analysis and monitoring of GitHub workflows, including:
 80 | 
 81 | * Execution Analysis: Provides detailed insights into workflow execution, showing how many times each workflow was executed by contributors and non-contributors. This allows for better understanding of usage patterns and detection of unauthorized or unexpected activity.
 82 | 
 83 | * Detection of deleted runs: This feature helps identify whether workflow runs have been deleted, potentially indicating an attempt to erase traces of malicious activity or legitimate maintenance actions.
 84 | 
 85 | * Security Checks for Workflows: Performs a series of basic security checks on workflows to identify uses of Secrets, Self-hosted Runners and Potentially dangerous triggers (eg pull_request_target) that could pose a security risk.
 86 | 
 87 | ### Disclaimer: Gitxray is NOT a complete Workflow Security Scanner
 88 | 
 89 | For more information on tools which are specialized in scanning Workflows refer to our [Vulnerable Workflows section](vulnerable_workflows.md).
 90 | 
 91 | 
 92 | 
 93 | ## The PR Rejection Awards &#127942; 
 94 | 
 95 | Another `gitxray` feature is the ability to list a TOP 3 of GitHub accounts that have tried to submit Pull Requests to the repository, which ended up closed AND NOT merged. In certain emotional scenarios, this could be paraphrased as _rejected PRs_. Kidding aside, in some cases, this could lead to identifying Contributors who have repeatedly failed at merging a very evidently unaligned piece of code to a branch (I know, it sounds unlikely for an account to try and merge backdoor.py repeatedly... but is it?).
 96 | 
 97 | `gitxray` will show a TOP 3 list specific to Repository Contributors and a separate list for accounts which are NOT Contributors to the Repository.
 98 | 
 99 | These findings, if any exist, are reported under a `contributors` category along with additional information related to other Repository Contributors. You can focus specifically on findings from the contributors category by filtering for `contributors` with:
100 | 
101 | ``` 
102 | gitxray -o https://github.com/SampleOrg -f contributors 
103 | ```
104 | or targetting a specific Repository with:
105 | ``` bash
106 | gitxray -r https://github.com/SampleOrg/SampleRepo -f contributors
107 | ```
108 | ## Fake Starring, Private repos gone Public and more &#128584; 
109 | 
110 | GitHub shares publicly [up to 90 days of past Events](https://docs.github.com/en/rest/activity/events?apiVersion=2022-11-28) for any User account, which include actions such as Repository creation, Watching, Committing, Pull Requesting, and more. `gitxray` includes these events under a `90d_events` category in the results included for each Contributor.
111 | 
112 | For example, Events you may come across that would be interesting include:
113 | 
114 | - A user having very recently _switched a repository from PRIVATE to PUBLIC_. GitHub requires Users to tick several boxes prior to moving an existing private repository to public, lowering the chances of an unintended leak; however, a recent public repository may not have had as much attention and auditing as you would think.
115 | 
116 | - A user [starring](https://docs.github.com/en/rest/activity/starring?apiVersion=2022-11-28) (originally known as _watching_) too many respositories too rapidly. This could be a tell of an account used for [Stargazing](https://research.checkpoint.com/2024/stargazers-ghost-network/). Or it could just be a normal human being in one of those days filled with anxiety.
117 | 
118 | - And more!
119 | 
120 | To find Contributors who recently switched from Private to Public a repository or who have Starred repositories, you may start with:
121 | ``` 
122 | gitxray -o https://github.com/SampleOrg -f starred,private
123 | ```
124 | 
125 | And you could then target a specific Repository Contributor to get more information:
126 | ``` 
127 | gitxray -r https://github.com/SampleOrg/SampleRepo -c some_user
128 | ```
129 | ## Lots of e-mail addresses &#128231; and Profiling data &#128100;
130 | 
131 | `gitxray` will report for each Contributor, an `emails` category listing all unique e-mail address collected from parsing:
132 | 
133 | * The User's profile
134 | * Each commit made by the User
135 | * PGP Primary Keys and PGP SubKeys
136 | 
137 | Additionally, Personal Information (e.g. social networks) voluntarily made Public by the User is extracted from multiple sources including PGP Key BLOBs and reported under a `personal` category.
138 | 
139 | Finally, the `profiling` category tends to display information related to the account itself (e.g. creation date, last updated, and more.)
140 | 
141 | You may focus specifically on `emails`, `personal`, and `profiling` fields with:
142 | ```py
143 | gitxray -o https://github.com/SampleOrg -f emails,personal,profiling
144 | ```
145 | or, for a specific repository, with: 
146 | ``` py
147 | gitxray -r https://github.com/SampleOrg/SampleRepo -f emails,personal,profiling
148 | ```
149 | 
150 | ## Looking out for malicious Releases and Assets &#128065; 
151 | 
152 | It is possible for Threat Actors to compromise credentials of a Repository Maintainer in order to deploy malware by silently updating released Assets (the typical package you would download when a Release includes downloadable Assets); which is why `gitxray` looks at all Repository Releases and informs of:
153 | 
154 | * Assets that were **updated** at least a day **AFTER** their release, which might lead to suggest they've been infected and/or tampered with. Or it could just be a maintainer fixing an asset without wanting to create a new release.
155 | 
156 | * Users who have historically created releases and/or uploaded assets, as well as the % vs. the total amount of releases or assets uploaded in the repository; which may allow you to flag potential suspicious activity. For example, you might notice an account which never created Releases before now uploading assets.
157 | 
158 | 
159 | All of this information is included by `gitxray` in a `releases` category, which means you can focus on those results (if any exist) with:
160 | 
161 | ``` bash
162 | gitxray -o https://github.com/SampleOrg -f releases
163 | ```
164 | 
165 | ## Anonymous contributors &#128065; 
166 | 
167 | As stated in [GitHub documentation](https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-contributors), only the first 500 author email addresses in a Repository will link to actual GitHub users or accounts. The rest will appear as "anonymous" contributors without associated GitHub information.
168 | 
169 | Additionally, when an author's email address in a commit is not associated with a GitHub account, the User will also be considered Anonymous.
170 | 
171 | Lucky for us, `gitxray` also includes within its output the entire list of Anonymous contributors received from GitHub. The list is first processed to combine all variations of Names used by the author for a same e-mail, which means the list can also be pretty useful when, for example, executing OSINT.
172 | 
173 | To filter for anonymous contributors, you may use:
174 | ``` bash
175 | gitxray -o https://github.com/SampleOrg -f anonymous
176 | ```
177 | 
178 | ## And so much more.
179 | 
180 | We've covered a large amount of use-cases for `gitxray`, yet we're nowhere finished. Start X-Raying today your favorite Organizations and Repositories and discover more ways of connecting dots. 
181 | 


--------------------------------------------------------------------------------
/docs/gitxray_buzz.md:
--------------------------------------------------------------------------------
 1 | # Gitxray Buzz
 2 | 
 3 | A section dedicated to Gitxray being mentioned in articles, blog posts and community discussions; or being included in OS distributions.
 4 | 
 5 | * Shout out to Fabian Affolter for <a href='https://search.nixos.org/packages?channel=unstable&show=gitxray&from=0&size=50&sort=relevance&type=packages&query=gitxray'>adding Gitxray to NixOS.</a>
 6 | * Gitxray is listed under the "<a href='https://owasp.org/www-community/Free_for_Open_Source_Application_Security_Tools'>Intel and Repository Analysis Tools</a>" category in owasp.org.
 7 | * tl;dr's issue 262 presents <a href='https://tldrsec.com/p/tldr-sec-262'>Gitxray under the AppSec category</a>; cheers to Clint Gibler for an amazing infosec newsletter.
 8 | * Satori CI has <a href='https://satori.ci/run/?playbook=satori://email/contrib.yml'>added Gitxray under multiple Playbooks</a>.
 9 | * <a href='https://www.devopsparadox.com/episodes/hands-on-with-gitxray-2025-01-24/'>Hands on With Gitxray</a>: an Episode of the <a href='https://devopsparadox.com/'>DevOps Paradox podcast</a>, by Darin Pope and Viktor Farcic.
10 | 


--------------------------------------------------------------------------------
/docs/images/console_gitxray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/docs/images/console_gitxray.png


--------------------------------------------------------------------------------
/docs/images/html_report_gitxray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/docs/images/html_report_gitxray.png


--------------------------------------------------------------------------------
/docs/images/logo_gitxray.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/docs/images/logo_gitxray.png


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Welcome to Gitxray 
 2 | Gitxray (short for Git X-Ray) is a multifaceted security tool designed for use on GitHub repositories. It can serve many purposes, including OSINT and Forensics. `gitxray` leverages public GitHub REST APIs to gather information that would otherwise be very time-consuming to obtain manually. Additionally, it seeks out information in unconventional places.
 3 | 
 4 | The Octocat getting X-Rayed  | [![Build Workflows](https://github.com/kulkansecurity/gitxray/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/kulkansecurity/gitxray) [![Latest Version in PIP](https://img.shields.io/pypi/v/gitxray.svg)](https://pypi.org/project/gitxray) [![Python Versions](https://img.shields.io/pypi/pyversions/gitxray.svg)](https://pypi.org/project/gitxray) [![License](https://img.shields.io/pypi/l/gitxray.svg)](https://github.com/kulkansecurity/gitxray/blob/main/LICENSE)
 5 | --- | ---
 6 | ![Gitxray Logo](https://kulkansecurity.github.io/gitxray/images/logo_gitxray.png "Gitxray Logo") | ![Gitxray Console](https://kulkansecurity.github.io/gitxray/images/console_gitxray.png "Gitxray Console")<br/> ![Gitxray HTML Report](images/html_report_gitxray.png "HTML Report Gitxray")<div style="clear: both;"></div>
 7 | 
 8 | # What is it for?
 9 | * Identifying threat actors in a Repository. [You may spot co-owned or shared accounts](/features/#spotting-shared-co-owned-or-fake-contributors), as well as inspect public events to [spot fake Stargazers](/features/#fake-stars-private-repos-gone-public-and-more).
10 | * Forensics use-cases, such as [finding out what else happened on the day of an Incident](/features/#forensics-what-happened-on-the-day-of-an-incident).
11 | * [Finding sensitive information in contributor profiles](/features/#unintended-disclosures-in-contributor-profiles) disclosed by accident within, for example, Armored PGP Keys, or Key Names.
12 | * Collecting [email addresses and analyzing contributor accounts](/features/#lots-of-e-mail-addresses-and-profiling-data) belonging to GitHub organizations and repositories.
13 | * Identifying fake or infected Repositories. It can [detect tampered commit dates](/features/#untrustworthy-repositories-and-activity) as well as, for example, [Release assets updated post-release](/features/#looking-out-for-malicious-releases-and-assets).
14 | * And so. much. more.
15 | 
16 | # Getting started
17 | * [Installing Gitxray](installing.md)
18 | * [Features](features.md) &#128171;
19 | 
20 | ## Rate Limits and the GitHub API
21 | 
22 | Gitxray gracefully handles Rate Limits and can work out of the box without a GitHub API key, _but_ you'll likely hit RateLimits pretty fast. This is detailed by GitHub in their [documentation here](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users). A simple read-only token created for PUBLIC repositories will however help you increase those restrictions considerably. If you're not in a hurry or can leave `gitxray` running you'll be able to use its full capacity, as it pauses execution while waiting for the limits to lift.
23 | 
24 | ## License
25 | 
26 | `gitxray` is provided under the terms and conditions of the [GNU Affero GPL v3 License](https://www.gnu.org/licenses/agpl-3.0.txt).
27 | 


--------------------------------------------------------------------------------
/docs/installing.md:
--------------------------------------------------------------------------------
 1 | # Installing and Updating Gitxray
 2 | 
 3 | gitxray was written with no use of external package dependencies other than the `requests` library.
 4 | 
 5 | ## PyPI (PIP) Way
 6 | 
 7 | `gitxray` is on PyPI and can be installed and updated with:
 8 | 
 9 | ```bash
10 | pip install gitxray --upgrade
11 | ```
12 | 
13 | Once installed, simply run gitxray from your command line by typing:
14 | ```bash
15 | gitxray -o https://github.com/SampleOrg
16 | ```
17 | or
18 | ```bash
19 | gitxray -r https://github.com/SampleOrg/SampleRepo
20 | ```
21 | 
22 | Including https://github.com/ in the Repository or Organization is optional.
23 | 
24 | ## Installing from source
25 | 
26 | You may also run `gitxray` directly by cloning or downloading its GitHub [repository](https://github.com/kulkansecurity/gitxray/) and running:
27 | 
28 | ```bash
29 | python3 -m pip install -r requirements.txt
30 | cd src/
31 | python3 -m gitxray.gitxray
32 | ```
33 | 
34 | ## Creating an Access Token to increase Rate Limits
35 | 
36 | Gitxray gracefully handles Rate Limits and can work out of the box without a GitHub API token, but you'll likely hit RateLimits pretty fast (A small to medium-size repository with 10+ Contributors could take hours to complete while it waits for RateLimits to reset) This is detailed by GitHub in their [documentation here](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users). 
37 | 
38 | [Creating a simple read-only token scoped to PUBLIC repositories](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token) will however help you increase those restrictions considerably. If you're not in a hurry or can leave gitxray running you'll be able to use its full capacity, as it pauses execution while waiting for the limits to lift.
39 | 
40 | You may then load the token safely by using (prevents the token from being displayed on screen or getting logged in your shell history):
41 | 
42 | ```bash
43 | read -rs GH_ACCESS_TOKEN
44 | export
45 | ```
46 | 
47 | ## Command Line Arguments
48 | 
49 | ### Required Arguments
50 | 
51 | One of the following must be specified:
52 | 
53 | * `-r, --repository [URL]` - Specify a single repository URL to check. The URL must begin with `https://`. **Example**: `--repository https://github.com/example/repo`
54 | 
55 | * `-rf, --repositories-file [FILEPATH]` - Provide a file path containing a list of repositories, each on a new line. The file must exist. **Example**: `--repositories-file ./list_of_repos.txt`
56 | 
57 | * `-o, --organization [URL]` - Specify an organization URL to check all repositories under that organization. The URL must begin with `https://`. **Example**: `--organization https://github.com/exampleOrg`
58 | 
59 | ### Optional Arguments
60 | 
61 | You'll find these optional but very handy in common gitxray usage.
62 | 
63 | - `-l, --list` - List contributors if a repository is specified or list repositories if an organization is specified. Useful for further focusing on specific entities. **Example**: `--list`
64 | 
65 | - `-c, --contributor [USERNAMES]` - A comma-separated list of GitHub usernames to focus on within the specified repository or organization. **Example**: `--contributor user1,user2`
66 | 
67 | - `-f, --filters [KEYWORDS]` - Comma-separated keywords to filter the results by, such as 'user_input', 'association', or 'mac'. **Example**: `--filters user_input,association,mac`
68 | 
69 | #### Output and Formats
70 | 
71 | - `-out, --outfile [FILEPATH]` - Specify the file path for the output log. Cannot be a directory. **Example**: `--outfile ./output.log`
72 | 
73 | - `-outformat, --output-format [FORMAT]` - Set the format for the log file. Supported formats are `html`, `text` and `json`. Default is `html`. **Example**: `--output-format json`
74 | 
75 | #### Debug
76 | 
77 | - `--debug` - Enable Debug mode for a detailed and extensive output. **Example**: `--debug`
78 | 
79 | 


--------------------------------------------------------------------------------
/docs/show_love.md:
--------------------------------------------------------------------------------
 1 | # Show your love &hearts;
 2 | 
 3 | If and only If `gitxray` has been helpful to you, your support in spreading the `gitxray` word is appreciated. 
 4 | 
 5 | * <a href="https://twitter.com/intent/tweet?text=I'm%20using%20Gitxray%20(https://github.com/kulkansecurity/gitxray/)%20developed%20by%20%40kulkansecurity%20and%20I'm%20loving%20it!" target='_blank'>Share on Twitter</a>
 6 | * <a href="https://www.linkedin.com/shareArticle?mini=true&title=Gitxray&summary=Alalala&url=https%3A//github.com/kulkansecurity/gitxray/" target='_blank'>Share on LinkedIn - Write about your Gitxray experience</a>
 7 | 
 8 | And if you know of anyone looking for a Penetration Testing vendor, send them our way:
 9 | 
10 | * <a href="https://www.kulkan.com/" target='_blank'>Kulkan Security</a>
11 | 


--------------------------------------------------------------------------------
/docs/vulnerable_workflows.md:
--------------------------------------------------------------------------------
 1 | # Vulnerable Workflows
 2 | 
 3 | You may have landed here because Git X-Ray suggested that you further inspect a specific Workflow in a repository that you were X-Raying, or because of some other reason.
 4 | 
 5 | Either way, here's a list of specialized software and documentation on how to proceed with analyzing the security of your workflow.
 6 | 
 7 | # Tools for Workflow analysis
 8 | 
 9 | * [https://github.com/synacktiv/octoscan](https://github.com/synacktiv/octoscan) - A SAST tool for GitHub Workflows.
10 | 
11 | * [https://github.com/AdnaneKhan/Gato-X](https://github.com/AdnaneKhan/Gato-X) - Excellent for identifying at scale vulnerable Workflows.
12 | 
13 | * [https://semgrep.dev/p/github-actions](https://semgrep.dev/p/github-actions) - Semgrep rules for GitHub Workflows.
14 | 
15 | * [https://github.com/tindersec/gh-workflow-auditor](https://github.com/tindersec/gh-workflow-auditor) - A script by Tinder Security which analyzes multiple aspects of a Workflow.
16 | 
17 | # Articles about GitHub Workflows and Security
18 | 
19 | ## Official by GitHub
20 | 
21 | * [https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/](https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/)
22 | * [https://securitylab.github.com/resources/github-actions-untrusted-input/](https://securitylab.github.com/resources/github-actions-untrusted-input/)
23 | * [https://securitylab.github.com/resources/github-actions-building-blocks/](https://securitylab.github.com/resources/github-actions-building-blocks/)
24 | * [https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions](https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions)
25 | 
26 | ## By independent researchers and organizations
27 | 
28 | * [https://medium.com/tinder/exploiting-github-actions-on-open-source-projects-5d93936d189f](https://medium.com/tinder/exploiting-github-actions-on-open-source-projects-5d93936d189f)
29 | 
30 | * [OSSF: Mitigating Attack Vectors in GitHub Workflows](https://openssf.org/blog/2024/08/12/mitigating-attack-vectors-in-github-workflows/)
31 | 
32 | # Videos 
33 | 
34 | * [https://www.youtube.com/watch?v=Ers-LcA7Nmc](https://www.youtube.com/watch?v=Ers-LcA7Nmc) - A great video and slides by Rob Bos on GitHub Actions with security in mind
35 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: Gitxray
 2 | site_url: https://www.gitxray.com/
 3 | repo_url: https://github.com/kulkansecurity/gitxray
 4 | theme:
 5 |   name: readthedocs
 6 |   language: en
 7 |   features:
 8 |     - navigation.expand
 9 |     - content.code.copy
10 | 
11 | nav:
12 |   - 'installing.md'
13 |   - 'features.md'
14 |   - 'vulnerable_workflows.md'
15 |   - 'code_structure.md'
16 |   - 'gitxray_buzz.md'
17 | 
18 | markdown_extensions:
19 |   - toc:
20 |       permalink: 
21 |   - attr_list
22 |   - md_in_html
23 | 
24 | copyright: |
25 |   Made with &hearts; by <a href="https://www.kulkan.com" target="_blank">Kulkan Security</a> - your favorite <a href="https://www.kulkan.com" target="_blank">Penetration Testing Partner</a>.
26 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "gitxray"
 7 | version = "1.0.17.4"
 8 | authors = [
 9 |   { name="Lucas Lavarello", email="llavarello@kulkan.com" },
10 | ]
11 | description = "A multifaceted security tool which leverages Public GitHub REST APIs for OSINT, Forensics, Pentesting and more."
12 | readme = "README.md"
13 | requires-python = ">=3.8"
14 | dependencies = [
15 |     "requests>=2.32.3",
16 | ]
17 | classifiers = [
18 |     "Programming Language :: Python :: 3",
19 |     "License :: OSI Approved :: GNU Affero General Public License v3",
20 |     "Operating System :: OS Independent",
21 |     "Topic :: Security",
22 |     "Topic :: Utilities",
23 | ]
24 | 
25 | [project.urls]
26 | Homepage = "https://github.com/kulkansecurity/gitxray"
27 | Issues = "https://github.com/kulkansecurity/gitxray/issues"
28 | 
29 | [project.scripts]
30 | gitxray = "gitxray.gitxray:gitxray_cli"
31 | 
32 | [tool.setuptools.package-data]
33 | "gitxray" = ["include/html_report/*.html"]
34 | 
35 | [tool.setuptools]
36 | license-files = []
37 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.32.3
2 | 


--------------------------------------------------------------------------------
/src/gitxray/gitxray.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import os, sys, datetime
  3 | from gitxray.include import gh_api as gh_api_class, gx_output as gx_output_class, gx_context as gx_context_class, gx_definitions
  4 | from gitxray.xrays import repository_xray
  5 | from gitxray.xrays import contributors_xray
  6 | from gitxray.xrays import association_xray
  7 | from gitxray.xrays import workflows_xray
  8 | 
  9 | def gitxray_cli():
 10 |     print("""
 11 |            ███   █████                                              
 12 |           ░░░   ░░███                                               
 13 |   ███████ ████  ███████   █████ █████ ████████   ██████   █████ ████
 14 |  ███░░███░░███ ░░░███░   ░░███ ░░███ ░░███░░███ ░░░░░███ ░░███ ░███ 
 15 | ░███ ░███ ░███   ░███     ░░░█████░   ░███ ░░░   ███████  ░███ ░███ 
 16 | ░███ ░███ ░███   ░███ ███  ███░░░███  ░███      ███░░███  ░███ ░███ 
 17 | ░░███████ █████  ░░█████  █████ █████ █████    ░░████████ ░░███████ 
 18 |  ░░░░░███░░░░░    ░░░░░  ░░░░░ ░░░░░ ░░░░░      ░░░░░░░░   ░░░░░███ 
 19 |  ███ ░███                                                  ███ ░███ 
 20 | ░░██████                                                  ░░██████  
 21 |  ░░░░░░                                                    ░░░░░░   
 22 | gitxray: X-Ray and analyze GitHub Repositories and their Contributors. Trust no one!
 23 | v1.0.17.4 - Developed by Kulkan Security [www.kulkan.com] - Penetration testing by creative minds.
 24 | """+"#"*gx_definitions.SCREEN_SEPARATOR_LENGTH)
 25 | 
 26 |     # Let's initialize a Gitxray context, which parses arguments and more.
 27 |     gx_context = gx_context_class.Context()
 28 | 
 29 |     # Let's initialize our Output object that handles stdout and file writing in text or json
 30 |     gx_output = gx_output_class.Output(gx_context)
 31 | 
 32 |     # And GitHub's REST API, sharing a ref to the Output object
 33 |     gh_api = gh_api_class.GitHubRESTAPI(gx_output)
 34 | 
 35 |     # Let's warn the user that unauth RateLimits are pretty low
 36 |     if not gx_context.usingToken():
 37 |         gx_output.warn(f"{gx_definitions.ENV_GITHUB_TOKEN} environment variable not set, using GitHub RateLimits unauthenticated.")
 38 |         gx_output.warn("Unauthenticated requests to the GitHub API will enforce a very low and strict RateLimit.")
 39 |         gx_output.warn("Without setting a GitHub token you may only be able to scan small repositories uninterruptedly.")
 40 |     else:
 41 |         gx_output.notify(f"GitHub Token loaded from {gx_definitions.ENV_GITHUB_TOKEN} env variable.")
 42 | 
 43 |     gx_output.notify(f"Output format set to [{gx_context.getOutputFormat().upper()}] - You may change it with -outformat.")
 44 | 
 45 |     if gx_context.getOutputFile():
 46 |         gx_output.notify(f"Output file set to: {str(gx_context.getOutputFile())} - You may change it with -out.")
 47 |         if gx_context.getOrganizationTarget():
 48 |             # Let's warn the user that in Organization mode, the output file will contain a repository name preffix
 49 |             gx_output.warn("The Output file name when targetting an Organization will include an Organization and Repository prefix.")
 50 | 
 51 |     if gx_context.getOutputFilters():
 52 |         gx_output.notify(f"You have ENABLED filters - You will only see results containing the following keywords: {str(gx_context.getOutputFilters())}")
 53 | 
 54 |     if gx_context.getOrganizationTarget():
 55 |         org_repos = gh_api.fetch_repositories_for_org(gx_context.getOrganizationTarget())
 56 |         gx_output.stdout("#"*gx_definitions.SCREEN_SEPARATOR_LENGTH)
 57 |         if isinstance(org_repos, list) and len(org_repos) > 0: 
 58 |             gx_output.notify(f"YOU HAVE EXPANDED THE SCOPE TO AN ORGANIZATION: A list of {len(org_repos)} repositories have been discovered. Sit tight.")
 59 |             if gx_context.listAndQuit():
 60 |                 gx_output.notify(f"LISTING REPOSITORIES FOR THE ORGANIZATION AND EXITING..", False)
 61 |                 gx_output.stdout(", ".join([r.get('full_name') for r in org_repos]), False)
 62 |                 sys.exit()
 63 |             gx_context.setRepositoryTargets([r.get('html_url') for r in org_repos])
 64 |         else: 
 65 |             gx_output.warn("Unable to pull repositories for the organization URL that was provided. Is it a valid Organization URL?")
 66 |             if gx_context.debugEnabled():
 67 |                 gx_output.stdout(org_repos, shushable=False)
 68 |             sys.exit()
 69 | 
 70 |     try:
 71 |         for repo in gx_context.getRepositoryTargets():
 72 |             r_started_at = datetime.datetime.now()
 73 | 
 74 |             repository = gh_api.fetch_repository(repo)
 75 |             if "full_name" not in repository.keys():
 76 |                 if "block" in repository.keys(): repository["full_name"] = "/".join(repo.rstrip("/").split("/")[-2:])
 77 |                 else: 
 78 |                     print("Unable to pull data for the repository that was provided. Is it a valid repo URL?")
 79 |                     sys.exit()
 80 | 
 81 |             gx_output.r_log(f"X-Ray on repository started at: {r_started_at}", repository=repository.get('full_name'), rtype="metrics")
 82 |             gx_output.stdout("#"*gx_definitions.SCREEN_SEPARATOR_LENGTH)
 83 |             gx_output.stdout("Now verifying repository: {}".format(repository.get('full_name')))
 84 |   
 85 |             if "block" in repository.keys(): gx_output.r_log(f"WARNING: The repository was DISABLED and BLOCKED by GitHub. Reason: {repository['block']['reason']}", rtype="profiling")
 86 | 
 87 |             # Let's keep track of the repository that we're X-Raying
 88 |             gx_context.setRepository(repository)
 89 | 
 90 |             # if an Organization is in target, add a repository prefix to the output filename
 91 |             if gx_context.getOrganizationTarget() and gx_context.getOutputFile(): gx_context.setOutputFilePrefix(repository.get("full_name"))
 92 | 
 93 |             # Now call our xray modules! Specifically by name, until we make this more plug and play
 94 |             # The standard is that a return value of False leads to skipping additional modules
 95 |             if not contributors_xray.run(gx_context, gx_output, gh_api): continue
 96 |             if not repository_xray.run(gx_context, gx_output, gh_api): continue
 97 |             if not workflows_xray.run(gx_context, gx_output, gh_api): continue
 98 | 
 99 |             # Now that we're done, let's cross reference everything in the repository.
100 |             association_xray.run(gx_context, gx_output, gh_api)
101 | 
102 |             r_ended_at = datetime.datetime.now()
103 |             gx_output.r_log(f"X-Ray on repository ended at: {r_ended_at} - {((r_ended_at-r_started_at).seconds/60):.2f} minutes elapsed", rtype="metrics")
104 |             gx_output.doOutput()
105 | 
106 |             gx_output.stdout(f"\rRepository has been analyzed.." + " "*40)
107 | 
108 |             # We're resetting our context on every new repo; eventually we'll maintain a context per Org.
109 |             gx_context.reset() 
110 | 
111 |     except KeyboardInterrupt:
112 |         gx_output.warn("\r\nMain program flow interrupted - Printing all results obtained this far.")
113 |         gx_output.doOutput()
114 | 
115 | if __name__ == "__main__":
116 |     gitxray_cli()
117 | 


--------------------------------------------------------------------------------
/src/gitxray/include/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/src/gitxray/include/__init__.py


--------------------------------------------------------------------------------
/src/gitxray/include/gh_api.py:
--------------------------------------------------------------------------------
  1 | import os, requests, base64, re, time, urllib
  2 | 
  3 | class GitHubRESTAPI:
  4 |     def __init__(self, gx_output):
  5 |         self.gx_output = gx_output
  6 |         # GitHub API URL
  7 |         self.GITHUB_API_BASE_URL = "https://api.github.com"
  8 |         # Get an optional token to get better rate limits
  9 |         self.GITHUB_TOKEN = os.environ.get("GH_ACCESS_TOKEN", None)
 10 | 
 11 |     def make_request(self, url, headers, params):
 12 |         response = requests.get(url, headers=headers, params=params)
 13 |         if response.status_code == 401 or (response.status_code == 403 and "not accessible by" in response.text):
 14 |             self.gx_output.stdout(response.text)
 15 |             raise Exception(f"\r\n\033[33mUnauthorized: Check your GitHub Access Token for permissions.\r\nIf testing against a private repository, you will need read-only: Contents, Custom Properties, Deployments, Actions, Issues and Pull Requests.\033[0m")
 16 |     
 17 |         data = response.json() if len(response.content) > 0 else []
 18 |         rate_limit_remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
 19 |         rate_limit_reset = int(response.headers.get('X-RateLimit-Reset', time.time() + 1))
 20 |         links = response.headers.get('Link', '')
 21 |         return data, links, rate_limit_remaining, rate_limit_reset
 22 | 
 23 |     def get_total_pages_from_link_header(self, links):
 24 |         if not links:
 25 |             return None
 26 |     
 27 |         # Parse the Link header to find the "last" page
 28 |         for link in links.split(','):
 29 |             if 'rel="last"' in link:
 30 |                 last_page_url = link.split(';')[0].strip('<> ')
 31 |                 # Extract the page number from the URL
 32 |                 if 'page=' in last_page_url:
 33 |                     try:
 34 |                         return int(last_page_url.split('page=')[-1].split('&')[0])
 35 |                     except ValueError:
 36 |                         pass
 37 |         return None
 38 | 
 39 |     def get_last_two_path_segments(self, url):
 40 |         parsed_url = urllib.parse.urlparse(url)
 41 |         path = parsed_url.path  
 42 |         parts = [part for part in path.split("/") if part]  
 43 |         if len(parts) >= 2:
 44 |             return f"{parts[-2]}/{parts[-1]}"
 45 |         elif len(parts) == 1:
 46 |             return parts[-1]
 47 |         else:
 48 |             return "" 
 49 | 
 50 |     def github_request_json(self, url, params=None, limit_results=None):
 51 |         # https://docs.github.com/en/rest/about-the-rest-api/api-versions?apiVersion=2022-11-28
 52 |         headers = {"X-GitHub-Api-Version":"2022-11-28"}
 53 |         if self.GITHUB_TOKEN:
 54 |             headers["Authorization"] = f"token {self.GITHUB_TOKEN}"
 55 | 
 56 |         if params is None:
 57 |             params = {}
 58 |         params['per_page'] = 100
 59 | 
 60 |         all_results = None
 61 |         next_url = url
 62 |         remaining = -1
 63 |         pages_fetched = 0
 64 |         total_pages = None
 65 |         start_time = time.time()
 66 | 
 67 |         while next_url:
 68 | 
 69 |             try:
 70 | 
 71 |                 try:
 72 |                     data, links, remaining, reset = self.make_request(next_url, headers, params)
 73 |                 except Exception as ex:
 74 |                     self.gx_output.stdout(ex, shushable=False)
 75 |                     self.gx_output.stdout(f"Failed to talk to the GitHub API when fetching URL: {next_url} - Quitting.", shushable=False)
 76 |                     exit(-1)
 77 | 
 78 |                 if remaining == 0:
 79 |                     # Calculate how long to sleep, then sleep
 80 |                     sleep_time = reset - time.time()
 81 |                     if sleep_time > 0:
 82 |                         hours, remainder = divmod(int(sleep_time), 3600)
 83 |                         minutes, seconds = divmod(remainder, 60)
 84 |                         message = f"GitHub Rate limit reached. Sleeping for {hours} hours, {minutes} minutes, and {seconds} seconds. You may go and make coffee.."
 85 |                         self.gx_output.stdout(f"\r\n\033[33m{message}\033[0m", shushable=False, flush=True)
 86 |                         if self.GITHUB_TOKEN == None:
 87 |                             message = f"You should try using a GitHub Access Token, improves the experience significantly and it's easy!"
 88 |                             self.gx_output.stdout(f"\033[33m{message}\033[0m", flush=True)
 89 |                             self.gx_output.stdout("For information on how to create a GitHub API Access Token refer to: ")
 90 |                             self.gx_output.stdout("https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens")
 91 |                             self.gx_output.stdout("For information on GitHub Rate Limits refer to: ")
 92 |                             self.gx_output.stdout("https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api")
 93 |         
 94 |                         time.sleep(sleep_time + 1)  # Sleep until the reset time, plus a little buffer
 95 |                         continue # and restart the loop
 96 | 
 97 |                 if all_results is None:
 98 |                    all_results = data
 99 |                 # if we come from all_results being a list, then we're extending it.
100 |                 elif isinstance(all_results, list):
101 |                     all_results.extend(data)
102 |                 elif isinstance(all_results, dict) and data.get('total_count') != None:
103 |                     all_results[list(all_results.keys())[-1]].extend(list(data.values())[-1])
104 |                 else:
105 |                     all_results.update(data)
106 | 
107 |                 pages_fetched += 1
108 |                 if total_pages is None:
109 |                     total_pages = self.get_total_pages_from_link_header(links)
110 | 
111 |                 # Print progress if total pages is known
112 |                 if total_pages:
113 |                     progress = (pages_fetched / total_pages) * 100
114 |                     elapsed_time = time.time() - start_time
115 |                     avg_time_per_page = elapsed_time / pages_fetched
116 |                     remaining_pages = total_pages - pages_fetched
117 |                     estimated_time_left = remaining_pages * avg_time_per_page
118 |                     time_estimate = f": {estimated_time_left:.0f} seconds left."
119 |                     urlpath = self.get_last_two_path_segments(url)
120 |                     self.gx_output.stdout(f"\rFetching {urlpath} [Hit CTRL^C to skip]: ({progress:.2f}%) {time_estimate}" + " " * 30, flush=True, end="")
121 | 
122 |                 # Reset next_url
123 |                 next_url = None
124 | 
125 |                 # Using "limit" we can cap the amount of results in order to prevent huge amounts of requests.
126 |                 if limit_results == None or \
127 |                     ((isinstance(all_results, list) and len(all_results) < limit_results) \
128 |                     or (isinstance(all_results, dict) and all_results.get('total_count') != None and len(list(all_results.values())[-1]) < limit_results)):
129 |                     if 'rel="next"' in links:
130 |                         for link in links.split(','):
131 |                             if 'rel="next"' in link:
132 |                                 next_url = link.split(';')[0].strip('<> ')
133 |                                 break
134 | 
135 |             except KeyboardInterrupt:
136 |                 self.gx_output.stdout("\r\n\033[33mReceived CTRL+C - Skipping..\033[0m")
137 |                 next_url = None
138 | 
139 | 
140 |         return all_results
141 | 
142 | 
143 |     def fetch_domains_from_code(self, repository):
144 |         matches = self.github_request_json(f"{self.GITHUB_API_BASE_URL}/search/code?q=repo:{repository}%20in:file%20http")
145 |         for m in matches['items']:
146 |             code = base64.b64decode(self.github_request_json(m['url'])["content"]).decode()
147 |             # This by no means is a complete regex - do not rely on this code picking up ALL possible domains
148 |             url_pattern = r'https?://([\w.-]+)'
149 |             # Find all matches in the code content
150 |             matches = re.findall(url_pattern, code)
151 |             return matches
152 | 
153 |     def fetch_repository(self, github_url):
154 |         # Extract owner and repository name from the GitHub URL
155 |         parts = github_url.strip('/').split('/')
156 |         owner = parts[-2]
157 |         repo_name = parts[-1]
158 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{owner}/{repo_name}")
159 | 
160 |     def fetch_repositories_for_org(self, org_url):
161 |         # Extract the Org from the URL
162 |         org = org_url.strip('/').split('/')[-1]
163 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/orgs/{org}/repos")
164 | 
165 |     def fetch_repository_file_contents(self, repository, path):
166 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repository.get('full_name')}/contents/{path}")
167 | 
168 |     def fetch_commits(self, repo, author=None):
169 |         return self.github_request_json(repo.get('commits_url').replace("{/sha}", f'?author={author}' if author != None else ""))
170 | 
171 |     def fetch_ssh_signing_keys(self, login):
172 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/users/{login}/ssh_signing_keys")
173 | 
174 |     def fetch_ssh_auth_keys(self, login):
175 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/users/{login}/keys")
176 | 
177 |     def fetch_gpg_keys(self, login):
178 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/users/{login}/gpg_keys")
179 | 
180 |     def fetch_repository_stargazers(self, repo, limit):
181 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/stargazers", limit_results=limit)
182 | 
183 |     def fetch_repository_custom_values(self, repo):
184 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/properties/values")
185 | 
186 |     def fetch_repository_public_events(self, repo):
187 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/events")
188 | 
189 |     def fetch_repository_commit_comments(self, repo):
190 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/comments")
191 | 
192 |     def fetch_repository_issues_comments(self, repo):
193 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/issues/comments")
194 | 
195 |     def fetch_repository_pulls_comments(self, repo):
196 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/pulls/comments")
197 | 
198 |     def fetch_repository_actions_workflows(self, repo):
199 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/actions/workflows")
200 | 
201 |     def fetch_repository_actions_artifacts(self, repo, limit=None):
202 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/actions/artifacts", limit_results=limit)
203 | 
204 |     def fetch_repository_actions_runs(self, repo, workflow_file=None, limit=None):
205 |         if workflow_file != None:
206 |             return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/actions/workflows/{workflow_file}/runs", limit_results=limit)
207 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/actions/runs", limit_results=limit)
208 | 
209 |     def fetch_repository_releases(self, repo):
210 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/releases")
211 | 
212 |     def fetch_repository_tags(self, repo):
213 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/tags")
214 | 
215 |     def fetch_repository_labels(self, repo):
216 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/labels")
217 | 
218 |     def fetch_repository_branches(self, repo):
219 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/branches")
220 | 
221 |     def fetch_repository_contributors(self, repo):
222 |         data = self.github_request_json(repo.get('contributors_url'), {'anon':1})
223 |         # Disregarding if anon is set to 1/True or not, in extremely large repositories, GH returns a 403 status with:
224 |         # "The history or contributor list is too large to list contributors for this repository via the API."
225 |         if isinstance(data, dict) and data.get("status", "0") == "403": 
226 |             self.gx_output.stdout(f"\r\n\033[33mGitHub's REST API declined returning the list of Contributors with a message:\033[0m", flush=True)
227 |             self.gx_output.stdout(f"\033[33m{data.get('message','')}\033[0m", flush=True)
228 |             return {}
229 |         return data
230 | 
231 |     def fetch_repository_deployments(self, repo):
232 |         return self.github_request_json(repo.get('deployments_url'))
233 | 
234 |     def fetch_repository_environments(self, repo):
235 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/environments")
236 | 
237 |     def fetch_environment_protection_rules(self, repo, environment):
238 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/repos/{repo.get('full_name')}/environments/{environment}/deployment_protection_rules")
239 | 
240 |     def fetch_repository_pull_requests(self, repo):
241 |         return self.github_request_json(repo.get('pulls_url', str()).replace("{/number}",""), {'state':'all'})
242 | 
243 |     def fetch_repository_issues(self, repo):
244 |         return self.github_request_json(repo.get('issues_url').replace("{/number}",""), {'state':'all'})
245 | 
246 |     def fetch_contributor(self, contributor_obj):
247 |         return self.github_request_json(contributor_obj['url'])
248 | 
249 |     def fetch_contributor_contributions(self, repo, contributor_obj):
250 |         return self.github_request_json(repo.get('commits_url').replace("{/sha}", ""), {'author':contributor_obj['login']})
251 | 
252 |     def fetch_contributor_events(self, contributor_obj):
253 |         return self.github_request_json(contributor_obj.get('events_url').replace("{/privacy}", ""))
254 | 
255 |     def search_repositories_by_name(self, name, limit):
256 |         return self.github_request_json(f"{self.GITHUB_API_BASE_URL}/search/repositories", {'q':name, 'type':'repositories','s':'stars','o':'desc'}, limit_results=limit)
257 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gh_public_events.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | from gitxray.include import gh_time
  3 | import datetime
  4 | 
  5 | def log_events(events, gx_output, for_repository=False):
  6 |     if events == None or len(events) == 0: return
  7 | 
  8 |     logging_func = gx_output.c_log if not for_repository else gx_output.r_log
  9 | 
 10 |     event_summary = defaultdict(int)
 11 | 
 12 |     for event in events:
 13 |         etype = event.get('type')
 14 |         ts = event.get('created_at')
 15 |         payload = event.get('payload', {})
 16 |         actor = event.get('actor', {}).get('login', '')+' ' if for_repository else ""
 17 |         repo_name = event.get('repo', {}).get('name', 'unknown')
 18 | 
 19 |         # We're going to summarize recent events by YYYY-MM in order not to flood our console 
 20 |         try: 
 21 |             event_date = gh_time.parse_date(ts)
 22 |             month_key = event_date.strftime('%Y-%m')
 23 |         except Exception:
 24 |             gx_output.debug(f"Invalid date format for event: {ts}")
 25 |             continue
 26 | 
 27 |         action = payload.get('action', 'performed')
 28 |         repo_name = event.get('repo', {}).get('name', 'unknown')
 29 | 
 30 |         # Add one more event type to the summary
 31 |         summary_key = (month_key, etype, action)
 32 |         event_summary[summary_key] += 1
 33 | 
 34 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#commitcommentevent
 35 |         if etype == "CommitCommentEvent":
 36 |                 logging_func(f"{ts}: {actor}created a comment in a commit: [{payload.get('comment').get('html_url')}]", rtype="90d_events")
 37 |                 pass
 38 | 
 39 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#createeventA
 40 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#deleteevent
 41 |         elif etype == "CreateEvent" or etype == "DeleteEvent":
 42 |             action = "created" if etype == "CreateEvent" else "deleted"
 43 |             if payload.get('ref_type') == "repository":
 44 |                 logging_func(f"{ts}: {actor}{action} a repository: [{event.get('repo').get('name')}]", rtype="90d_events")
 45 |             else:
 46 |                 logging_func(f"{ts}: {actor}{action} a {payload.get('ref_type')}: [{payload.get('ref')}] in repo [{event.get('repo').get('name')}]", rtype="90d_events")
 47 | 
 48 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#forkevent
 49 |         elif etype == "ForkEvent":
 50 |             logging_func(f"{ts}: {actor}forked a repository: {event.get('repo').get('name')} into {payload.get('forkee').get('full_name')}", rtype="90d_events")
 51 | 
 52 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#gollumevent
 53 |         elif etype == "GollumEvent":
 54 |             for page in payload.get('pages'):
 55 |                 logging_func(f"{ts}: {actor}{page.get('action')} Wiki page at [{page.get('html_url')}]", rtype="90d_events")
 56 | 
 57 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#issuecommentevent
 58 |         elif etype == "IssueCommentEvent":
 59 |             logging_func(f"{ts}: {actor}{action} a comment in an Issue [{payload.get('issue').get('html_url')}]", rtype="90d_events")
 60 | 
 61 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#issuesevent
 62 |         elif etype == "IssuesEvent":
 63 |                 logging_func(f"{ts}: {actor}{action} an Issue: [{payload.get('issue').get('html_url')}]", rtype="90d_events")
 64 | 
 65 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#memberevent
 66 |         elif etype == "MemberEvent":
 67 |             added_who = payload.get('member').get('login')
 68 |             to_repo = event.get('repo').get('name')
 69 |             logging_func(f"{ts}: {actor}{action} a user [{added_who}] as a collaborator to repo: [{to_repo}]", rtype="90d_events")
 70 | 
 71 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#publicevent
 72 |         elif etype == "PublicEvent":
 73 |             logging_func(f"{ts}: {actor}switched a repository from PRIVATE to PUBLIC, repo: [{event.get('repo').get('name')}]", rtype="90d_events")
 74 | 
 75 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#pullrequestevent
 76 |         elif etype == "PullRequestEvent":
 77 |             logging_func(f"{ts}: {actor}{action} a PR: [{payload.get('pull_request').get('html_url')}]", rtype="90d_events")
 78 | 
 79 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#pullrequestreviewevent
 80 |         elif etype == "PullRequestReviewEvent":
 81 |             logging_func(f"{ts}: {actor}{action} a PR Review: [{payload.get('pull_request').get('html_url')}]", rtype="90d_events")
 82 | 
 83 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#pullrequestreviewcommentevent
 84 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#pullrequestreviewthreadevent
 85 |         elif etype == "PullRequestReviewCommentEvent" or etype == "PullRequestReviewThreadEvent":
 86 |             logging_func(f"{ts}: {actor}{action} a comment in PR: [{payload.get('pull_request').get('html_url')}]", rtype="90d_events")
 87 | 
 88 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#pushevent
 89 |         elif etype == "PushEvent":
 90 |             logging_func(f"{ts}: {actor}pushed a total of {len(payload.get('commits'))} commits from: [{payload.get('ref')}]", rtype="90d_events")
 91 | 
 92 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#releaseevent
 93 |         elif etype == "ReleaseEvent":
 94 |             logging_func(f"{ts}: {actor}published a Release at [{payload.get('release').get('html_url')}]", rtype="90d_events")
 95 | 
 96 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#sponsorshipevent
 97 |         elif etype == "SponsorshipEvent":
 98 |             logging_func(f"{ts}: {actor}{action} a Sponsorship Event]", rtype="90d_events")
 99 | 
100 |         # https://docs.github.com/en/rest/using-the-rest-api/github-event-types?apiVersion=2022-11-28#watchevent
101 |         elif etype == "WatchEvent":
102 |             logging_func(f"{ts}: {actor}starred a repository: [{event.get('repo').get('name')}]", rtype="90d_events")
103 |         else:
104 |             logging_func(f"Missing parser in recent events for: {etype} with {payload}", rtype="debug")
105 | 
106 | 
107 |     # Now let's create the non-debug summarized version of messages.
108 |     for (month, etype, action), count in event_summary.items():
109 |         summary_message = f"In {month}, "
110 |         if etype == "WatchEvent":
111 |             if for_repository:
112 |                 summary_message += f"{count} users starred the repository."
113 |             else:
114 |                 summary_message += f"the user starred {count} repositories."
115 |         elif etype == "ForkEvent":
116 |             if for_repository:
117 |                 summary_message += f"the repository was forked {count} times."
118 |             else:
119 |                 summary_message += f"the user forked {count} repositories"
120 |         elif etype == "SponsorshipEvent":
121 |             if for_repository:
122 |                 summary_message += f"the repository had a sponsorship event."
123 |             else:
124 |                 summary_message += f"the user created a sponsorship event."
125 |         elif etype == "ReleaseEvent":
126 |             if for_repository:
127 |                 summary_message += f"the repository had published {count} releases."
128 |             else:
129 |                 summary_message += f"the user published {count} releases."
130 |         elif etype == "PushEvent":
131 |             if for_repository:
132 |                 summary_message += f"users pushed commits to the repository {count} times."
133 |             else:
134 |                 summary_message += f"the user pushed commits {count} times."
135 |         elif etype == "PullRequestReviewCommentEvent" or etype =="PullRequestReviewThreadEvent":
136 |             if for_repository:
137 |                 summary_message += f"users {action} comments in PRs {count} times."
138 |             else:
139 |                 summary_message += f"the user {action} comments in PRs {count} times."
140 |         elif etype == "PullRequestEvent":
141 |             if for_repository:
142 |                 summary_message += f"users {action} PRs {count} times."
143 |             else:
144 |                 summary_message += f"the user {action} PRs {count} times."
145 |         elif etype == "PullRequestReviewEvent":
146 |             if for_repository:
147 |                 summary_message += f"users {action} PR Reviews {count} times."
148 |             else:
149 |                 summary_message += f"the user {action} PR Reviews {count} times."
150 |         elif etype == "PublicEvent":
151 |             if for_repository:
152 |                 summary_message += f"the repository's visibility switched from private to PUBLIC!"
153 |             else:
154 |                 summary_message += f"the user switched a repository from private to PUBLIC!"
155 |         elif etype == "GollumEvent":
156 |             if for_repository:
157 |                 summary_message += f"users {action} the repository Wiki page {count} times."
158 |             else:
159 |                 summary_message += f"the user {action} repository Wikis {count} times."
160 |         elif etype == "IssueCommentEvent":
161 |             if for_repository:
162 |                 summary_message += f"users {action} comments in repository Issues {count} times."
163 |             else:
164 |                 summary_message += f"the user {action} comments in repository Issues {count} times."
165 |         elif etype == "IssuesEvent":
166 |             if for_repository:
167 |                 summary_message += f"users {action} Issues {count} times."
168 |             else:
169 |                 summary_message += f"the user {action} Issues on a repository {count} times."
170 |         elif etype == "IssuesEvent":
171 |             if for_repository:
172 |                 summary_message += f"users were {action} as collaborators of the repository {count} times."
173 |             else:
174 |                 summary_message += f"the user {action} other users as collaborators to repositories {count} times."
175 |         elif etype == "CommitCommentEvent":
176 |             if for_repository:
177 |                 summary_message += f"users created comments in commits {count} times."
178 |             else:
179 |                 summary_message += f"the user created comments in commits {count} times."
180 |         elif etype == "CreateEvent" or etype == "DeleteEvent":
181 |             action = "created" if etype == "CreateEvent" else "deleted"
182 |             if for_repository:
183 |                 summary_message += f"users {action} a branch or tag {count} times."
184 |             else:
185 |                 summary_message += f"the user {action} a repository, branch or tag {count} times."
186 |         elif etype == "MemberEvent":
187 |             if for_repository:
188 |                 summary_message += f"users were {action} as collaborators {count} times."
189 |             else:
190 |                 summary_message += f"the user {action} a user as a collaborator to a repo {count} times."
191 |         else:
192 |             summary_message += f"{etype}"
193 | 
194 | 
195 |         logging_func(summary_message, rtype="90d_events")
196 |     return
197 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gh_reactions.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from .gx_definitions import REACTIONS_POSITIVE, REACTIONS_NEUTRAL, REACTIONS_NEGATIVE
 3 | 
 4 | def sort_reactions(d):
 5 |     return sorted(d.items(), key=lambda item: item[1], reverse=True)
 6 | 
 7 | def categorize_reactions(comment, positive_reactions, negative_reactions, neutral_reactions):
 8 | 
 9 |     for reaction in REACTIONS_POSITIVE:
10 |         positive_reactions[comment.get('html_url')] += comment.get('reactions').get(reaction, 0)
11 | 
12 |     for reaction in REACTIONS_NEGATIVE:
13 |         negative_reactions[comment.get('html_url')] += comment.get('reactions').get(reaction, 0)
14 | 
15 |     for reaction in REACTIONS_NEUTRAL:
16 |         neutral_reactions[comment.get('html_url')] += comment.get('reactions').get(reaction, 0)
17 | 
18 |     return
19 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gh_time.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime, timezone
 2 | # GitHub states clearly in https://docs.github.com/en/rest/using-the-rest-api/timezones-and-the-rest-api?apiVersion=2022-11-28
 3 | # that "Timestamps returned by the API are in UTC time, ISO 8601 format."
 4 | # Therefore we need to handle as UTC every date we process.
 5 | def parse_date(date_str):
 6 |     # Parse the ISO 8601 date, keeping timezone information
 7 |     if date_str.endswith('Z'):
 8 |         date_str = date_str[:-1] + '+00:00'  # Replace 'Z' with '+00:00' to make it compatible
 9 |     return datetime.fromisoformat(date_str).replace(tzinfo=timezone.utc if not date_str.endswith('Z') else None)
10 | 
11 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_arg_parser.py:
--------------------------------------------------------------------------------
  1 | # Our argparser is called by the Gitxray Context, we don't talk to it directly.
  2 | import os, sys, argparse, re, datetime
  3 | 
  4 | def parse_repositories_from_file(filepath):
  5 |     if not os.path.exists(filepath):
  6 |         raise argparse.ArgumentTypeError(f"File not found: {filepath}")
  7 | 
  8 |     with open(filepath, 'r') as f:
  9 |         repositories = f.read().splitlines()
 10 | 
 11 |     for repo in repositories:
 12 |         validate_repository(repo)
 13 | 
 14 |     print("Loaded {} repositories.".format(len(repositories)))
 15 |     return repositories
 16 | 
 17 | def validate_repository_org_link(repo):
 18 |     if not repo.startswith("https://"):
 19 |         return f'https://github.com/{repo}'
 20 |     return repo
 21 | 
 22 | def validate_contributors(username_string):
 23 |     # Regex pattern to match valid GitHub usernames
 24 |     pattern = r"^[a-zA-Z0-9,-]+(-[a-zA-Z0-9]+)*$"
 25 |     if not re.match(pattern, username_string):
 26 |         raise argparse.ArgumentTypeError(f"Invalid GitHub usernames. Usernames must consist of alphanumeric characters or single hyphens, and cannot begin or end with a hyphen.")
 27 |     usernames = [username.strip() for username in username_string.split(',')]
 28 |     return usernames
 29 | 
 30 | def validate_filters(filter_string):
 31 |     filters = [filter_name.strip() for filter_name in filter_string.split(',')]
 32 |     return filters
 33 | 
 34 | def parse_arguments():
 35 |     parser = argparse.ArgumentParser(description="Gitxray")
 36 | 
 37 |     group = parser.add_mutually_exclusive_group(required=True)
 38 | 
 39 |     group.add_argument('-r', '--repository',
 40 |             type=validate_repository_org_link,
 41 |             action='store',
 42 |             help='The repository to check (Including https://github.com/ is optional)')
 43 | 
 44 |     group.add_argument('-rf', '--repositories-file',
 45 |             type=parse_repositories_from_file,
 46 |             action='store',
 47 |             help='A file containing repositories separated by newlines.')
 48 | 
 49 |     group.add_argument('-o', '--organization',
 50 |             type=validate_repository_org_link,
 51 |             action='store',
 52 |             help='An organization to check all of their repositories (Including https://github.com/ is optional)')
 53 | 
 54 |     group_two = parser.add_mutually_exclusive_group(required=False)
 55 | 
 56 |     group_two.add_argument('-c', '--contributor',
 57 |             type=validate_contributors,
 58 |             action='store',
 59 |             help="A comma-separated list of contributor usernames to focus on within the Repository or Organization that you Gitxray.")
 60 | 
 61 |     group_two.add_argument('-l', '--list',
 62 |             action='store_true',
 63 |             default=False,
 64 |             help="List contributors (if a repository was specified) or List repositories (if an Org was specified). Useful if you intend to then focus on a specific username or repository.")
 65 | 
 66 |     parser.add_argument('-f', '--filters',
 67 |             type=validate_filters,
 68 |             action='store',
 69 |             help="Comma separated keywords to filter results by (e.g. private,macbook).")
 70 | 
 71 |     parser.add_argument('--debug', 
 72 |             action='store_true', 
 73 |             default=False, 
 74 |             help='Enable Debug mode - be prepared for an excessive amount of output.')
 75 | 
 76 |     parser.add_argument('--shush', 
 77 |             action='store_true',
 78 |             default=False, 
 79 |             help='Reduced output in stdout, shushing Gitxray to remove any progress output.')
 80 | 
 81 |     parser.add_argument('-out', '--outfile', 
 82 |             type=str, 
 83 |             action='store',
 84 |             help='Set the location for the output log file.')
 85 | 
 86 |     parser.add_argument('-outformat', '--output-format', type=str, action='store',
 87 |             default='html',
 88 |             help='Format for log file (html,text,json) - default: html',
 89 |             choices = ['html', 'text', 'json'])
 90 | 
 91 |     args = parser.parse_args()
 92 | 
 93 |     # If output format is 'html' and outfile is not specified, set it to current date and time
 94 |     current_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
 95 |     if args.output_format == 'html' and not args.outfile:
 96 |         args.outfile = f"gitxray_{current_datetime}.html"
 97 | 
 98 |     if args.outfile:
 99 |         if os.path.isdir(args.outfile):
100 |             args.outfile = f"{args.outfile}gitxray_{current_datetime}.html"
101 |         if os.path.isfile(args.outfile):
102 |             target = args.outfile
103 |         else:
104 |             target = os.path.dirname(args.outfile)
105 |             if target == '':
106 |                 target = '.'
107 | 
108 |         if not os.access(target, os.W_OK):
109 |             print("[!] Cannot write to output file, exiting")
110 |             sys.exit()
111 | 
112 |     return args
113 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_context.py:
--------------------------------------------------------------------------------
  1 | from . import gx_arg_parser, gx_definitions
  2 | from collections import defaultdict
  3 | import os, re
  4 | 
  5 | class Context:
  6 |     def __init__(self):
  7 |         self._cmd_args = gx_arg_parser.parse_arguments()
  8 |         self._USING_TOKEN = os.environ.get(gx_definitions.ENV_GITHUB_TOKEN, None)
  9 |         self.reset()
 10 | 
 11 |     def reset(self):
 12 |         self._identifier_user_relationship = defaultdict(list)
 13 |         self._outfile_prefix = None
 14 | 
 15 |     def usingToken(self):
 16 |         return self._USING_TOKEN != None
 17 | 
 18 |     def debugEnabled(self):
 19 |         return self._cmd_args.debug
 20 | 
 21 |     def shushEnabled(self):
 22 |         return self._cmd_args.shush
 23 | 
 24 |     def listAndQuit(self):
 25 |         return self._cmd_args.list
 26 | 
 27 |     def getOutputFile(self):
 28 |         outfile = self._cmd_args.outfile
 29 |         if self._outfile_prefix:
 30 |             directory, filename = os.path.split(outfile)
 31 |             slug = re.sub(r'[^A-Za-z0-9_]', '_', self._outfile_prefix)
 32 |             slug = re.sub(r'_+', '_', slug).strip('_')
 33 |             prefixed_filename = f'{slug}_{filename}'
 34 |             return os.path.join(directory, prefixed_filename)
 35 |     
 36 |         return outfile
 37 | 
 38 |     def setOutputFilePrefix(self, prefix):
 39 |         self._outfile_prefix = prefix
 40 |         return
 41 | 
 42 |     def getOutputFormat(self):
 43 |         return self._cmd_args.output_format if self._cmd_args.output_format is not None else "html"
 44 | 
 45 |     def getOutputFilters(self):
 46 |         return self._cmd_args.filters
 47 | 
 48 |     def getContributorScope(self):
 49 |         return self._cmd_args.contributor
 50 | 
 51 |     def getRepositoryTargets(self):
 52 |         return self._cmd_args.repositories_file if self._cmd_args.repositories_file is not None else [self._cmd_args.repository]
 53 | 
 54 |     def setRepositoryTargets(self, targets_list):
 55 |         self._cmd_args.repositories_file = targets_list
 56 |         return
 57 | 
 58 |     def getOrganizationTarget(self):
 59 |         return self._cmd_args.organization
 60 | 
 61 |     def setRepository(self, repository):
 62 |         self._repository = repository
 63 |         return
 64 | 
 65 |     def setContributor(self, contributor):
 66 |         self._contributor = contributor
 67 |         return
 68 | 
 69 |     def setContributors(self, contributors):
 70 |         if contributors is None: contributors = []
 71 |         self._contributors = contributors
 72 |         return
 73 | 
 74 |     def getRepository(self):
 75 |         return self._repository
 76 | 
 77 |     def getContributor(self):
 78 |         return self._contributor
 79 | 
 80 |     def getContributors(self):
 81 |         return self._contributors
 82 | 
 83 |     def isContributor(self, contributor_login):
 84 |         return any(contributor.get('login') == contributor_login for contributor in self.getContributors())
 85 | 
 86 |     def areContributors(self, contributors_logins):
 87 |         return any(contributor.get('login') in contributors_logins for contributor in self.getContributors())
 88 | 
 89 |     # We also use our gitxray context to cross-reference identifiers.
 90 |     def linkIdentifier(self, identifierType, identifierValues, contributorLogin):
 91 |         for identifierValue in identifierValues:
 92 |             if contributorLogin not in self._identifier_user_relationship[(identifierType, identifierValue)]:
 93 |                 self._identifier_user_relationship[(identifierType, identifierValue)].append(contributorLogin)
 94 |         return
 95 | 
 96 |     def getCollisions(self):
 97 |         collisions = defaultdict(list)
 98 |         for (identifierType, identifierValue), contributors in self._identifier_user_relationship.items():
 99 |             if len(contributors) > 1:
100 |                 collisions[(identifierType, identifierValue)].extend(contributors)
101 | 
102 |         return dict(collisions)
103 | 
104 |     def getIdentifierValues(self, identifierType):
105 |         results = defaultdict(list)
106 |         for (currentIdentifierType, identifierValue), contributors in self._identifier_user_relationship.items():
107 |             if currentIdentifierType == identifierType:
108 |                 for contributor in contributors:
109 |                     if identifierValue not in results[contributor]:
110 |                         results[contributor].append(identifierValue)
111 |         return dict(results)
112 | 
113 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_definitions.py:
--------------------------------------------------------------------------------
  1 | # Name of the Environment variable to use for GitHub Tokens
  2 | ENV_GITHUB_TOKEN = "GH_ACCESS_TOKEN"
  3 | 
  4 | # GitHub has historically signed commits made via its web editor with a Key that expired in 2024
  5 | # The latest Key however has no expiration set. The "web-flow" GitHub account owns these keys:
  6 | # GitHub (web-flow commit signing) <noreply@github.com>
  7 | # https://api.github.com/users/web-flow/gpg_keys
  8 | GITHUB_WEB_EDITOR_SIGNING_KEYS = ['4AEE18F83AFDEB23', 'B5690EEEBB952194']
  9 | 
 10 | # This is ocd related, I needed my separators to match.
 11 | SCREEN_SEPARATOR_LENGTH = 100
 12 | 
 13 | REACTIONS_POSITIVE = ['+1', 'heart']
 14 | REACTIONS_NEGATIVE = ['-1']
 15 | REACTIONS_NEUTRAL = ['laugh', 'hooray', 'confused', 'rocket', 'eyes']
 16 | 
 17 | COMMIT_HOURS = {i: f"{i%12 if i%12 else 12}{'am' if i < 12 else 'pm'} UTC" for i in range(24)}
 18 | 
 19 | # For the HTML output format
 20 | HTML_REPORT_EMOJIS = {
 21 |     "metrics": "&#128200;",
 22 |     "urls": "&#127760;",
 23 |     "personal": "&#127380;",
 24 |     "emails": "&#9993;",
 25 |     "profiling": "&#128373;",
 26 |     "commits": "&#128229;",
 27 |     "keys": "&#128273;",
 28 |     "user_input": "&#9000;",
 29 |     "90d_events": "&#128467;",
 30 |     "releases": "&#128640;",
 31 |     "contributors": "&#128101;",
 32 |     "labels": "&#128278;",
 33 |     "comments": "&#128172;",
 34 |     "deployments": "&#128736;",
 35 |     "environments": "&#127757;",
 36 |     "branches": "&#127807;",
 37 |     "tags": "&#127991;",
 38 |     "workflows": "&#128421;",
 39 |     "artifacts": "&#128230;",
 40 |     "signatures": "&#9997;",
 41 |     "association": "&#129309;",
 42 |     "prs": "&#128256;"
 43 | }
 44 | 
 45 | # Identifies user-supplied data as per: https://securitylab.github.com/resources/github-actions-untrusted-input/
 46 | WORKFLOWS_USER_INPUT = {
 47 |     "Issue Title": r'\${{\s*github\.event\.issue\.title\s*}}',
 48 |     "Issue Body": r'\${{\s*github\.event\.issue\.body\s*}}',
 49 |     "Pull Request Title": r'\${{\s*github\.event\.pull_request\.title\s*}}',
 50 |     "Pull Request Body": r'\${{\s*github\.event\.pull_request\.body\s*}}',
 51 |     "Comment Body": r'\${{\s*github\.event\.comment\.body\s*}}',
 52 |     "Review Body": r'\${{\s*github\.event\.review\.body\s*}}',
 53 |     "Review Comment Body": r'\${{\s*github\.event\.review_comment\.body\s*}}',
 54 |     "Page Name in Pages Event": r'\${{\s*github\.event\.pages(?:\.\w+|\[\d+\])\.page_name\s*}}',
 55 |     "Head Commit Message": r'\${{\s*github\.event\.head_commit\.message\s*}}',
 56 |     "Head Commit Author\'s Email": r'\${{\s*github\.event\.head_commit\.author\.email\s*}}',
 57 |     "Head Commit Author\'s Name": r'\${{\s*github\.event\.head_commit\.author\.name\s*}}',
 58 |     "Commit Author\'s Email": r'\${{\s*github\.event\.commits(?:\.\w+|\[\d+\])\.author\.email\s*}}',
 59 |     "Commit Author\'s Name": r'\${{\s*github\.event\.commits(?:\.\w+|\[\d+\])\.author\.name\s*}}',
 60 |     "Pull Request Head Ref": r'\${{\s*github\.event\.pull_request\.head\.ref\s*}}',
 61 |     "Pull Request Head Label": r'\${{\s*github\.event\.pull_request\.head\.label\s*}}',
 62 |     "Pull Request Default Branch": r'\${{\s*github\.event\.pull_request\.head\.repo\.default_branch\s*}}',
 63 |     "Head Ref": r'\${{\s*github\.head_ref\s*}}',
 64 |     "Inputs in Event": r'\${{\s*github\.event\.inputs(?:\.\w+|\[\w+\])\s*}}',
 65 | }
 66 | 
 67 | OPENPGP_SIG_TYPES = {
 68 |     0x00: "Signature of a binary document",
 69 |     0x01: "Signature of a canonical text document",
 70 |     0x02: "Standalone signature",
 71 |     0x10: "Generic certification of a User ID and Public-Key packet",
 72 |     0x11: "Persona certification of a User ID and Public-Key packet",
 73 |     0x12: "Casual certification of a User ID and Public-Key packet",
 74 |     0x13: "Positive certification of a User ID and Public-Key packet",
 75 |     0x18: "Subkey Binding Signature",
 76 |     0x19: "Primary Key Binding Signature",
 77 |     0x1F: "Signature directly on a key",
 78 |     0x20: "Key revocation signature",
 79 |     0x28: "Subkey revocation signature",
 80 |     0x30: "Certification revocation signature"
 81 | }
 82 | 
 83 | OPENPGP_PK_ALGOS = {
 84 |     1: "RSA (Encrypt or Sign)",
 85 |     2: "RSA Encrypt-Only",
 86 |     3: "RSA Sign-Only",
 87 |     16: "Elgamal Encrypt-Only",
 88 |     17: "DSA",
 89 |     18: "Reserved for Elliptic Curve",
 90 |     19: "Reserved for ECDSA",
 91 |     20: "Reserved (formerly Elgamal Encrypt or Sign)",
 92 |     21: "Reserved for Diffie-Hellman"
 93 | }
 94 | 
 95 | OPENPGP_HASH_ALGOS = {
 96 |     1: "MD5",
 97 |     2: "SHA-1",
 98 |     3: "RIPEMD-160",
 99 |     8: "SHA256",
100 |     9: "SHA384",
101 |     10: "SHA512",
102 |     11: "SHA224"
103 | }
104 | 
105 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_output.py:
--------------------------------------------------------------------------------
  1 | import json, random, os, html, re, datetime, sys
  2 | from collections import defaultdict
  3 | from . import gx_definitions
  4 | 
  5 | class Output:
  6 |     ANSI_COLORS = { "RED": "\033[31m", "GREEN": "\033[32m", "YELLOW": "\033[33m", "BLUE": "\033[34m", "MAGENTA": "\033[35m", "CYAN": "\033[36m", "BRIGHT_RED": "\033[91m", "BRIGHT_GREEN": "\033[92m", "BRIGHT_YELLOW": "\033[93m", "BRIGHT_BLUE": "\033[94m", "BRIGHT_MAGENTA": "\033[95m", "BRIGHT_CYAN": "\033[96m", "BRIGHT_WHITE": "\033[97m", "RESET": "\033[0m" }
  7 | 
  8 | 
  9 |     def __init__(self, gx_context, outfile=None, outformat='text'):
 10 | 
 11 |         self._debug = gx_context.debugEnabled()
 12 | 
 13 |         self._outformat = gx_context.getOutputFormat()
 14 | 
 15 |         self._filters = gx_context.getOutputFilters()
 16 |         self._rtype_color_map = {}
 17 | 
 18 |         self._cscope = gx_context.getContributorScope() 
 19 | 
 20 |         self._gxcontext = gx_context
 21 | 
 22 |         self.reset()
 23 | 
 24 |     def reset(self):
 25 |         self._repositories = {}
 26 |         self._contributors = {}
 27 |         self._anonymous = {}
 28 |         self._keys = {}
 29 |         self._index = None
 30 |         self._repository = None
 31 |         self._contributor = None
 32 |  
 33 | 
 34 |     def _log(self, log_type, data="Default log data", rtype="info", identifier=None):
 35 |         if not identifier:
 36 |             raise Exception("You need to specify an identifier.")
 37 | 
 38 |         storage = getattr(self, f"_{log_type}")
 39 | 
 40 |         if identifier not in storage:
 41 |             storage[identifier] = {}
 42 | 
 43 |         if rtype not in storage[identifier]:
 44 |             storage[identifier][rtype] = []
 45 | 
 46 |         storage[identifier][rtype].append(data)
 47 | 
 48 |     def r_log(self, data="You called r_log without specifying data", rtype="info", repository=None):
 49 |         if repository: self._repository = repository
 50 |         self._log('repositories', data, rtype, self._repository)
 51 | 
 52 |     def c_log(self, data="You called c_log without specifying data", rtype="info", contributor=None):
 53 |         if contributor: self._contributor = contributor
 54 | 
 55 |         if self._cscope and (self._contributor not in self._cscope): return
 56 |         self._log('contributors', data, rtype, self._contributor)
 57 | 
 58 |     def a_log(self, data="You called a_log without specifying data", rtype="info", anonymous=None):
 59 |         self._log('anonymous', data, rtype, anonymous)
 60 | 
 61 | 
 62 |     # Direct output, not really waiting for results to print this out.
 63 |     def debug(self, message):
 64 |         if self._debug:
 65 |             colored_message = f"{self.ANSI_COLORS['YELLOW']}[D]{self.ANSI_COLORS['RESET']} {message}"
 66 |             return print(colored_message)
 67 | 
 68 |     def debug_enabled(self):
 69 |         return self._debug
 70 |    
 71 |     # Direct output, not really waiting for results to print this out.
 72 |     def warn(self, message, shushable=True):
 73 |         colored_message = f"{self.ANSI_COLORS['YELLOW']}{message}{self.ANSI_COLORS['RESET']}"
 74 |         return self.stdout(colored_message, shushable)
 75 | 
 76 |     def notify(self, message, shushable=True):
 77 |         colored_message = f"{self.ANSI_COLORS['BRIGHT_BLUE']}{message}{self.ANSI_COLORS['RESET']}"
 78 |         return self.stdout(colored_message, shushable)
 79 | 
 80 |     # Stdout goes through here
 81 |     def stdout(self, message, shushable=True, end='\n', flush=True):
 82 |         if shushable and self._gxcontext.shushEnabled(): return
 83 |         return print(message, end=end, flush=flush)
 84 | 
 85 |     def get_rtype_color(self, rtype):
 86 |         if rtype not in self._rtype_color_map:
 87 |             self._rtype_color_map[rtype] = random.choice(list(self.ANSI_COLORS.values())[:-1])
 88 |         return self._rtype_color_map[rtype]
 89 | 
 90 |     def _print_output(self, data_source, entity_string, skip_ansi=False):
 91 |         output = ""
 92 | 
 93 |         if skip_ansi: temp_colors = {} # empty colors
 94 |         else: temp_colors = self.ANSI_COLORS
 95 | 
 96 |         reset_color = temp_colors.get('RESET','')
 97 | 
 98 |         # Gather all unique rtypes from both repositories and contributors
 99 |         all_rtypes = {rtype for data in self._repositories.values() for rtype in data.keys()}
100 |         all_rtypes.update(rtype for data in self._contributors.values() for rtype in data.keys())
101 | 
102 |         # Find the longest rtype for formatting purposes
103 |         max_rtype_length = max(len(rtype) for rtype in all_rtypes) if len(all_rtypes) > 0 else 0
104 | 
105 |         no_results = []
106 |         for entity, data in data_source.items():
107 | 
108 |             result_lines = []
109 |             for rtype in data.keys():
110 |                 if not self.debug_enabled() and ("debug" in rtype.lower()):
111 |                     continue
112 | 
113 |                 random_color = "" if skip_ansi else (self.get_rtype_color(rtype) if data_source != self._anonymous else temp_colors.get('BLUE',''))
114 |                 formatted_rtype = f"[{rtype}]:".ljust(max_rtype_length + 1)
115 | 
116 |                 for line in data[rtype]:
117 |                     outline = f"{random_color}{formatted_rtype}{reset_color} {line}\n" 
118 |                     if self._filters != None and (all(f.lower() not in outline.lower() for f in self._filters)):
119 |                         continue
120 |                     result_lines.append(outline)
121 | 
122 | 
123 |             if len(result_lines) > 0: 
124 |                 output += f"#" * gx_definitions.SCREEN_SEPARATOR_LENGTH + "\n"
125 |                 output += f"{temp_colors.get('GREEN','')}Found results{temp_colors.get('RESET','')} for {entity_string}.".replace("ENTITY_STR", entity)
126 |                 if self._filters != None:
127 |                     color = temp_colors.get('BRIGHT_BLUE','')
128 |                     output += f" {color}Filters applied: {str(self._filters)}{reset_color}\n"
129 |                 else: output += "\r\n"
130 |                 output += "".join(result_lines)
131 | 
132 |             else:
133 |                 output += f"#" * gx_definitions.SCREEN_SEPARATOR_LENGTH + "\n"
134 |                 output += f"No results to show for {entity_string}.".replace("ENTITY_STR", entity)
135 |                 if self._filters: output += f" Try removing filters."
136 |                 output += "\n"
137 | 
138 |         if len(no_results) > 0:
139 |             output += f"#" * gx_definitions.SCREEN_SEPARATOR_LENGTH + "\n"
140 |             output += f"No results found for {entity_string}.\n".replace("ENTITY_STR", ",".join(no_results))
141 |             
142 |         return output
143 | 
144 | 
145 |     def html_data_sanitize_and_process(self, text):
146 |         # html.escape already escapes { and } to prevent expression injection
147 |         sanitized_text = html.escape(text, quote=True)
148 | 
149 |         # New pattern to match URLs both inside and outside of brackets
150 |         url_pattern = re.compile(r'\[(https?://[^\s\]]+)\]|\b(https?://[^\s]+)')
151 |     
152 |         # Function to handle matching groups
153 |         def replace_url(match):
154 |             url = match.group(1) or match.group(2)
155 |             return f'<a href="{url}" target="_blank">{url}</a>'
156 | 
157 |         # Substitute using the custom function
158 |         clickable_text = url_pattern.sub(replace_url, sanitized_text)
159 |         return clickable_text
160 | 
161 |     def _create_html_output(self):
162 |         TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "html_report")
163 | 
164 |         # Load all template files
165 |         templates = {
166 |             name: open(os.path.join(TEMPLATE_DIR, f"template_{name}.html"), "r", encoding="utf-8").read()
167 |             for name in ["main", "repository", "contributor", "non_contributor", "table", "highlights"]
168 |         }
169 | 
170 |         category_sections = ""
171 |         contributor_sections = ""
172 |         more_sections = ""
173 |         highlights_section = ""
174 |         repository_sections = ""
175 |         repository_sidebar_links = ""
176 |         contributor_sidebar_links = ""
177 |         category_sidebar_links = ""
178 |         more_sidebar_links = ""
179 |         highlights_rows = []
180 | 
181 |         for entity, data in self._repositories.items():
182 |             sanitized_entity_raw = self.html_data_sanitize_and_process(entity)
183 |             sanitized_entity = sanitized_entity_raw.replace("/","_")
184 |             r_template = templates['repository'].replace("{{repository_id}}", str(sanitized_entity))
185 | 
186 |             r_tables = []
187 |             r_sidebar_links = []
188 |             for rtype in data.keys():
189 |                 if not self.debug_enabled() and ("debug" in rtype.lower()): continue
190 |                 data_rows = []
191 |                 for line in data[rtype]:
192 |                     if "warning: " in line.lower(): highlights_rows.append(f"<tr><td>{rtype}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
193 |                     if self._filters != None and (all(f.lower() not in f'{rtype.lower()} {line.lower()}' for f in self._filters)): continue
194 |                     data_rows.append(f"<tr><td>{rtype}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
195 | 
196 |                 if len(data_rows) > 0:
197 |                     r_sidebar_links.append('<li class="nav-item"><a class="nav-link" href="#repository_'+str(sanitized_entity)+'_'+str(rtype)+'">'+str(rtype)+' '+gx_definitions.HTML_REPORT_EMOJIS.get(rtype,"")+'</a></li>')
198 |                     r_tables.append(templates['table'].replace("{{table_rows}}", "".join(data_rows)).replace("{{table_title}}", f"{rtype} {gx_definitions.HTML_REPORT_EMOJIS.get(rtype,'')}").replace("{{table_id}}", "repository_"+str(sanitized_entity)+"_"+rtype))
199 | 
200 |             if len(r_tables) > 0:
201 |                 repository_sidebar_links += '<ul class="nav flex-column mb-0"><li class="nav-item"><a class="nav-link collapsed" data-bs-toggle="collapse" role="button" aria-expanded="false" aria-controls="nav_'+str(sanitized_entity)+'" href="#nav_'+str(sanitized_entity)+'">'+str(sanitized_entity_raw)+' &#128193;</a><div class="collapse" id="nav_'+str(sanitized_entity)+'"><ul class="nav flex-column ms-3">'
202 |                 repository_sidebar_links += "".join(r_sidebar_links)
203 |                 repository_sidebar_links += '</ul></div></li></ul>'
204 |                 r_template = r_template.replace("{{repository_tables}}", "".join(r_tables))
205 |                 repository_sections += r_template
206 |             else:
207 |                 repository_sections += "<h5>No Results</h5>"
208 | 
209 | 
210 |         # We now merge all rtypes across all contributor results
211 |         tables_by_rtype = {}
212 |         for entity, data in self._contributors.items():
213 |             # Skip non-contributors
214 |             if not self._gxcontext.isContributor(entity): continue
215 |             
216 |             sanitized_entity = self.html_data_sanitize_and_process(entity)
217 |             
218 |             # Loop through all rtypes for the current contributor
219 |             for rtype in data.keys():
220 |                 if not self.debug_enabled() and ("debug" in rtype.lower()): continue
221 |                 
222 |                 if rtype not in tables_by_rtype:
223 |                     tables_by_rtype[rtype] = ""
224 | 
225 |                 for line in data[rtype]:
226 |                     tables_by_rtype[rtype] += f"<tr><td><a href='#contributor-section-{sanitized_entity}'>{sanitized_entity}</a></td><td>{self.html_data_sanitize_and_process(line)}</td></tr>"
227 | 
228 |         for rtype, table_rows in tables_by_rtype.items():
229 |             if self._filters != None and (all(f.lower() not in f'{rtype.lower()} {table_rows.lower()}' for f in self._filters)): continue
230 |             category_sidebar_links += '<ul class="nav flex-column mb-0"><li class="nav-item"><a href="#nav_category_'+str(rtype)+'">'+str(rtype)+' '+gx_definitions.HTML_REPORT_EMOJIS.get(rtype,"")+'</a></li></ul>'
231 |             table_html = templates['table'].replace("{{table_rows}}", table_rows) \
232 |                                        .replace("{{table_title}}", f"{rtype} {gx_definitions.HTML_REPORT_EMOJIS.get(rtype, '')}") \
233 |                                        .replace("{{table_id}}", f"nav_category_{rtype}")
234 |             category_sections += table_html
235 | 
236 | 
237 |         for entity, data in self._contributors.items():
238 |             # In the HTML report we skip any non-contributor results when showing contributor results
239 |             if not self._gxcontext.isContributor(entity): 
240 |                 continue
241 | 
242 |             sanitized_entity = self.html_data_sanitize_and_process(entity)
243 |             c_template = templates['contributor'].replace("{{contributor_id}}", str(sanitized_entity))
244 |             c_template = c_template.replace("{{contributor_name}}", str(sanitized_entity) + "&#128193;")
245 | 
246 |             c_tables = []
247 |             c_sidebar_links = []
248 |             for rtype in data.keys():
249 |                 if not self.debug_enabled() and ("debug" in rtype.lower()): continue
250 |                 data_rows = []
251 |                 for line in data[rtype]:
252 |                     if "warning: " in line.lower(): highlights_rows.append(f"<tr><td>{rtype}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
253 |                     if self._filters != None and (all(f.lower() not in f'{rtype.lower()} {line.lower()}' for f in self._filters)): continue
254 |                     data_rows.append(f"<tr><td>{sanitized_entity}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
255 | 
256 |                 if len(data_rows) > 0:
257 |                     c_sidebar_links.append('<li class="nav-item"><a class="nav-link" href="#contributor_'+str(sanitized_entity)+'_'+str(rtype)+'">'+str(rtype)+' '+gx_definitions.HTML_REPORT_EMOJIS.get(rtype,"")+'</a></li>')
258 |                     c_tables.append(templates['table'].replace("{{table_rows}}", "".join(data_rows)).replace("{{table_title}}", f"{rtype} {gx_definitions.HTML_REPORT_EMOJIS.get(rtype,'')}").replace("{{table_id}}", "contributor_"+str(sanitized_entity)+"_"+str(rtype)))
259 | 
260 |             if len(c_tables) > 0:
261 |                 contributor_sidebar_links += '<ul class="nav flex-column mb-0"><li class="nav-item"><a class="nav-link collapsed" data-bs-toggle="collapse" role="button" aria-expanded="false" aria-controls="nav_'+str(sanitized_entity)+'" href="#nav_'+str(sanitized_entity)+'">'+str(sanitized_entity)+' &#128193;</a><div class="collapse" id="nav_'+str(sanitized_entity)+'"><ul class="nav flex-column ms-3">'
262 |                 contributor_sidebar_links += "".join(c_sidebar_links)
263 |                 contributor_sidebar_links += '</ul></div></li></ul>'
264 |                 c_template = c_template.replace("{{contributor_tables}}", "".join(c_tables))
265 |                 contributor_sections += c_template
266 | 
267 |         if len(self._anonymous) > 0 and len(next(iter(self._anonymous.values()))) > 1:
268 |             for entity, data in self._anonymous.items():
269 |                 sanitized_entity = "Anonymous"
270 |                 a_template = templates['non_contributor'].replace("{{non_contributor_id}}", str(sanitized_entity))
271 |                 a_template = a_template.replace("{{non_contributor_name}}", f'{sanitized_entity} &#128123;')
272 |                 more_sidebar_links += '<ul class="nav flex-column mb-0"><li class="nav-item"><a class="nav-link collapsed" data-bs-toggle="collapse" role="button" aria-expanded="false" aria-controls="nav_'+str(sanitized_entity)+'" href="#nav_'+str(sanitized_entity)+'">'+str(sanitized_entity)+' &#128123;</a><div class="collapse" id="nav_'+str(sanitized_entity)+'"><ul class="nav flex-column ms-3">'
273 |                 a_tables = ""
274 |                 for rtype in data.keys():
275 |                     data_rows = []
276 |                     for line in data[rtype]:
277 |                         if "warning: " in line.lower(): highlights_rows.append(f"<tr><td>{rtype}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
278 |                         if self._filters != None and (all(f.lower() not in f'{rtype.lower()} {line.lower()}' for f in self._filters)): continue
279 |                         data_rows.append(f"<tr><td>{rtype}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>")
280 | 
281 |                     if len(data_rows) > 0:
282 |                         more_sidebar_links += '<li class="nav-item"><a class="nav-link" href="#contributor_'+str(sanitized_entity)+'_'+str(rtype)+'">'+str(rtype)+'</a></li>'
283 | 
284 |                     a_tables += templates['table'].replace("{{table_rows}}", "".join(data_rows)).replace("{{table_title}}", str(rtype)).replace("{{table_id}}", "contributor_"+str(sanitized_entity)+"_"+str(rtype))
285 | 
286 |                 more_sidebar_links += '</ul></div></li></ul>'
287 |                 a_template = a_template.replace("{{non_contributor_tables}}", a_tables)
288 |                 more_sections += a_template
289 |         else:
290 |             more_sidebar_links = '<ul class="nav flex-column mb-0"><li class="nav-item"><ul class="nav flex-column ms-3"><li class="nav-item">No results</li></ul></li></ul>'
291 |             more_sections = '<h5>No anonymous Contributors found</h5>'
292 | 
293 | 
294 |         # We now merge all rtypes across all non-contributor results
295 |         tables_by_rtype = {}
296 |         for entity, data in self._contributors.items():
297 |             # Skip contributors this time
298 |             if self._gxcontext.isContributor(entity): continue
299 | 
300 |             sanitized_entity = self.html_data_sanitize_and_process(entity)
301 |             for rtype in data.keys():
302 |                 if not self.debug_enabled() and ("debug" in rtype.lower()): continue
303 | 
304 |                 if rtype not in tables_by_rtype:
305 |                     tables_by_rtype[rtype] = ""
306 | 
307 |                 for line in data[rtype]:
308 |                     tables_by_rtype[rtype] += f"<tr><td>{sanitized_entity}</td><td>{self.html_data_sanitize_and_process(line)}</td></tr>"
309 | 
310 |         for rtype, table_rows in tables_by_rtype.items():
311 |             if self._filters != None and (all(f.lower() not in f'{rtype.lower()} {table_rows.lower()}' for f in self._filters)): continue
312 |             more_sidebar_links += '<ul class="nav flex-column mb-0"><li class="nav-item"><a href="#nav_more_'+str(rtype)+'">'+str(rtype)+' '+gx_definitions.HTML_REPORT_EMOJIS.get(rtype,"")+'</a></li></ul>'
313 |             table_html = templates['table'].replace("{{table_rows}}", table_rows) \
314 |                                        .replace("{{table_title}}", f"{rtype} {gx_definitions.HTML_REPORT_EMOJIS.get(rtype, '')}") \
315 |                                        .replace("{{table_id}}", f"nav_more_{rtype}")
316 |             more_sections += table_html
317 | 
318 | 
319 |         # We now have all highlights under highlights_rows; let's fill the highlights table and section of the report
320 |         if len(highlights_rows) > 0:
321 |             highlights_section = templates['table'].replace("{{table_rows}}", "".join(highlights_rows)).replace("{{table_title}}", "Highlights").replace("{{table_id}}", "highlights")
322 |         else: highlights_section = "<br/><h5>No results were highlighted by Gitxray.</h5>"
323 | 
324 |         output = templates['main'].replace("{{repository_sections}}", repository_sections)
325 |         # repository sidebar links
326 |         output = output.replace("{{repository_sidebar_links}}", repository_sidebar_links)
327 |         # category sidebar links
328 |         output = output.replace("{{category_sidebar_links}}", category_sidebar_links)
329 |         # contributors sidebar links
330 |         output = output.replace("{{contributor_sidebar_links}}", contributor_sidebar_links)
331 |         # more sidebar links
332 |         output = output.replace("{{more_sidebar_links}}", more_sidebar_links)
333 | 
334 |         # highlights section
335 |         output = output.replace("{{highlights_section}}", highlights_section)
336 | 
337 |         output = output.replace("{{category_sections}}", category_sections)
338 |         output = output.replace("{{contributor_sections}}", contributor_sections)
339 |         output = output.replace("{{report_date}}", datetime.datetime.now().strftime("%B %d, %Y"))
340 |         output = output.replace("{{more_sections}}", more_sections)
341 | 
342 | 
343 |         if self._filters != None:
344 |             output = output.replace("{{filters_html_text}}",f" with <strong>FILTERS ENABLED</strong>: <i>{self._filters}</i>. Disable Filters to get more results")
345 |         else:
346 |             output = output.replace("{{filters_html_text}}","")
347 | 
348 |         return output
349 | 
350 |     def _create_text_output(self, skip_ansi):
351 |         output = self._print_output(self._repositories, f"Repository https://github.com/ENTITY_STR", skip_ansi)
352 |         output += self._print_output(self._contributors, f"account ENTITY_STR", skip_ansi)
353 | 
354 |         if len(self._anonymous) > 0 and len(next(iter(self._anonymous.values()))) > 1: # Did this so that I don't hardcode "#" as an index
355 |             output += self._print_output(self._anonymous, "Anonymous Contributors (those with no GitHub account)", skip_ansi)
356 |         else:
357 |             output += f"#" * gx_definitions.SCREEN_SEPARATOR_LENGTH + "\n"
358 |         return output
359 | 
360 |     def _create_json_output(self):
361 |         data = { "repositories": [] }
362 | 
363 |         for repo_name, repo_info in self._repositories.items():
364 |             repo_data = { "name": repo_name, "contributors": [], "anonymous_contributors": [], "results": {} }
365 | 
366 |             for rtype, rtype_values in repo_info.items():
367 |                 repo_data["results"][rtype] = rtype_values
368 | 
369 |         for contributor_name, contrib_details in self._contributors.items():
370 |             contrib_data = { "name": contributor_name, "results": {} }
371 | 
372 |             for rtype, rtype_values in contrib_details.items():
373 |                 contrib_data["results"][rtype] = rtype_values
374 | 
375 |             repo_data["contributors"].append(contrib_data)
376 | 
377 |         for contributor_email, contrib_details in self._anonymous.items():
378 |             contrib_data = { }
379 | 
380 |             for rtype, rtype_values in contrib_details.items():
381 |                 contrib_data[rtype] = rtype_values
382 | 
383 |             repo_data["anonymous_contributors"].append(contrib_data)
384 | 
385 | 
386 |         data["repositories"].append(repo_data)
387 | 
388 |         json_output = json.dumps(data, indent=4)  # 'indent' for pretty printing
389 |         return json_output
390 | 
391 |     def doOutput(self):
392 |         if self._outformat == 'html':
393 |             output = self._create_html_output()
394 |         elif self._outformat == 'text':
395 |             output = self._create_text_output(self._gxcontext.getOutputFile())
396 |         elif self._outformat == 'json':
397 |             output = self._create_json_output()
398 |         else:
399 |             raise ValueError("Unsupported format!")
400 | 
401 |         if self._gxcontext.getOutputFile(): 
402 |             self._outfile = open(self._gxcontext.getOutputFile(), 'w+')
403 |             self.warn(f"Writing output to [{self._outfile.name}] in format [{self._outformat}]", shushable=False)
404 |             self._outfile.write(output)
405 |             self._outfile.write("\n")
406 |         else:
407 |             print(output)
408 | 
409 |         # Now reset persisting data!
410 |         self.reset()
411 | 
412 |     def testOutputFile(self):
413 |         outfile = self._gxcontext.getOutputFile()
414 |         if outfile:
415 |             if os.path.isdir(outfile):
416 |                 print("[!] Can't specify a directory as the output file, exiting.")
417 |                 sys.exit()
418 |             if os.path.isfile(outfile):
419 |                 target = outfile
420 |             else:
421 |                 target = os.path.dirname(outfile)
422 |                 if target == '':
423 |                     target = '.'
424 | 
425 |             if not os.access(target, os.W_OK):
426 |                 print("[!] Cannot write to output file, exiting")
427 |                 sys.exit()
428 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_ugly_openpgp_parser.py:
--------------------------------------------------------------------------------
  1 | # Calling this Ugly because I can, but also because it assumes more than it should. 
  2 | # Made this with trial and error, and love.
  3 | # https://www.rfc-editor.org/rfc/rfc4880 - for specs on OpenPGP parsing
  4 | # https://cirw.in/gpg-decoder/  - for OpenPGP debugging, cheers to Conrad Irwin
  5 | # Oh, and why write our own? Rather reinvent the wheel in this case than rely on more ext dependencies.
  6 | import re, base64, datetime
  7 | from .gx_definitions import OPENPGP_SIG_TYPES, OPENPGP_PK_ALGOS, OPENPGP_HASH_ALGOS 
  8 | 
  9 | 
 10 | # data must point to the first field containing the length of (un)hashed subpackets
 11 | def ugly_inhouse_subpacket_parser(data, output_attributes):
 12 |     subpackets_length = int.from_bytes(data[:2], byteorder='big')
 13 |     subpacket_data_start = 2
 14 |     subpacket_data_end = subpacket_data_start + subpackets_length
 15 |     subpacket_data = data[subpacket_data_start:subpacket_data_end]
 16 |     current_pos = 0
 17 |     while current_pos < len(subpacket_data):
 18 |         subpacket_length = subpacket_data[current_pos]
 19 |         subpacket_type = subpacket_data[current_pos + 1]
 20 |         #print(f'{hex(subpacket_type)} subpacket_type found, len is {subpacket_length}')
 21 |         if subpacket_type == 16:  # Issuer Key ID subpacket
 22 |             key_id_bytes = subpacket_data[current_pos + 2:current_pos + 2 + subpacket_length - 1]
 23 |             key_id = key_id_bytes.hex().upper()
 24 |             output_attributes.update({"pgp_keyid":key_id})
 25 |         if subpacket_type == 2:  # Signature Creation Time subpacket
 26 |             # Ensure the subpacket length is enough for 4-byte timestamp
 27 |             if subpacket_length >= 5:
 28 |                 creation_time_data = subpacket_data[current_pos + 2:current_pos + 6]
 29 |                 creation_time = int.from_bytes(creation_time_data, byteorder='big')
 30 |                 output_attributes.update({"pgp_signature_creation_time":datetime.datetime.utcfromtimestamp(creation_time)})
 31 | 
 32 |         current_pos += subpacket_length + 1
 33 | 
 34 |     return subpacket_data_end
 35 | 
 36 | def parse_openpgp_fields(decoded_data):
 37 |     attributes = {}
 38 |     total_bytes = len(decoded_data)
 39 | 
 40 |     unhashed_subpackets = False
 41 |     if decoded_data[0] & 0x80:  
 42 |         if decoded_data[0] & 0x40:  # We assume no unhashed subpackets
 43 |             packet_type = decoded_data[0] & 0x3F
 44 |         else:  # Supposedly an Old format; it contains unhashed subpackets
 45 |             packet_type = (decoded_data[0] & 0x3C) >> 2
 46 |             unhashed_subpackets = True
 47 | 
 48 |         offset = 3
 49 | 
 50 |         version = decoded_data[offset]
 51 |         if packet_type == 2:  # Signature packet
 52 |             attributes['pgp_signature_version'] = version
 53 | 
 54 |             if version == 4:
 55 |                 offset += 1 
 56 |                 attributes['pgp_sig_type'] = OPENPGP_SIG_TYPES.get(decoded_data[offset], "Unknown")
 57 |                 offset += 1 
 58 |                 attributes['pgp_publicKeyAlgorithm'] = OPENPGP_PK_ALGOS.get(decoded_data[offset], "Unknown")
 59 |                 offset += 1 
 60 |                 attributes['pgp_hashAlgorithm'] = OPENPGP_HASH_ALGOS.get(decoded_data[offset], "Unknown")
 61 |                 offset += 1 
 62 | 
 63 |                 subpacket_data_end = ugly_inhouse_subpacket_parser(decoded_data[offset:], attributes)
 64 |                 if unhashed_subpackets: 
 65 |                     ugly_inhouse_subpacket_parser(decoded_data[subpacket_data_end+offset:], attributes)
 66 | 
 67 |         elif packet_type == 6: # Public key packet
 68 |             attributes['key_version'] = version
 69 |             offset += 1
 70 |             attributes['creation_time'] = int.from_bytes(decoded_data[offset:offset+4], byteorder='big')
 71 |             offset += 4 
 72 |             if version == 4:
 73 |                 attributes['pgp_publicKeyAlgorithm'] = OPENPGP_PK_ALGOS.get(decoded_data[offset], "Unknown")
 74 |                 offset += 1
 75 |                 # we're going to skip analysis on N for now and go straight to trivial data
 76 |                 n_length = int.from_bytes(decoded_data[offset:offset+2], byteorder='big') // 8
 77 |                 offset += 2
 78 |                 # here comes N but we're going to get past it
 79 |                 offset += n_length
 80 |                 # and same for E; all of those RSA CTF challenges giving us 'the look' right now.
 81 |                 e_length = int.from_bytes(decoded_data[offset:offset+2], byteorder='big') // 8
 82 |                 offset += 2 
 83 |                 # here comes E
 84 |                 offset += e_length
 85 | 
 86 |                 if offset < total_bytes and decoded_data[offset] == 0x01 and decoded_data[offset+1] == 0xb4:
 87 |                     offset += 2
 88 |                     userid_length = decoded_data[offset]
 89 |                     offset += 1 
 90 |                     userId = decoded_data[offset:offset+userid_length]
 91 |                     try: # ugly temporary fix for encodings in userId
 92 |                         attributes['userId'] = userId.decode('utf-8')
 93 |                     except: # some exceptional cases appear to be triggering exceptions
 94 |                         attributes['userId'] = userId
 95 |                     
 96 | 
 97 |         else:
 98 |             print(f'OMG packet_type was: {packet_type} - This was unexpected!!')
 99 | 
100 |     return attributes
101 | 
102 | 
103 | 
104 | # More information on how I'm parsing signature and key blobs in RFC4880 (https://www.rfc-editor.org/rfc/rfc4880)
105 | def ugly_inhouse_openpgp_block(pgp_armored_input):
106 | 
107 |     # We've found data before in weird places, likely tasty user input.. hidden messages, ..we want to capture them <3
108 |     malformed_beginning = re.search(r'(.+)\r?\n?-----BEGIN', pgp_armored_input.replace('\r','').replace('\n',''), re.MULTILINE)
109 |     malformed_ending = re.search(r'END PGP PUBLIC KEY BLOCK-----\r?\n?(.+)$', pgp_armored_input.replace('\r','').replace('\n',''), re.MULTILINE)
110 |     if malformed_beginning != None or malformed_ending != None: 
111 |         return {
112 |             "malformed_beginning": malformed_beginning.group(1) if malformed_beginning else None,
113 |             "malformed_ending": malformed_ending.group(1) if malformed_ending else None
114 |         }
115 | 
116 |     # If we get here, there was nothing malformed prior or after the Key. Signatures are created by GitHub so.. unlikely they are broken.
117 |     # And the magic awful parsing begins..
118 | 
119 |     # format the data a bit by removing unwanted strings and chars, also consider a potential Version
120 |     base64_str = re.sub(r'-----BEGIN PGP SIGNATURE-----|-----BEGIN PGP PUBLIC KEY BLOCK-----', '', pgp_armored_input)
121 |     base64_str = re.sub(r'Charset: (.+)\r?\n?', '', base64_str)
122 |     base64_str = re.sub(r'Version: (.+)\r?\n?', '', base64_str)
123 |     base64_str = re.sub(r'Comment: (.+)\r?\n?', '', base64_str)
124 |     base64_str = re.sub(r'-----END PGP (.+)', '', base64_str)
125 |     base64_str = re.sub(r'\s+', '', base64_str)
126 |     decoded_blob = base64.b64decode(base64_str.encode('ascii', 'ignore'))
127 | 
128 |     try:
129 |         openpgp_findings = parse_openpgp_fields(decoded_blob)
130 |     except Exception as ex:
131 |         print(f"Exception triggered in parse_openpgp_fields")
132 |         print(f"Printing base64 blob contents for debugging purposes.")
133 |         print(base64_str)
134 |         raise ex
135 | 
136 |     # Add any comment and version values from the armored ascii to our findings.
137 |     version_match = re.search(r'Version: (.+)\r?\n?', pgp_armored_input)
138 |     comment_match = re.search(r'Comment: (.+)\r?\n?', pgp_armored_input)
139 |     if version_match: openpgp_findings["armored_version"] = version_match.group(1).replace('\r','').replace('\n','')
140 |     if comment_match: openpgp_findings["armored_comment"] = comment_match.group(1).replace('\r','').replace('\n','')
141 | 
142 |     return openpgp_findings
143 | 
144 | 


--------------------------------------------------------------------------------
/src/gitxray/include/gx_ugly_ssh_parser.py:
--------------------------------------------------------------------------------
 1 | # Calling this Ugly because I can, but also because it assumes more than it should. 
 2 | # Made this with trial and error, and love.
 3 | # Oh, and why write our own? Rather reinvent the wheel in this case than rely on more ext dependencies.
 4 | import re, base64, datetime
 5 | 
 6 | def ugly_inhouse_ssh_signature_block(armored_ssh_signature):
 7 | 
 8 |     # Bring down the armor! Expose thy binary signature.
 9 |     base64_str = re.sub(r'-----BEGIN SSH SIGNATURE-----', '', armored_ssh_signature)
10 |     base64_str = re.sub(r'-----END SSH SIGNATURE-----', '', base64_str)
11 |     base64_str = re.sub(r'\s+', '', base64_str)
12 | 
13 |     decoded_blob = base64.b64decode(base64_str.encode('ascii', 'ignore'))
14 |    
15 |     # This appears to be standard
16 |     if decoded_blob[:6] == b"SSHSIG":
17 |         # Yet this offset here is likely too hardcoded
18 |         algorithm_length = int(decoded_blob[17])
19 |         # The length of the algorithm helps us get the entire string.
20 |         algorithm = decoded_blob[18:18+algorithm_length]
21 |         return {"ssh_signature_algorithm":algorithm}
22 | 
23 |     return None
24 | 
25 | def ugly_inhouse_ssh_key(ssh_key):
26 |     # First keep the algorithm
27 |     algorithm_match = re.match(r'^(\S+)', ssh_key)
28 | 
29 |     # Then just split with space and decode the second part
30 |     # Stopped here; but eventually we could parse multiple formats to get key strength at least?
31 |     # decoded_blob = base64.b64decode(ssh_key.split()[1].encode('ascii', 'ignore'))
32 | 
33 |     # fingerprint = sha256(decoded_blog)
34 |     if algorithm_match: 
35 |         return algorithm_match.group(1) 
36 | 
37 |     return None
38 | 


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/src/gitxray/include/html_report/__init__.py


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/template_contributor.html:
--------------------------------------------------------------------------------
1 | <div id="contributor-section-{{contributor_id}}">
2 |     <h3>Contributor: {{contributor_name}}</h3>
3 | 	 <p>This section provides findings on the activity for this specific Contributor within the repository. The data presented here has been gathered through a thorough inspection of the repository's activity, specifically focusing on contributions made by this account. While some of the GitHub APIs used to gather information may return universal or cross-repository data (e.g. such as 90-day activity metrics that cover all of a contributor’s activity across all repositories), most of the findings presented in this report are exclusively tied to their interactions within this particular repository. Put another way; this is focused to this repository; it is NOT a full analysis on the account nor on its interactions with other repositories.</p>
4 | 
5 |     {{contributor_tables}}    
6 | </div>
7 | <p>&nbsp;</p>
8 | 


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/template_highlights.html:
--------------------------------------------------------------------------------
1 | <div id="highlights-section-{{repository_id}}">
2 |     {{highlights_tables}}    
3 | </div>
4 | <p>&nbsp;</p>
5 | 


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/template_non_contributor.html:
--------------------------------------------------------------------------------
1 | <div id="non-contributor-section-{{non_contributor_id}}">
2 |     <h3>{{non_contributor_name}}</h3>
3 |     {{non_contributor_tables}}    
4 | </div>
5 | 


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/template_repository.html:
--------------------------------------------------------------------------------
1 | <div id="repository-section-{{repository_id}}">
2 |     {{repository_tables}}    
3 | </div>
4 | 


--------------------------------------------------------------------------------
/src/gitxray/include/html_report/template_table.html:
--------------------------------------------------------------------------------
 1 |     <div class="table-section py-3">
 2 |         <div class="d-flex justify-content-between align-items-center">
 3 |        	<a id="{{table_id}}"></a>
 4 |         <h3 class="flex-grow-1 text-start">{{table_title}}</h3>
 5 |           	<button class="btn btn-link" type="button" data-bs-toggle="collapse" data-bs-target="#tableCollapse_{{table_id}}" aria-expanded="true" aria-controls="tableCollapse_{{table_id}}">
 6 | 	            <span class="toggle-icon">&#8661;</span>
 7 |           	</button>
 8 |         </div>
 9 | 	<div class="collapse show" id="tableCollapse_{{table_id}}">
10 |         <table class="table table-striped table-bordered" style="width:100%">
11 |             <thead>
12 |                 <tr>
13 |                     <th>Identifier</th>
14 |                     <th>Value</th>
15 |                 </tr>
16 |             </thead>
17 |             <tbody>
18 |                 {{table_rows}}
19 |             </tbody>
20 |         </table>
21 | 	</div>
22 |     </div>
23 | 


--------------------------------------------------------------------------------
/src/gitxray/xrays/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kulkansecurity/gitxray/d7852c0f40a4bbbc4ab6a8718785a40851825975/src/gitxray/xrays/__init__.py


--------------------------------------------------------------------------------
/src/gitxray/xrays/association_xray.py:
--------------------------------------------------------------------------------
 1 | from gitxray.include import gx_definitions
 2 | 
 3 | def run(gx_context, gx_output, gh_api):
 4 |     gx_output.stdout("Checking for potential associations across accounts by cross-referencing all gathered data...")
 5 | 
 6 |     collisions = gx_context.getCollisions()
 7 | 
 8 |     for colType, colValue in collisions:
 9 |         affected_accounts = collisions.get((colType, colValue))
10 |         if not affected_accounts: continue
11 |         msg = None
12 | 
13 |         if colType == "PGP_KEYID" and (colValue not in gx_definitions.GITHUB_WEB_EDITOR_SIGNING_KEYS):
14 |                 msg = f"WARNING: A Personal/Private PGP Key with ID {colValue} found shared by accounts: {affected_accounts}."
15 |         elif colType == "PGP_SCT":
16 |             msg = f"PGP Signature Creation Time ({colValue}) shared by accounts: {affected_accounts}."
17 |         elif colType == "PGP_SUBKEY_CREATED_AT":
18 |             msg = f"The following contributor accounts have PGP Subkeys that were created in the same day: {affected_accounts}."
19 |         elif colType == "SSH_SIGNING_KEY_CREATED_AT":
20 |             msg = f"The following contributor accounts have SSH signing keys that were created in the same day: {affected_accounts}."
21 |         elif colType == "KEY_ARMORED_VERSION":
22 |             msg = f"Exact same Version field extracted from a Key for accounts: {affected_accounts}: {colValue}."
23 |         elif colType == "KEY_ARMORED_COMMENT":
24 |             msg = f"Exact same Comment field extracted from a Key for accounts: {affected_accounts}: {colValue}."
25 |         elif colType == "EMAIL":
26 |             msg = f"Email {colValue} shared by accounts: {affected_accounts}."
27 |         elif colType == "DAYS_SINCE_CREATION":
28 |             msg = f"The following contributor accounts were created in the same day, precisely {colValue} days ago: {affected_accounts}."
29 |         elif colType == "DAYS_SINCE_UPDATED" :
30 |             msg = f"The following contributor accounts were last updated in the same day, precisely {colValue} days ago: {affected_accounts}."
31 | 
32 |         if msg != None: 
33 |             gx_output.r_log(msg, rtype="association")
34 |             for account in affected_accounts: gx_output.c_log(msg, rtype="association", contributor=account)
35 | 
36 |         # Now check some more which add lots of noise, we only want these in r_log, not per contributor.
37 |         msg = None
38 |         if colType == "PGP_HA":
39 |             msg = f"PGP Hash Algorithm ({colValue}) shared by accounts: {affected_accounts}."
40 |         elif colType == "PGP_PKA":
41 |             msg = f"PGP Public Key Algorithm ({colValue}) shared by accounts: {affected_accounts}."
42 |         elif colType == "PGP_KEYID" and colValue in gx_definitions.GITHUB_WEB_EDITOR_SIGNING_KEYS:
43 |             msg = f"GitHub's Web Editor (Key ID: {colValue}) was used by accounts: {affected_accounts}."
44 |         elif colType == "PGP_SIG_TYPE":
45 |             msg = f"PGP Signature Type ({colValue}) shared by accounts: {affected_accounts}."
46 |         elif colType == "SSH_SA":
47 |             msg = f"SSH Signature Algorithm ({colValue}) shared by accounts: {affected_accounts}."
48 | 
49 |         if msg != None: 
50 |             gx_output.r_log(msg, rtype="association")
51 | 
52 |     return True
53 | 


--------------------------------------------------------------------------------
/src/gitxray/xrays/contributors_xray.py:
--------------------------------------------------------------------------------
  1 | from gitxray.include import gh_time, gh_public_events, gx_definitions
  2 | from gitxray.include import gx_ugly_openpgp_parser, gx_ugly_ssh_parser 
  3 | from datetime import datetime, timezone
  4 | from collections import defaultdict
  5 | import sys, re, base64
  6 | 
  7 | def run(gx_context, gx_output, gh_api):
  8 | 
  9 |     repository = gx_context.getRepository()
 10 |     contributor_scope = gx_context.getContributorScope()
 11 | 
 12 |     if contributor_scope != None:
 13 |         gx_output.notify(f"YOU HAVE SCOPED THIS GITXRAY TO CONTRIBUTORS: {contributor_scope} - OTHER USERS WON'T BE ANALYZED.")
 14 | 
 15 |     gx_output.stdout(f"Querying GitHub for repository contributors.. Please wait.", shushable=True, end='', flush=True)
 16 | 
 17 |     # Let's store the whole set of contributors in the context
 18 |     gx_context.setContributors(gh_api.fetch_repository_contributors(repository))
 19 | 
 20 |     c_users = []
 21 |     c_anon = []
 22 | 
 23 |     c_len = len(gx_context.getContributors())
 24 |     gx_output.stdout(f"\rIdentified {c_len} contributors.." + ' '*70, shushable=True, flush=True)
 25 | 
 26 |     # If focused on a contributor, let's first make sure the contributor exists in the repository
 27 |     if contributor_scope != None:
 28 |         if not gx_context.areContributors(contributor_scope): 
 29 |             gx_output.warn(f"One of the collaborators you specified {contributor_scope} were not found as a contributor in the repo.")
 30 |             gx_output.warn(f"If you intend to filter results for a non-contributor, using the filter function instead (eg. -f johnDoe03). Quitting..")
 31 |             return False
 32 | 
 33 |     # Were were invoked to just list contributors and quit?
 34 |     if gx_context.listAndQuit():
 35 |         gx_output.notify(f"LISTING CONTRIBUTORS (INCLUDING THOSE WITHOUT A GITHUB USER ACCOUNT) AND EXITING..", shushable=False)
 36 |         gx_output.stdout(", ".join([c.get('login', c.get('email')) for c in gx_context.getContributors()]), shushable=False)
 37 |         return False
 38 | 
 39 |     if c_len > 500:
 40 |         gx_output.stdout(f"IMPORTANT: The repository has 500+ contributors. GitHub states > 500 contributors will appear as Anonymous")
 41 |         gx_output.stdout(f"More information at: https://docs.github.com/en/rest/repos/repos?apiVersion=2022-11-28#list-repository-contributors")
 42 | 
 43 |     for i, c in enumerate(gx_context.getContributors()):
 44 |         if contributor_scope != None and c.get('login') not in contributor_scope: continue
 45 |         gx_output.stdout('\rFetching repository contributor details [{}/{}]'.format(i+1, c_len), end='', flush=True)
 46 |         ctype = c.get('type')
 47 |         if ctype in ["User", "Bot"]:
 48 |             c_users.append(gh_api.fetch_contributor(c))
 49 |         elif ctype == "Anonymous":
 50 |             c_anon.append(c)
 51 |         else:
 52 |             print(c)
 53 |             raise Exception("Contributor of Type !== User/Anonymous found. Failing almost gracefully")
 54 | 
 55 |     if contributor_scope == None and len(gx_context.getContributors()) != 0:
 56 |         gx_output.stdout(f"\r\nDiscovered {len(c_users)} contributors with GitHub User accounts, and {len(c_anon)} Anonymous contributors", end='', flush=True)
 57 |         gx_output.r_log(f"Repository has {len(c_anon)} Anonymous contributors.", rtype="contributors")
 58 |         gx_output.r_log(f"Repository has {len(c_users)} contributors with GitHub User accounts.", rtype="contributors")
 59 | 
 60 |     gx_output.stdout(f"\r\nPlease wait, beginning to collect keys and commits for User contributors..", end='', flush=True)
 61 | 
 62 |     c_users_index = 1
 63 |     for contributor in c_users:
 64 |         if contributor is None: continue
 65 |         unique_pgp_keyids = []
 66 |         contributor_emails = []
 67 |         contributor_login = contributor.get('login')
 68 |         c_started_at = datetime.now()
 69 |         gx_output.c_log(f"X-Ray on contributor started at {c_started_at}", contributor=contributor_login, rtype="metrics")
 70 | 
 71 |         gx_output.stdout(f"\r[{c_users_index}/{len(c_users)}] Analyzing Profile data for {contributor.get('login')}"+' '*40, end = '', flush=True)
 72 |         gx_output.c_log(f"Contributor URL: {contributor.get('html_url')}", rtype="urls")
 73 |         gx_output.c_log(f"Owned repositories: https://github.com/{contributor_login}?tab=repositories", rtype="urls")
 74 | 
 75 |         if contributor.get('name') != None:
 76 |             gx_output.c_log(f"[Name: {contributor.get('name')}] obtained from the user's profile.", rtype="personal")
 77 | 
 78 |         if contributor.get('twitter_username') != None:
 79 |             gx_output.c_log(f"[X/Twitter account: {contributor.get('twitter_username')}] obtained from the user's profile.", rtype="personal")
 80 |         if contributor.get('bio') != None:
 81 |             bio = contributor.get('bio').replace("\r\n"," | ")
 82 |             gx_output.c_log(f"[Bio: {bio}] obtained from the profile.", rtype="personal")
 83 |         if contributor.get('company') != None:
 84 |             gx_output.c_log(f"[Company: {contributor.get('company')}] obtained from the user's profile.", rtype="personal")
 85 |         if contributor.get('blog') != None and len(contributor.get('blog')) > 0:
 86 |             gx_output.c_log(f"[Blog: {contributor.get('blog')}] obtained from the user's profile.", rtype="personal")
 87 |         if contributor.get('location') != None:
 88 |             gx_output.c_log(f"[Location: {contributor.get('location')}] obtained from the user's profile.", rtype="personal")
 89 |         if contributor.get('hireable') != None:
 90 |             gx_output.c_log(f"[Hireable: The user has set 'Available for Hire'] in their GitHub profile.", rtype="personal")
 91 | 
 92 |         if contributor.get('email') != None:
 93 |             gx_output.c_log(f"[{contributor.get('email')}] obtained from the user's profile.", rtype="emails")
 94 |             gx_context.linkIdentifier("EMAIL", [contributor.get('email')], contributor_login)
 95 | 
 96 |         contributor_created_at_time = gh_time.parse_date(contributor.get('created_at'))
 97 |         days_since_account_creation = (datetime.now(timezone.utc) - contributor_created_at_time).days 
 98 | 
 99 |         # Let's keep track of when the accounts were created.
100 |         gx_context.linkIdentifier("DAYS_SINCE_CREATION", [days_since_account_creation], contributor_login)
101 | 
102 |         message = f"{days_since_account_creation} days old"
103 |         if days_since_account_creation > 365:
104 |             years = "{:.2f}".format(days_since_account_creation / 365)
105 |             message = f"{years} years old"
106 | 
107 |         gx_output.c_log(f"Contributor account created: {contributor.get('created_at')}, is {message}.", rtype="profiling")
108 | 
109 |         if contributor.get('updated_at') != None:
110 |             days_since_updated = (datetime.now(timezone.utc) - gh_time.parse_date(contributor.get('updated_at'))).days 
111 |             gx_output.c_log(f"The account was last updated at {contributor.get('updated_at')}, {days_since_updated} days ago.", rtype="profiling")
112 |             # Let's keep track of when the accounts were last updated.
113 |             gx_context.linkIdentifier("DAYS_SINCE_UPDATED", [days_since_updated], contributor_login)
114 | 
115 |         if contributor.get('site_admin') != False:
116 |             gx_output.c_log(f"The account may be an administrator. It has 'site_admin' set to True", rtype="profiling")
117 | 
118 |         commits = gh_api.fetch_commits(repository, author=contributor.get('login'))
119 |         if commits != None and len(commits) > 0:
120 |             commits_message = f", at {commits[0]['commit']['author']['date']}."
121 |             oldest_commit = commits[-1]['commit']['author']['date']
122 |             if len(commits) > 1:
123 |                 commits_message = f", first one at {oldest_commit} and last one at {commits[0]['commit']['author']['date']}."
124 |             gx_output.c_log(f'Made (to this repo) {len(commits)} commits{commits_message}', rtype="commits")
125 | 
126 |         signed_commits = []
127 |         failed_verifications = []
128 |         signature_attributes = []
129 |         dates_mismatch_commits = []
130 |         commit_times = defaultdict(int)
131 |         gx_output.stdout(f"\r[{c_users_index}/{len(c_users)}] Analyzing {len(commits)} commits and any signing keys for {contributor.get('login')}"+' '*40, end = '', flush=True)
132 |         for commit in commits:
133 |             c = commit["commit"]
134 | 
135 |             v_reason = c["verification"]["reason"]
136 |             if c["verification"]["verified"] == True:
137 |                 try:
138 |                     if "BEGIN SSH SIGNATURE" in c["verification"]["signature"]:
139 |                         signature_attributes.append(gx_ugly_ssh_parser.ugly_inhouse_ssh_signature_block(c["verification"]["signature"]))
140 |                     else:
141 |                         signature_attributes.append(gx_ugly_openpgp_parser.ugly_inhouse_openpgp_block(c["verification"]["signature"]))
142 |                 except Exception as ex:
143 |                     gx_output.c_log(f"Failed at parsing a signature, not strange due to our ugly parsing code. Here's some more data. {c['verification']['signature']} - {ex}", rtype="debug")
144 | 
145 |                 if v_reason != "valid":
146 |                     gx_output.c_log(f"Unexpected condition - verified commit set to True and reason != 'valid'. Reason is: {v_reason} - Report to dev!", rtype="debug")
147 |                 else:
148 |                     signed_commits.append(c)
149 |             elif v_reason != "unsigned":
150 |                 if v_reason == "bad_email": 
151 |                     gx_output.c_log(f"The email in the signature doesn't match the 'committer' email: {commit['html_url']}", rtype="signatures")
152 |                 elif v_reason == "unverified_email": 
153 |                     gx_output.c_log(f"The committer email in the signature was not Verified in the account: {commit['html_url']}", rtype="signatures")
154 |                 elif v_reason == "expired_key": 
155 |                     gx_output.c_log(f"The key that made the signature expired: {commit['html_url']}", rtype="signatures")
156 |                 elif v_reason == "not_signing_key": 
157 |                     gx_output.c_log(f"The PGP key used in the signature did not include the 'signing' flag: {commit['html_url']}", rtype="signatures")
158 |                 elif v_reason == "gpgverify_error" or v_reason == "gpgverify_unavailable": 
159 |                     gx_output.c_log(f"There was an error communicating with the signature verification service: {commit['html_url']}", rtype="signatures")
160 |                 elif v_reason == "unknown_signature_type": 
161 |                     gx_output.c_log(f"A non-PGP signature was found in the commit: {commit['html_url']}", rtype="signatures")
162 |                 elif v_reason == "no_user": 
163 |                     gx_output.c_log(f"The email address in 'committer' does not belong to a User: {commit['html_url']}", rtype="signatures")
164 |                 elif v_reason == "unknown_key": 
165 |                     gx_output.c_log(f"The key used to sign the commit is not in their profile and can't be verified: {commit['html_url']}", rtype="signatures")
166 |                 elif v_reason == "malformed_signature" or v_reason == "invalid": 
167 |                     gx_output.c_log(f"The signature was malformed and a parsing error took place: {commit['html_url']}", rtype="signatures")
168 |                 failed_verifications.append(c)
169 | 
170 |             if c["author"]["email"] not in contributor_emails: 
171 |                 gx_output.c_log(f"[{c['author']['email']}] obtained by parsing commits.", rtype="emails")
172 |                 contributor_emails.append(c["author"]["email"]) 
173 |                 gx_context.linkIdentifier("EMAIL", [c["author"]["email"]], contributor_login)
174 | 
175 |             commit_date = gh_time.parse_date(c['author']['date'])
176 |             if commit_date < contributor_created_at_time:
177 |                 dates_mismatch_commits.append(c)
178 | 
179 |             # Let's group by commit hour, we may have an insight here.
180 |             commit_times[commit_date.hour] += 1
181 | 
182 |         if len(dates_mismatch_commits) > 0:
183 |             gx_output.c_log(f"WARNING: UNRELIABLE DATES (Older than Account) in {len(dates_mismatch_commits)} commits by [{contributor_login}]. Potential tampering, account re-use, or Rebase. List at: {repository.get('html_url')}/commits/?author={contributor_login}&until={contributor.get('created_at')}", rtype="commits")
184 |             gx_output.c_log(f"View commits with unreliable DATES here: {repository.get('html_url')}/commits/?author={contributor_login}&until={contributor.get('created_at')}", rtype="commits")
185 |             gx_context.linkIdentifier("DATE_MISMATCH_COMMITS", [len(dates_mismatch_commits)], contributor_login)
186 | 
187 |         if len(commit_times) > 0:
188 |             # Let's link these commit hours to this contributor, and we'll do extra analysis in the associations X-Ray
189 |             gx_context.linkIdentifier("COMMIT_HOURS", commit_times, contributor_login)
190 | 
191 |             total_commits = len(commits)
192 |             formatted_output = f"Commit Hours for [{total_commits}] commits:"
193 |             sorted_commit_times = sorted(commit_times.items(), key=lambda item: item[1], reverse=True)
194 |             
195 |             for commit_hour, count in sorted_commit_times:
196 |                 percentage = (count / total_commits) * 100
197 |                 range_label = gx_definitions.COMMIT_HOURS[commit_hour]
198 |                 formatted_output += f" [{range_label}: {count} ({percentage:.2f}%)]"
199 | 
200 |             gx_output.c_log(formatted_output, rtype="commits")
201 | 
202 |         # PGP Signature attributes: We have precise Key IDs used in signatures + details on signature creation time and algorithm
203 |         unique_pgp_pka = set(attribute.get('pgp_publicKeyAlgorithm') for attribute in signature_attributes if attribute.get('pgp_pulicKeyAlgorithm') is not None)
204 |         unique_pgp_st = set(attribute.get('pgp_sig_type') for attribute in signature_attributes if attribute.get('pgp_sig_type') is not None)
205 |         unique_pgp_ha = set(attribute.get('pgp_hashAlgorithm') for attribute in signature_attributes if attribute.get('pgp_hashAlgorithm') is not None)
206 |         unique_pgp_sct = set(attribute.get('pgp_signature_creation_time') for attribute in signature_attributes if attribute.get('pgp_signature_creation_time') is not None)
207 |         unique_pgp_keyids = set(attribute.get('pgp_keyid') for attribute in signature_attributes if attribute.get('pgp_keyid') is not None)
208 | 
209 |         # We don't link SSH Key IDs because SSH keys are unique across GitHub; PGP keys can be added to more than 1 account.
210 |         gx_context.linkIdentifier("PGP_KEYID", unique_pgp_keyids, contributor_login)
211 |         gx_context.linkIdentifier("PGP_PKA", unique_pgp_pka, contributor_login)
212 |         gx_context.linkIdentifier("PGP_HA", unique_pgp_ha, contributor_login)
213 |         gx_context.linkIdentifier("PGP_SCT", unique_pgp_sct, contributor_login)
214 | 
215 |         # SSH Signature attributes: We don't have a Key ID so far, but we do have the signature algorithms - hey, it's something! right? right??
216 |         unique_ssh_sa = set(attribute.get('ssh_signature_algorithm') for attribute in signature_attributes if attribute.get('ssh_signature_algorithm') is not None)
217 |         if len(unique_ssh_sa) > 0: gx_output.c_log(f"SSH signatures used Algorithms: [{unique_ssh_sa}] obtained from parsing signature blobs", rtype="keys")
218 |         gx_context.linkIdentifier("SSH_SA", unique_ssh_sa, contributor_login)
219 | 
220 |         # Let's add signature attributes output.
221 |         if len(unique_pgp_pka) > 0: gx_output.c_log(f"PGP signatures used publicKeyAlgorithms: [{unique_pgp_pka}] obtained from parsing signature blobs", rtype="keys")
222 |         # Based on our testing, Signature Type appears to be always 0 in GitHub: Signature of a binary document - Let's only log if it differs.
223 |         if len(unique_pgp_st) > 0:
224 |             for sigtype in unique_pgp_st:
225 |                 if sigtype != "Signature of a binary document": 
226 |                     gx_output.c_log(f"PGP signatures used an atypical signature Type: [{sigtype}] obtained from parsing signature blobs", rtype="keys")
227 |                     # Let's also link the atypical sigtype to the user just in case we spot more accounts using it.
228 |                     gx_context.linkIdentifier("PGP_SIG_TYPE", [sigtype], contributor_login)
229 |         if len(unique_pgp_ha) > 0: gx_output.c_log(f"PGP signatures used hash Algorithms: [{unique_pgp_ha}] obtained from parsing signature blobs", rtype="keys")
230 | 
231 | 
232 |         # https://docs.github.com/en/rest/users/gpg-keys?apiVersion=2022-11-28#list-gpg-keys-for-a-user
233 |         # GitHub calls them GPG keys, but we're going to refer to them as PGP for the OpenPGP standard
234 |         pgp_keys = gh_api.fetch_gpg_keys(contributor_login)
235 |         if pgp_keys != None and len(pgp_keys) > 0:
236 |             primary_key_ids = [key.get('key_id') for key in pgp_keys]
237 |             gx_output.c_log(f"{len(pgp_keys)} Primary PGP Keys in this contributor's profile: {str(primary_key_ids)}", rtype="keys")
238 |             gx_output.c_log(f"PGP Keys: https://api.github.com/users/{contributor_login}/gpg_keys", rtype="keys")
239 | 
240 |         for primary_key in pgp_keys:
241 |             # Let's parse and drain info from raw_key fields in primary keys
242 |             if primary_key.get('raw_key') != None:
243 |                 key_attributes = gx_ugly_openpgp_parser.ugly_inhouse_openpgp_block(primary_key.get('raw_key'))
244 |                 if key_attributes.get('malformed_beginning') != None:
245 |                     malformed_beginning = key_attributes.get('malformed_beginning').replace('\r\n',' | ')
246 |                     gx_output.c_log(f"Bogus data found at the beginning of a PGP Key containing: {malformed_beginning}", rtype="user_input")
247 |                 if key_attributes.get('malformed_ending') != None:
248 |                     malformed_ending = key_attributes.get('malformed_ending').replace('\r\n',' | ')
249 |                     gx_output.c_log(f"Bogus data found at the end of a PGP Key containing: {malformed_ending}", rtype="user_input")
250 |                 if key_attributes.get('userId') != None:
251 |                     gx_output.c_log(f"[{key_attributes.get('userId')}] obtained from parsing PGP Key ID {primary_key.get('key_id')}", rtype="personal")
252 |                 if key_attributes.get('armored_version') != None:
253 |                     armored_version = key_attributes.get('armored_version').replace('\r\n',' | ')
254 |                     gx_output.c_log(f"[Version: {armored_version}] obtained from parsing PGP Key ID {primary_key.get('key_id')}", rtype="keys")
255 |                     gx_context.linkIdentifier("KEY_ARMORED_VERSION", [armored_version], contributor_login)
256 |                 if key_attributes.get('armored_comment') != None:
257 |                     armored_comment = key_attributes.get('armored_comment').replace('\r\n',' | ')
258 |                     gx_output.c_log(f"[Comment: {armored_comment}] obtained from parsing PGP Key ID {primary_key.get('key_id')}", rtype="keys")
259 |                     gx_context.linkIdentifier("KEY_ARMORED_COMMENT", [armored_comment], contributor_login)
260 | 
261 |             # Let's add to the colab+key relationship all primary and subkeys from the user profile
262 |             primary_key_id = primary_key.get('key_id')
263 | 
264 |             # Link this Primary Key ID to the contributor
265 |             if primary_key_id: gx_context.linkIdentifier("PGP_KEYID", [primary_key_id], contributor_login)
266 | 
267 |             if primary_key.get('name') != None:
268 |                 gx_output.c_log(f"Primary key name typed by user for key {primary_key_id}: [{primary_key.get('name')}]", rtype="user_input")
269 | 
270 |             for email in primary_key.get('emails'):
271 |                 if email not in contributor_emails: 
272 |                     message = "(shows as Verified)" if email.get('verified') == True else "(shows as Not Verified)"
273 |                     gx_output.c_log(f"[{email.get('email')}] {message} obtained from primary Key with ID {primary_key_id}", rtype="emails")
274 |                     contributor_emails.append(email)
275 |                     # There's a Verified: False or True field, we link it disregarding if its verified.
276 |                     gx_context.linkIdentifier("EMAIL", [email['email']], contributor_login)
277 |  
278 |             for sub_key in primary_key["subkeys"]:
279 |                 sub_key_id = sub_key.get('key_id')
280 |                 if sub_key_id: gx_context.linkIdentifier("PGP_KEYID", [sub_key_id], contributor_login)
281 | 
282 |                 if sub_key.get('name') != None:
283 |                     gx_output.c_log(f"Subkey name typed by user for key {sub_key_id}: {sub_key.get('name')}", rtype="user_input")
284 | 
285 |                 for email in sub_key.get('emails'):
286 |                     if email not in contributor_emails: 
287 |                         gx_output.c_log(f"[{email}] obtained from subKey with ID {sub_key_id}", rtype="emails")
288 |                         contributor_emails.append(email)
289 |                         gx_context.linkIdentifier("EMAIL", [email], contributor_login)
290 |  
291 |                 if sub_key.get('expires_at') != None:
292 |                     kexpiration = gh_time.parse_date(sub_key.get('expires_at'))
293 |                     if kexpiration < datetime.now(timezone.utc):
294 |                         message = '(EXPIRED)'
295 |                     else:
296 |                         message = f'(EXPIRES in {(kexpiration-datetime.now(timezone.utc)).days} days)'
297 |                 else:
298 |                     message = '(DOES NOT EXPIRE)'
299 | 
300 |                 gx_output.c_log(f"PGP Subkey {sub_key.get('key_id')} in profile. Created at: {sub_key.get('created_at')} - Expires: {sub_key.get('expires_at')} {message}", rtype="keys")
301 |                 days_since_creation = (datetime.now(timezone.utc) - gh_time.parse_date(sub_key.get('created_at'))).days 
302 |                 gx_context.linkIdentifier("PGP_SUBKEY_CREATED_AT", [days_since_creation], contributor_login)
303 | 
304 |             gx_output.c_log(f'Primary Key details: {primary_key}', rtype="debug")
305 | 
306 | 
307 |         # SSH Signing keys 
308 |         # https://docs.github.com/en/rest/users/ssh-signing-keys?apiVersion=2022-11-28#list-ssh-signing-keys-for-a-user
309 |         ssh_signing_keys = gh_api.fetch_ssh_signing_keys(contributor_login)
310 |         if ssh_signing_keys != None and len(ssh_signing_keys) > 0:
311 |             gx_output.c_log(f"{len(ssh_signing_keys)} SSH Keys used for Signatures in this contributor's profile", rtype="keys")
312 |             gx_output.c_log(f"SSH Signing Keys: https://api.github.com/users/{contributor_login}/ssh_signing_keys", rtype="keys")
313 | 
314 |         for ssh_signing_key in ssh_signing_keys:
315 |             algorithm = gx_ugly_ssh_parser.ugly_inhouse_ssh_key(ssh_signing_key.get('key'))
316 |             gx_output.c_log(f"SSH Signing Key title typed by user for Key ID [{ssh_signing_key.get('id')}]: [{ssh_signing_key.get('title')}]", rtype="user_input")
317 |             algorithm = f"of type [{algorithm}] " if algorithm != None else ""
318 |             gx_output.c_log(f"SSH Signing Key ID [{ssh_signing_key.get('id')}] {algorithm}in profile, created at {ssh_signing_key.get('created_at')}.", rtype="keys")
319 |             days_since_creation = (datetime.now(timezone.utc) - gh_time.parse_date(ssh_signing_key.get('created_at'))).days 
320 |             gx_context.linkIdentifier("SSH_SIGNING_KEY_CREATED_AT", [days_since_creation], contributor_login)
321 | 
322 |         # SSH Authentication keys
323 |         ssh_auth_keys = gh_api.fetch_ssh_auth_keys(contributor_login)
324 |         if len(ssh_auth_keys) > 0:
325 |             gx_output.c_log(f"{len(ssh_auth_keys)} SSH Authentication Keys in this contributor's profile", rtype="keys")
326 |             gx_output.c_log(f"SSH Authentication Keys: https://api.github.com/users/{contributor_login}/keys", rtype="keys")
327 | 
328 |         # We don't keep track of duplicate/cloned keys for authentication SSH keys because GitHub won't allow them
329 |         # https://docs.github.com/en/authentication/troubleshooting-ssh/error-key-already-in-use
330 |         for ssh_auth_key in ssh_auth_keys:
331 |             algorithm = gx_ugly_ssh_parser.ugly_inhouse_ssh_key(ssh_auth_key.get('key'))
332 |             algorithm = f"of type [{algorithm}] " if algorithm != None else ""
333 |             gx_output.c_log(f"SSH Authentication Key ID [{ssh_auth_key.get('id')}] {algorithm}in profile.", rtype="keys")
334 | 
335 |         gx_output.c_log(f"All commits (for this Repo): {repository.get('html_url')}/commits/?author={contributor_login}", rtype="commits")
336 |         # Unique key ids for now only holds keys we've extracted from commit signatures
337 |         if len(unique_pgp_keyids) > 0:
338 |             # https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query
339 |             # Unfortunately GitHub requires (for other users than our own) to provide (non-regex) input keywords in order to
340 |             # return results in the commits API which accept filtering such as is:signed - and input keywords restrict our results.
341 |             gx_output.c_log(f"{len(unique_pgp_keyids)} Keys ({unique_pgp_keyids}) were used by this contributor when signing commits.", rtype="keys")
342 |             github_keys_used = [keyid for keyid in unique_pgp_keyids if keyid in gx_definitions.GITHUB_WEB_EDITOR_SIGNING_KEYS]
343 |             if len(github_keys_used) > 0:
344 |                 gx_output.c_log(f"{len(github_keys_used)} of the keys used to sign commits belong to GitHub's Web editor {github_keys_used}", rtype="keys")
345 | 
346 |         if len(commits) == len(signed_commits):
347 |             gx_output.c_log(f"Contributor has signed all of their {len(signed_commits)} total commits (to this repo).", rtype="signatures")
348 | 
349 |         if len(failed_verifications) > 0:
350 |             gx_output.c_log(f"Contributor has failed signature verifications in {len(failed_verifications)} of their total {len(signed_commits)} signed commits.", rtype="signatures")
351 | 
352 |         if len(signed_commits) == 0 and len(failed_verifications) == 0:
353 |             gx_output.c_log(f"Contributor has not signed any of their {len(commits)} commits (in this repo).", rtype="signatures")
354 | 
355 |         if len(signed_commits) == 0 and len(failed_verifications) > 0:
356 |             gx_output.c_log(f"Contributor has {len(failed_verifications)} failed attempts at signing commits and 0 succesful commits signed out of their {len(commits)} total commits.", rtype="signatures")
357 | 
358 |         if len(signed_commits) > 0 and len(signed_commits) < len(commits):
359 |             gx_output.c_log(f"Contributor has a mix of {len(signed_commits)} signed vs. {len(commits)-len(signed_commits)} unsigned commits (in this repo).", rtype="signatures")
360 | 
361 |         for scommit in signed_commits: 
362 |             if scommit['verification']['reason'] != 'valid': print(scommit) # This shouldn't happen
363 | 
364 |         public_repos = int(contributor.get('public_repos'))
365 |         if public_repos > 0:
366 |             gx_output.c_log(f"Contributor has {public_repos} total public repos.", rtype="profiling")
367 | 
368 |         gx_output.c_log(f"Contributor has {contributor.get('followers')} followers.", rtype="profiling")
369 | 
370 | 
371 |         matching_anonymous = [user for user in c_anon if user['email'] in contributor_emails]
372 |         if len(matching_anonymous) > 0:
373 |             gx_output.c_log(f"One of {contributor_login} emails matched the following anonymous users: {matching_anonymous}", rtype="profiling")
374 | 
375 | 
376 |         gx_output.stdout(f"\r[{c_users_index}/{len(c_users)}] Collecting recent (90d) public events for {contributor.get('login')}"+' '*40, end = '', flush=True)
377 | 
378 |         # Get Public Events generated by this account, if any. GitHub offers up to 90 days of data, which might still be useful.
379 |         public_events = gh_api.fetch_contributor_events(contributor)
380 |         if len(public_events) > 0:
381 |             gh_public_events.log_events(public_events, gx_output, for_repository=False)
382 | 
383 |         c_users_index += 1 
384 |         c_ended_at = datetime.now()
385 |         gx_output.c_log(f"X-Ray on contributor ended at {c_ended_at} - {(c_ended_at-c_started_at).seconds} seconds elapsed", rtype="metrics")
386 | 
387 |     # Let's first create a dictionary merging by email - this is because duplicate anonymous are "normal" or regularly seen
388 |     # GitHub checks if any of (email OR name) differ and if so treats the anonymous user as different
389 |     # Add all of these under Anonymous contributor output
390 |     unique_anonymous = {}
391 |     for ac in c_anon:
392 |         email = ac.get('email','ERROR_PULLING_ANON_EMAIL')
393 |         if email not in unique_anonymous:
394 |             unique_anonymous[email] = []
395 |         unique_anonymous[email].append(ac.get('name','ERROR_PULLING_ANON_NAME'))
396 | 
397 |     commits_url = f"Find commits with: https://api.github.com/search/commits?q=repo:{repository.get('full_name')}+author-email:PLACE_EMAIL_HERE"
398 |     gx_output.a_log(commits_url, anonymous="#", rtype="urls")
399 |     for k,v in unique_anonymous.items():
400 |         gx_output.a_log(f'{k} - {v}', anonymous="#", rtype="anonymous")
401 | 
402 |     gx_output.stdout('\rContributors have been analyzed..'+' '*60, flush=True)
403 | 
404 |     return True
405 | 


--------------------------------------------------------------------------------
/src/gitxray/xrays/repository_xray.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime, timezone
  2 | from collections import defaultdict
  3 | import math
  4 | from gitxray.include import gh_time, gh_public_events, gh_reactions
  5 | 
  6 | def run(gx_context, gx_output, gh_api):
  7 |     gx_output.stdout("Running verifications on the repository..")
  8 | 
  9 |     repository = gx_context.getRepository()
 10 |     contributors = gx_context.getContributors()
 11 | 
 12 |     gx_output.stdout(f"Checking for similar repository names in GitHub.."+" "*40, end="")
 13 |     # This gets all repository names matching our repository name, sorted first by highest rating
 14 |     similar_names = gh_api.search_repositories_by_name(repository.get('name'), limit=10)
 15 |     if similar_names != None and similar_names.get('total_count') != None and similar_names.get('total_count') > 0:
 16 |         most_rated = similar_names.get('items')[0]
 17 |         search_url = f"https://github.com/search?q={repository.get('name')}%20in:name&type=repositories&s=stars&o=desc"
 18 |         if most_rated.get('full_name') == repository.get('full_name'):
 19 |             reponame_msg = f"This is the highest rating repository with name [{repository.get('name')}]"
 20 |         else:
 21 |             reponame_msg = f"WARNING: This is NOT the highest rating repository with name [{repository.get('name')}]"
 22 | 
 23 |         if similar_names.get('total_count') > 0:
 24 |             gx_output.r_log(f'{reponame_msg}. {similar_names.get("total_count")} repositories with a similar name were discovered - See them here: {search_url}', 'profiling')
 25 |         else:
 26 |             gx_output.r_log(f'{reponame_msg}', 'profiling')
 27 | 
 28 |     if repository.get('stargazers_count') is not None:
 29 |         stargazers_message = f"Stars count: [{repository.get('stargazers_count')}]"
 30 |         if repository.get('stargazers_count', 0) > 0:
 31 |             stargazers_message += f" List at: {repository.get('stargazers_url')}"
 32 |         gx_output.r_log(stargazers_message, rtype="profiling")
 33 | 
 34 |     if repository.get('owner'):
 35 |         gx_output.r_log(f"Repository owner account is [{repository.get('owner').get('login')}]: {repository.get('owner').get('html_url')}", rtype="profiling")
 36 | 
 37 |     if repository.get('html_url'):
 38 |         gx_output.r_log(f"Repository Url: [{repository.get('html_url')}]", rtype="urls")
 39 | 
 40 |     if repository.get('homepage'):
 41 |         gx_output.r_log(f"Homepage: [{repository.get('homepage')}]", rtype="urls")
 42 | 
 43 |     # These go in the repository xray and not contributors because the REST API returns all per repository
 44 |     # https://api.github.com/repos/infobyte/faraday/issues/comments - and won't allow filtering in a helpful (to us) way
 45 |     gx_output.stdout(f"\rGetting all repository comments on commits.."+" "*40, end="")
 46 |     commit_comments = gh_api.fetch_repository_commit_comments(repository)
 47 |     if isinstance(commit_comments, list) and len(commit_comments) > 0: 
 48 |         total_comments = defaultdict(int)
 49 |         positive_reactions = defaultdict(int)
 50 |         negative_reactions = defaultdict(int)
 51 |         neutral_reactions = defaultdict(int)
 52 |         for cc in commit_comments:
 53 |             gh_reactions.categorize_reactions(cc, positive_reactions, negative_reactions, neutral_reactions)
 54 |             login = cc.get('user').get('login')
 55 |             message = (f"User {login} added a comment to a Commit on [{cc.get('created_at')}]" + (f", and then updated it on [{cc.get('updated_at')}]" if cc.get('updated_at') != cc.get('created_at') else "") + f": {cc.get('html_url')}")
 56 |             total_comments[login] += 1
 57 |             gx_output.c_log(message, rtype="comments", contributor=login)
 58 | 
 59 |             created_at_ts = gh_time.parse_date(cc.get('created_at'))
 60 |             updated_at_ts = gh_time.parse_date(cc.get('updated_at'))
 61 |             # Comments updated after a day of being posted are of interest; they are not typo edits made right away.
 62 |             if (updated_at_ts-created_at_ts).days > 0:
 63 |                 gx_output.c_log(f"A comment by [{login}] on a Commit was updated {(updated_at_ts-created_at_ts).days} days after being created: {cc.get('html_url')}", rtype="comments", contributor=login)
 64 | 
 65 |         for login,ccount in total_comments.items():        
 66 |             if not gx_context.isContributor(login):
 67 |                 login_tmp = f"{login} [NOT a contributor]"
 68 |             else:
 69 |                 login_tmp = login
 70 |             gx_output.c_log(f"User {login_tmp} added {ccount} Comments to Commits.", rtype="comments", contributor=login)
 71 |             #gx_output.c_log(f"{ccount} Comments added to Commits by [{login}] available at: {repository.get('url')}/comments", rtype="comments")
 72 | 
 73 |         # Not adding much value 
 74 |         gx_output.r_log(f"{len(commit_comments)} Comments in commits available at: {repository.get('url')}/comments", rtype="comments")
 75 | 
 76 |         # Comment with the most reactions
 77 |         if len(positive_reactions) > 0:
 78 |             url, count = gh_reactions.sort_reactions(positive_reactions)[0]
 79 |             if count > 0: gx_output.r_log(f"The Commits comment with the most POSITIVE reactions ({count}) is available at: {url}", rtype="comments")
 80 |         if len(negative_reactions) > 0:
 81 |             url, count = gh_reactions.sort_reactions(negative_reactions)[0]
 82 |             if count > 0: gx_output.r_log(f"The Commits comment with the most NEGATIVE reactions ({count}) is available at: {url}", rtype="comments")
 83 |         if len(neutral_reactions) > 0:
 84 |             url, count = gh_reactions.sort_reactions(neutral_reactions)[0]
 85 |             if count > 0: gx_output.r_log(f"The Commits comment with the most NEUTRAL reactions ({count}) is available at: {url}", rtype="comments")
 86 | 
 87 |     
 88 |     gx_output.stdout(f"\rGetting all repository comments on issues.."+" "*30, end="")
 89 |     issues_comments = gh_api.fetch_repository_issues_comments(repository)
 90 |     if isinstance(issues_comments, list) and len(issues_comments) > 0: 
 91 |         total_comments = defaultdict(int)
 92 |         positive_reactions = defaultdict(int)
 93 |         negative_reactions = defaultdict(int)
 94 |         neutral_reactions = defaultdict(int)
 95 |         for cc in issues_comments:
 96 |             gh_reactions.categorize_reactions(cc, positive_reactions, negative_reactions, neutral_reactions)
 97 |             login = cc.get('user').get('login')
 98 |             message = (f"User {login} added a comment to an Issue on [{cc.get('created_at')}]" + (f", and then updated it on [{cc.get('updated_at')}]" if cc.get('updated_at') != cc.get('created_at') else "") + f": {cc.get('html_url')}")
 99 |             total_comments[login] += 1
100 |             gx_output.c_log(message, rtype="comments", contributor=login)
101 | 
102 |             created_at_ts = gh_time.parse_date(cc.get('created_at'))
103 |             updated_at_ts = gh_time.parse_date(cc.get('updated_at'))
104 |             # Comments updated after a day of being posted are of interest; they are not typo edits made right away.
105 |             if (updated_at_ts-created_at_ts).days > 0:
106 |                 gx_output.c_log(f"A comment by [{login}] on an Issue was updated {(updated_at_ts-created_at_ts).days} days after being created: {cc.get('html_url')}", rtype="comments", contributor=login)
107 | 
108 |         for login,ccount in total_comments.items():
109 |             if not gx_context.isContributor(login):
110 |                 login_tmp = f"{login} [NOT a contributor]"
111 |             else:
112 |                 login_tmp = login
113 |             gx_output.c_log(f"User {login_tmp} added {ccount} Comments to Issues.", rtype="comments", contributor=login)
114 |             #gx_output.c_log(f"{ccount} Comments added to Issues by [{login}] available at: {repository.get('url')}/issues/comments", rtype="comments")
115 | 
116 |         gx_output.r_log(f"{len(issues_comments)} Comments in issues available at: {repository.get('url')}/issues/comments", rtype="comments")
117 | 
118 |         # Comment with the most reactions
119 |         if len(positive_reactions) > 0:
120 |             url, count = gh_reactions.sort_reactions(positive_reactions)[0]
121 |             if count > 0: gx_output.r_log(f"The Issues comment with the most POSITIVE reactions ({count}) is available at: {url}", rtype="comments")
122 |         if len(negative_reactions) > 0:
123 |             url, count = gh_reactions.sort_reactions(negative_reactions)[0]
124 |             if count > 0: gx_output.r_log(f"The Issues comment with the most NEGATIVE reactions ({count}) is available at: {url}", rtype="comments")
125 |         if len(neutral_reactions) > 0:
126 |             url, count = gh_reactions.sort_reactions(neutral_reactions)[0]
127 |             if count > 0: gx_output.r_log(f"The Issues comment with the most NEUTRAL reactions ({count}) is available at: {url}", rtype="comments")
128 | 
129 |     gx_output.stdout(f"\rGetting all repository comments on pull requests.."+" "*30, end="")
130 |     pulls_comments = gh_api.fetch_repository_pulls_comments(repository)
131 |     if isinstance(pulls_comments, list) and len(pulls_comments) > 0: 
132 |         total_comments = defaultdict(int)
133 |         positive_reactions = defaultdict(int)
134 |         negative_reactions = defaultdict(int)
135 |         neutral_reactions = defaultdict(int)
136 |         for cc in pulls_comments:
137 |             try:
138 |                 gh_reactions.categorize_reactions(cc, positive_reactions, negative_reactions, neutral_reactions)
139 |                 login = cc.get('user', {}).get('login', None)
140 |             except:
141 |                 continue
142 |             message = (f"User {login} added a comment to a PR on [{cc.get('created_at')}]" + (f", and then updated it on [{cc.get('updated_at')}]" if cc.get('updated_at') != cc.get('created_at') else "") + f": {cc.get('html_url')}")
143 |             total_comments[login] += 1
144 |             gx_output.c_log(message, rtype="comments", contributor=login)
145 | 
146 |             created_at_ts = gh_time.parse_date(cc.get('created_at'))
147 |             updated_at_ts = gh_time.parse_date(cc.get('updated_at'))
148 |             # Comments updated after a day of being posted are of interest; they are not typo edits made right away.
149 |             if (updated_at_ts-created_at_ts).days > 0:
150 |                 gx_output.c_log(f"A comment by [{login}] on a PR was updated {(updated_at_ts-created_at_ts).days} days after being created: {cc.get('html_url')}", rtype="comments", contributor=login)
151 | 
152 |         for login,ccount in total_comments.items():        
153 |             if not gx_context.isContributor(login):
154 |                 login_tmp = f"{login} [NOT a contributor]"
155 |             else:
156 |                 login_tmp = login
157 |             gx_output.c_log(f"User {login_tmp} added {ccount} Comments to PRs.", rtype="comments", contributor=login)
158 |             #gx_output.c_log(f"{ccount} Comments added to PRs by [{login}] available at: {repository.get('url')}/pulls/comments", rtype="comments")
159 | 
160 |         gx_output.r_log(f"{len(pulls_comments)} Comments in pulls available at: {repository.get('url')}/pulls/comments", rtype="comments")
161 | 
162 |         # Comment with the most reactions
163 |         if len(positive_reactions) > 0:
164 |             url, count = gh_reactions.sort_reactions(positive_reactions)[0]
165 |             if count > 0: gx_output.r_log(f"The PRs comment with the most POSITIVE reactions ({count}) is available at: {url}", rtype="comments")
166 |         if len(negative_reactions) > 0:
167 |             url, count = gh_reactions.sort_reactions(negative_reactions)[0]
168 |             if count > 0: gx_output.r_log(f"The PRs comment with the most NEGATIVE reactions ({count}) is available at: {url}", rtype="comments")
169 |         if len(neutral_reactions) > 0:
170 |             url, count = gh_reactions.sort_reactions(neutral_reactions)[0]
171 |             if count > 0: gx_output.r_log(f"The PRs comment with the most NEUTRAL reactions ({count}) is available at: {url}", rtype="comments")
172 | 
173 | 
174 |     gx_output.stdout(f"\rChecking for repository deployments.."+" "*30, end="")
175 |     if repository.get('deployments_url'):
176 |          deployments = gh_api.fetch_repository_deployments(repository)
177 |          if len(deployments) > 0: gx_output.r_log(f"{len(deployments)} Deployments available at: [{repository.get('html_url')}/deployments]", rtype="deployments")
178 | 
179 |     gx_output.stdout(f"\rChecking for repository environments.."+" "*30, end="")
180 |     environments = gh_api.fetch_repository_environments(repository)
181 |     if environments != None and environments.get('total_count') != None and environments.get('total_count') > 0:
182 |         gx_output.r_log(f"{environments.get('total_count')} Environments available at: [{repository.get('url')}/environments]", rtype="environments")
183 |         for environment in environments.get('environments'):
184 |             gx_output.r_log(f"Environment [{environment.get('name')}] created [{environment.get('created_at')}], updated [{environment.get('updated_at')}]: {environment.get('html_url')}", rtype="environments")
185 |             #print(gh_api.fetch_environment_protection_rules(repository, environment.get('name')))
186 | 
187 |     gx_output.stdout(f"\rChecking for repository forks.."+" "*30, end="")
188 |     if repository.get('forks_count', 0) > 0:
189 |         gx_output.r_log(f"Repository has {repository.get('forks_count')} forks: {repository.get('forks_url')}", rtype="profiling")
190 | 
191 |     gx_output.stdout(f"\rInspecting repository branches.."+" "*40, end="")
192 |     branches = gh_api.fetch_repository_branches(repository)
193 |     if isinstance(branches, list) and len(branches) > 0: 
194 |         gx_output.r_log(f"{len(branches)} Branches available at: [{repository.get('html_url')}/branches]", rtype="branches")
195 |         unprotected_branches = []
196 |         protected_branches = []
197 |         for branch in branches:
198 |             if branch.get('protected') == False:
199 |                 unprotected_branches.append(branch.get('name'))
200 |             else:
201 |                 protected_branches.append(branch.get('name'))
202 | 
203 |         if len(unprotected_branches) > 0: gx_output.r_log(f"{len(unprotected_branches)} Unprotected Branches: {unprotected_branches}", rtype="branches")
204 |         if len(protected_branches) > 0: gx_output.r_log(f"{len(protected_branches)} Protected Branches: {protected_branches}", rtype="branches")
205 | 
206 |     gx_output.stdout(f"\rInspecting repository labels.."+" "*40, end="")
207 |     labels = gh_api.fetch_repository_labels(repository)
208 |     if isinstance(labels, list) and len(labels) > 0: 
209 |         gx_output.r_log(f"{len(labels)} Labels available at: [{repository.get('html_url')}/labels]", rtype="labels")
210 |         non_default_labels = [label.get('name') for label in labels if label.get('default') == False]
211 |         if len(non_default_labels) > 0:
212 |             gx_output.r_log(f"{len(non_default_labels)} Non-default Labels: {non_default_labels} available at: [{repository.get('html_url')}/labels]", rtype="labels")
213 | 
214 |     gx_output.stdout(f"\rInspecting repository tags.."+" "*40, end="")
215 |     tags = gh_api.fetch_repository_tags(repository)
216 |     if isinstance(tags, list) and len(tags) > 0: 
217 |         gx_output.r_log(f"{len(tags)} Tags available at: [{repository.get('html_url')}/tags]", rtype="tags")
218 |     else:
219 |         tags = []
220 |     tag_taggers = defaultdict(int)
221 | 
222 |     """ A bit shameful here because we can't really get too much data out of tags because of the way the GH API is implemented.
223 |     It only returns stripped tags when getting all tags, we can't even get who the tagger was. """
224 |     for tag in tags:
225 |         tagger = tag.get('tagger')
226 |         if tagger == None:
227 |             # Lightweight tags - for some reason GitHub's API is returning stripped down version of tags even if they are not lightweight
228 |             gx_output.r_log(f"Tag [{tag.get('name')}] is available at: [{repository.get('html_url')}/tags]", rtype="tags")
229 |         else: 
230 |             tagger = tagger.get('email')
231 |             tag_taggers[tagger] += 1
232 |             gx_output.r_log(f"A tag was created by {tagger} at {tag.get('tagger').get('date')}: {tag.get('url')}", rtype="tags")
233 | 
234 |     total_tags = sum(tag_taggers.values())
235 |     for tagger, tags in tag_taggers.items():
236 |         percentage_tags = (tags / total_tags) * 100
237 |         message = f"{tagger} created historically {tags} tags [{percentage_tags:.2f}%]"
238 |         gx_output.r_log(message, rtype="tags")
239 | 
240 | 
241 |     gx_output.stdout(f"\rInspecting repository releases.."+" "*40, end="")
242 |     releases = gh_api.fetch_repository_releases(repository)
243 |     if isinstance(releases, list) and len(releases) > 0: 
244 |         gx_output.r_log(f"{len(releases)} Releases available at: [{repository.get('html_url')}/releases]", rtype="releases")
245 | 
246 |     release_authors = defaultdict(int)
247 |     asset_uploaders = defaultdict(int)
248 | 
249 |     for release in releases:
250 |         # Particular case in which the GitHub API returns erratic data, mostly happening when not using an auth token
251 |         # This needs to be fixed at a lower level in our gh_api code, patching here in the meantime.
252 |         if not isinstance(release, dict):
253 |             continue
254 | 
255 |         if release.get('author') == None:
256 |             author = "NO_USERNAME"
257 |         else:
258 |             author = release.get('author').get('login')
259 | 
260 |         release_authors[author] += 1
261 |         gx_output.r_log(f"A release was created by {author} at {release.get('created_at')}: {release.get('html_url')}", rtype="releases")
262 |         if len(release.get('assets')) > 0:
263 |             # This release has assets other than frozen code. Let's check if updated_at differs from created_at
264 |             # Which may be an indicator of a compromised release by a malicious actor updating binaries.
265 |             for asset in release.get('assets'):
266 |                 if asset.get('uploader') == None:
267 |                     uploaded_by = "NO_USERNAME"
268 |                 else:
269 |                     uploaded_by = asset.get('uploader').get('login')
270 |                 asset_uploaders[uploaded_by] += 1
271 |                 created_at = asset.get('created_at')
272 |                 message = f"An asset was uploaded by {uploaded_by} at {created_at}: {asset.get('url')}"
273 |                 gx_output.r_log(message, rtype="releases")
274 |                 gx_output.c_log(message, rtype="releases", contributor=uploaded_by)
275 |                 created_at_ts = gh_time.parse_date(created_at)
276 |                 updated_at = asset.get('updated_at')
277 |                 updated_at_ts = gh_time.parse_date(updated_at)
278 |                 if (updated_at_ts-created_at_ts).days > 0:
279 |                     gx_output.r_log(f"WARNING: An asset in Release [{release.get('name')}] by [{uploaded_by}] was updated {(updated_at_ts-created_at_ts).days} days after its release: {asset.get('url')}", rtype="releases")
280 | 
281 |     total_releases = sum(release_authors.values())
282 |     total_assets = sum(asset_uploaders.values())
283 |     asset_uploaders_set = set(asset_uploaders.keys())
284 |     for author, releases in release_authors.items():
285 |         percentage_releases = (releases / total_releases) * 100
286 |         if gx_context.isContributor(author):
287 |             message = f"User {author} created historically {releases} releases [{percentage_releases:.2f}%]"
288 |         else:
289 |             message = f"WARNING: {author} is NOT a contributor to this repository and yet created historically {releases} releases [{percentage_releases:.2f}%]"
290 | 
291 |         # Check if the author has also uploaded assets
292 |         if author in asset_uploaders:
293 |             assets = asset_uploaders[author]
294 |             percentage_assets = (assets / total_assets) * 100
295 |             message += f" and uploaded a total of {assets} assets [{percentage_assets:.2f}%]"
296 |             asset_uploaders_set.remove(author)  # Remove from set as it's been handled
297 |         else:
298 |             message += " and never uploaded assets."
299 | 
300 |         gx_output.r_log(message, rtype="releases")
301 |         gx_output.c_log(message, rtype="releases", contributor=author)
302 | 
303 |     # Handle asset uploaders who did not create any releases
304 |     for uploader in asset_uploaders_set:
305 |         assets = asset_uploaders[uploader]
306 |         percentage_assets = (assets / total_assets) * 100
307 |         message = f"WARNING: User {uploader} has uploaded {assets} assets [{percentage_assets:.2f}%] and never created a release."
308 |         gx_output.r_log(message, rtype="releases")
309 |         gx_output.c_log(message, rtype="releases", contributor=uploader)
310 | 
311 |         """ Work in Progress: This sounded fun but ended up being a dead end.
312 |         # Let's run an additional check on stargazers if, and only if, the repository has up to 5000 gazers.
313 |         # This is because we can only pull groups of 100, in which case we would send an extra 50 requests to get all of them.
314 |         # More than that sounds too much overhead for a remains-to-be-seen-how-helpful-this-is feature which lacks AI & blockchain superpowers.
315 | 
316 |         # We're relying here on a trick to make it work. Unfortunately the gazers API does NOT return creation time for accounts,
317 |         # BUT it does return the account ID which is a sequential value. And by paying attention we've been able to tell about ~50-100k accounts
318 |         # get created daily. Anyhow, if we group the accounts by close IDs; we may end up being able to identify accounts that were created close to eachother
319 |         # and we set a threshold (eg 50%) and inform our user the % of accounts that appear to be fake and gazing the repo that way.
320 | 
321 |      
322 |         if repository.get('stargazers_count') <= 5000: 
323 |             print(f"Buscando {repository.get('stargazers_count')} stargazers.")
324 |             stargazers = gh_api.fetch_repository_stargazers(repository, limit=5000)
325 |             if len(stargazers) > 0:
326 |                 starusers = [(star.get('id'), star.get('login')) for star in stargazers]  # Collect both id and login
327 |                 sorted_ids = sorted(starusers, key=lambda x: x[0])  # Sort by user ID
328 |                 groups = {}
329 | 
330 |                 group_width = 1000000
331 | 
332 |                 for id, login in sorted_ids:
333 |                     group_key = id // group_width  # Determine the group key based on id_width
334 |                     if group_key not in groups:
335 |                         groups[group_key] = []
336 |                     groups[group_key].append((id, login))
337 | 
338 |       
339 |                 total_users = int(repository.get('stargazers_count')) 
340 |                 threshold = total_users * 0.05
341 | 
342 |                 # Output the groups
343 |                 for grou_key, group in groups.items():
344 |                     if group:  # Ensure the group is not empty
345 |                         range_start = min(group, key=lambda x: x[0])[0]
346 |                         range_end = max(group, key=lambda x: x[0])[0]
347 |                         if len(group) > threshold:
348 |                             logins = ', '.join([user[1] for user in group])  # Collect all logins in the group
349 |                             print(f"Group length: {len(group)} - Group ID: {group_key}: Range {range_start} to {range_end}, Members: {len(group)}, Logins: {logins}")
350 |         """
351 | 
352 |     if repository.get('subscribers_count') is not None:
353 |         watchers_message = f"Watchers count: [{repository.get('subscribers_count')}]"
354 |         if repository.get('subscribers_count', 0) > 0:
355 |             watchers_message += f" List at: {repository.get('subscribers_url')}"
356 |         gx_output.r_log(watchers_message, rtype="profiling")
357 | 
358 |     if repository.get('open_issues_count', 0) > 0:
359 |         gx_output.r_log(f"Repository has {repository.get('open_issues_count')} Open Issues: {repository.get('html_url')}/issues", rtype="profiling")
360 | 
361 |     if repository.get('description'):
362 |         gx_output.r_log(f"Repository description: [{repository.get('description')}]", rtype="profiling")
363 | 
364 |     if repository.get('topics'):
365 |         gx_output.r_log(f"Topics: {str(repository.get('topics'))}", rtype="profiling")
366 | 
367 |     if repository.get('fork') != False and repository.get('fork') != None:
368 |         parent = repository.get('parent').get('full_name')
369 |         source = repository.get('source').get('full_name')
370 |         gx_output.stdout(f"\rRepository is a FORK of a parent named: {repository.get('parent').get('full_name')}: {repository.get('parent')['html_url']}")
371 |         gx_output.r_log(f"Repository is a FORK of repo: {repository.get('parent')['html_url']}", rtype="fork")
372 |         gx_output.stdout(f"This also means that GitHub will return ALL contributors (might be a LOT) up to the source repository")
373 |         if parent != source:
374 |             gx_output.stdout(f"Please know the parent of this repository is not the original source, which is: {source}") 
375 |             gx_output.r_log(f"The parent of this fork comes from SOURCE repo: {repository.get('source')['html_url']}", rtype="fork")
376 | 
377 | 
378 |     if repository.get('created_at') is not None:
379 |         days = (datetime.now(timezone.utc) - gh_time.parse_date(repository.get('created_at', datetime.utcnow().isoformat()))).days 
380 |         message = f"{days} days old"
381 |         if days > 365:
382 |             years = "{:.2f}".format(days / 365)
383 |             message = f"{years} years old"
384 |         gx_output.r_log(f"Repository created: {repository.get('created_at')}, is {message}.", rtype="profiling")
385 | 
386 |     if repository.get('updated_at') is not None:
387 |         days = (datetime.now(timezone.utc) - gh_time.parse_date(repository.get('updated_at', datetime.utcnow().isoformat()))).days 
388 |         message = f"{days} days ago"
389 |         if days > 365:
390 |             years = "{:.2f}".format(days / 365)
391 |             message = f"{years} years ago"
392 |         gx_output.r_log(f"Repository last updated: {repository.get('updated_at')}, {message}.", rtype="profiling")
393 | 
394 |     if repository.get('archived') == True:
395 |         gx_output.r_log(f"Repository is archived and therefore likely no longer maintained.", rtype="profiling")
396 | 
397 |     if repository.get('disabled') == True:
398 |         gx_output.r_log(f"Repository is disabled and therefore likely no longer maintained.", rtype="profiling")
399 | 
400 |     if repository.get('private') == True:
401 |         gx_output.r_log(f"Repository's visibility is set to [private]", rtype="profiling")
402 | 
403 |     public_events = gh_api.fetch_repository_public_events(repository)
404 |     if isinstance(public_events, list) and len(public_events) > 0:
405 |         gh_public_events.log_events(public_events, gx_output, for_repository=True)
406 | 
407 |     if repository.get('organization'):
408 |         org_url = repository.get('organization').get('url')
409 |         gx_output.r_log(f"Repository is owned by an Organization {org_url} - (Note that Creating an Org is free in github.com.)", rtype="profiling")
410 |         # Only supported in organizations
411 |         custom_values = gh_api.fetch_repository_custom_values(repository)
412 |         if len(custom_values) > 0:
413 |             gx_output.r_log(f"Repository Custom Property Values: {str(custom_values)}", rtype="user_input")
414 | 
415 |     # Now look into PRs and let's try and identify anything interesting.
416 |     prs = gh_api.fetch_repository_pull_requests(repository)
417 |     submitter_contrib_counts = defaultdict(lambda: {'submitted': 0, 'accepted':0, 'open': 0, 'rejected': 0})
418 |     submitter_notcontrib_counts = defaultdict(lambda: {'submitted': 0, 'accepted':0, 'open': 0, 'rejected': 0})
419 |     clogins = [c.get('login') for c in contributors]
420 |     if isinstance(prs, list) and len(prs) > 0:
421 |         for pr in prs:
422 |             try: # quick ugly patch instead of checking all types are dict and keys exist.
423 |                 submitter = pr['user']['login']
424 |             except: 
425 |                 continue
426 |             is_merged = pr['merged_at'] is not None
427 |             if submitter not in clogins: 
428 |                 submitter_counts = submitter_notcontrib_counts
429 |             else:
430 |                 submitter_counts = submitter_contrib_counts
431 |     
432 |             submitter_counts[submitter]['submitted'] += 1
433 | 
434 |             if is_merged:
435 |                 submitter_counts[submitter]['accepted'] += 1
436 |             elif pr['state'] == 'closed':
437 |                 submitter_counts[submitter]['rejected'] += 1
438 |             else:
439 |                 submitter_counts[submitter]['open'] += 1
440 | 
441 |         for submitter_counts in [submitter_contrib_counts, submitter_notcontrib_counts]:
442 |             for user, details in submitter_counts.items():
443 |                 if details['submitted'] > 0:
444 |                     # Only add a link to the URL of PRs if it belongs to a user account
445 |                     if user in clogins:
446 |                         gx_output.c_log(f"{details['submitted']} Pull Requests by [{user}] at: {repository.get('html_url')}/pulls?q=author%3a{user}", rtype="prs", contributor=user)
447 |                     details['rejected_percent'] = (details['rejected'] / details['submitted']) * 100
448 |                 else:
449 |                     details['rejected_percent'] = 0
450 |     
451 |                 # Used GPT for this, we're automathgically weighting amount AND percentage, and it appears to be working.
452 |                 details['rejected_score'] = details['rejected_percent'] * math.log1p(details['rejected'])
453 |    
454 |         sorted_submitters_contrib_rejected = sorted(submitter_contrib_counts.items(), key=lambda x: (-x[1]['rejected_score'], -x[1]['submitted']))
455 |         sorted_submitters_notcontrib_rejected = sorted(submitter_notcontrib_counts.items(), key=lambda x: (-x[1]['rejected_score'], -x[1]['submitted']))
456 | 
457 |         # First loop on top 3 to log in Repository output
458 |         message = []
459 |         for user, details in sorted_submitters_contrib_rejected[:3]:
460 |             if details['rejected'] > 0:
461 |                 message.append(f"[{user} {details['rejected']} rejected out of {details['submitted']}]")
462 |         if len(message) > 0:
463 |             gx_output.r_log(f"Top repository contributors with rejected PRs: " + " | ".join(message), rtype="contributors")
464 | 
465 |         # Now for NON contributors
466 |         message = []
467 |         for user, details in sorted_submitters_notcontrib_rejected[:3]:
468 |             if details['rejected'] > 0:
469 |                 message.append(f"[{user} {details['rejected']} rejected out of {details['submitted']}]")
470 |         if len(message) > 0:
471 |             gx_output.r_log(f"Top non-contributor GitHub users with rejected PRs: " + " | ".join(message), rtype="contributors")
472 | 
473 |         # And now loop on all to log under each user account.
474 |         for user, details in submitter_contrib_counts.items():
475 |             if details['rejected'] > 0:
476 |                 gx_output.c_log(f"The user submitted {details['submitted']} Pull Requests out of which {details['rejected']} were rejected.", rtype="profiling", contributor=user)
477 |             if details['accepted'] > 0:
478 |                 gx_output.c_log(f"The user submitted {details['submitted']} Pull Requests out of which {details['accepted']} were merged.", rtype="profiling", contributor=user)
479 |             if details['open'] > 0:
480 |                 gx_output.c_log(f"The user submitted {details['submitted']} Pull Requests out of which {details['open']} remain open.", rtype="profiling", contributor=user)
481 | 
482 |     # Check if there were any users with mismatches in commits dates in the repository.
483 |     for user, dates_mismatch_commits in gx_context.getIdentifierValues("DATE_MISMATCH_COMMITS").items():
484 |             gx_output.r_log(f"WARNING: UNRELIABLE DATES (Older than Account) in {dates_mismatch_commits} commits by [{user}]. Potential tampering, account re-use, or Rebase.", rtype="commits")
485 |        
486 | 
487 |     """ This here next is Work in Progress - trying to figure out what to pay attention to here that makes sense.
488 |     # Get all Issues. Note from GitHub that Issues returns both Issues + PRs:
489 |     # https://docs.github.com/en/rest/issues/issues?apiVersion=2022-11-28
490 |     # The reason we request these again instead of just calling issues and using it above for PRs
491 |     # is that the Issues endpoint does not include merged_at information. 
492 |     issues = gh_api.fetch_repository_issues(repository)
493 |     print(f"Analyzing a total of {len(issues)-len(prs)} issues and {len(prs)} PRs")
494 |     not_created_by_contributors = 0
495 |     i_pr_len = len(issues)
496 |     c_logins = [item['login'] for item in contributors if item['type'] in ["User","Bot"]]
497 |     for i in issues_prs:
498 |         if i.get('user', {}).get('login') not in c_logins:
499 |             not_created_by_contributors += 1
500 | 
501 |     gx_output.r_log(f"All {i_pr_len} existing issues and PRs were created by contributors.", rtype="profiling")
502 |     gx_output.r_log(f"The repository has no record of Issues or Pull Requests.", rtype="profiling")
503 |     """
504 | 
505 |     return True
506 | 


--------------------------------------------------------------------------------
/src/gitxray/xrays/workflows_xray.py:
--------------------------------------------------------------------------------
  1 | from gitxray.include import gx_definitions, gh_time
  2 | from collections import defaultdict
  3 | import base64, re
  4 | 
  5 | 
  6 | def run(gx_context, gx_output, gh_api):
  7 |     gx_output.stdout("\rRunning verifications on existing Workflows..."+" "*50)
  8 |     repository = gx_context.getRepository()
  9 |     contributors = gx_context.getContributors()
 10 | 
 11 |     gx_output.stdout(f"\rQuerying for repository action workflows.."+" "*50, end="")
 12 |     workflows = gh_api.fetch_repository_actions_workflows(repository)
 13 |     if workflows != None and workflows.get('total_count', 0) > 0:
 14 |         gx_output.r_log(f"{workflows.get('total_count')} Workflows available at: [{repository.get('url')}/actions/workflows]", rtype="workflows")
 15 |         for workflow in workflows.get('workflows'):
 16 |             workflow_file = workflow.get('path').split('/')[-1]
 17 |             gx_output.r_log(f"Workflow [{workflow.get('name')}] created [{workflow.get('created_at')}], updated [{workflow.get('updated_at')}]: [{workflow.get('html_url')}]", rtype="workflows")
 18 | 
 19 |             runs = gh_api.fetch_repository_actions_runs(repository, workflow_file=workflow_file)
 20 |             if runs != None and runs.get('total_count', 0) > 0: 
 21 |                 run_contributors = defaultdict(int)
 22 |                 run_non_contributors = defaultdict(int)
 23 |                 run_actors = defaultdict(int)
 24 |                 run_numbers = []
 25 |                 for run in runs.get('workflow_runs'):
 26 |                     run_numbers.append(run.get('run_number', -1))
 27 |                     run_actors[run.get('actor').get('login')] += 1
 28 | 
 29 |                 if len(run_numbers) > 0:
 30 |                     min_run = min(run_numbers)
 31 |                     max_run = max(run_numbers)
 32 |                     missing_numbers = sorted(set(range(min_run, max_run+1)) - set(run_numbers))
 33 |                     if len(missing_numbers) > 0: 
 34 |                         gx_output.r_log(f"Workflow [{workflow.get('name')}] has [{len(missing_numbers)}] missing or deleted runs. This could have been an attacker erasing their tracks, or legitimate cleanup.", rtype="workflows")
 35 |                         gx_output.r_log(f"Missing run numbers for Workflow [{workflow.get('name')}]: {missing_numbers}", rtype="workflows")
 36 | 
 37 |                 total_runs = int(runs.get('total_count'))
 38 |                 for actor, actor_runs in run_actors.items():
 39 |                     percentage_runs = (actor_runs / total_runs) * 100
 40 |                     if gx_context.isContributor(actor):
 41 |                         run_contributors[actor] += 1
 42 |                         message = f"Contributor [{actor}] ran {actor_runs} [{percentage_runs:.2f}%] times workflow [{workflow.get('name')}] - See them at: [{repository.get('html_url')}/actions?query=actor%3A{actor}]"
 43 |                     else:
 44 |                         run_non_contributors[actor] += 1
 45 |                         message = f"{actor} is NOT a contributor and ran {actor_runs} [{percentage_runs:.2f}%] times workflow [{workflow.get('name')}] - See them at: [{repository.get('html_url')}/actions?query=actor%3A{actor}]"
 46 | 
 47 |                     gx_output.c_log(message, rtype="workflows", contributor=actor)
 48 |                     gx_output.r_log(message, rtype="workflows")
 49 |           
 50 |                 if len(run_non_contributors) > 0 or len(run_contributors) > 0: 
 51 |                     all_non_c_runners = len(run_non_contributors.keys())
 52 |                     all_non_c_runs = sum(run_non_contributors.values())
 53 |                     all_c_runners = len(run_contributors.keys())
 54 |                     all_c_runs = sum(run_contributors.values())
 55 |                     gx_output.r_log(f"Workflow [{workflow.get('name')}] was run by [{all_non_c_runners}] NON-contributors [{all_non_c_runs}] times and by [{all_c_runners}] contributors [{all_c_runs}] times. [{repository.get('html_url')}/actions/workflows/{workflow_file}]", rtype="workflows")
 56 | 
 57 |             # Workflows may not contain a path - I believe those cases are for legacy workflows, seldom run into them
 58 |             if len(workflow.get('path')) > 0: contents = gh_api.fetch_repository_file_contents(repository, workflow.get('path'))
 59 |             else: contents = {}
 60 |             if contents.get('content') != None:
 61 | 
 62 |                 # We have the contents of a workflow, let's analyze it.
 63 |                 encoded_content = contents.get('content')
 64 |                 decoded_content = base64.b64decode(encoded_content).decode('utf-8').lower()
 65 | 
 66 |                 # https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners
 67 |                 if "self-hosted" in decoded_content: gx_output.r_log(f"Workflow [{workflow.get('name')}] appears to be executing in a self-hosted runner: [{workflow.get('html_url')}]", rtype="workflows")
 68 | 
 69 |                 # https://securitylab.github.com/resources/github-actions-preventing-pwn-requests/
 70 |                 if any(a in decoded_content for a in ["pull_request_target","workflow_run","issue_comment","issue:"]):
 71 |                     gx_output.r_log(f"WARNING: Workflow [{workflow.get('name')}] may be triggered by an event that might be misused by attackers. See more at https://gitxray.com/vulnerable_workflows", rtype="workflows")
 72 | 
 73 |                 #https://github.com/actions/toolkit/issues/641
 74 |                 if "ACTIONS_ALLOW_UNSECURE_COMMANDS: true" in decoded_content: gx_output.r_log(f"WARNING: Workflow [{workflow.get('name')}] sets ACTIONS_ALLOW_UNSECURE_COMMANDS.", rtype="workflows")
 75 | 
 76 |                 if "secrets." in decoded_content: 
 77 |                     secrets = re.findall(r"secrets\.[A-Za-z-_0-9]*", decoded_content) 
 78 |                     gx_output.r_log(f"Workflow [{workflow.get('name')}] makes use of Secrets: {secrets}: [{workflow.get('html_url')}]", rtype="workflows")
 79 | 
 80 |                 # https://securitylab.github.com/resources/github-actions-untrusted-input/
 81 |                 user_inputs = []
 82 |                 for input_label, pattern in gx_definitions.WORKFLOWS_USER_INPUT.items():
 83 |                     if re.search(pattern, decoded_content):
 84 |                         user_inputs.append(input_label)
 85 |  
 86 |                 if len(user_inputs) > 0: gx_output.r_log(f"Workflow [{workflow.get('name')}] handles user input via: {user_inputs}", rtype="workflows")
 87 | 
 88 | 
 89 |     gx_output.stdout(f"\rQuerying for repository workflow artifacts.."+" "*30, end="")
 90 |     artifacts = gh_api.fetch_repository_actions_artifacts(repository)
 91 |     if artifacts != None and artifacts.get('total_count', 0) > 0:
 92 |         gx_output.r_log(f"{artifacts.get('total_count')} Artifacts available at: [{repository.get('url')}/actions/artifacts]", rtype="artifacts")
 93 |         for artifact in artifacts.get('artifacts'):
 94 |             gx_output.r_log(f"Artifact [{artifact.get('name')}] created [{artifact.get('created_at')}], updated [{artifact.get('updated_at')}]: {artifact.get('url')}", rtype="artifacts")
 95 |             created_at = artifact.get('created_at')
 96 |             created_at_ts = gh_time.parse_date(created_at)
 97 |             updated_at = artifact.get('updated_at')
 98 |             updated_at_ts = gh_time.parse_date(updated_at)
 99 |             # This shouldn't happen but we still run a check; artifacts can't be updated but instead completely overwritten
100 |             # More data here: https://github.com/actions/upload-artifact#overwriting-an-artifact
101 |             if (updated_at_ts-created_at_ts).days > 0:
102 |                 gx_output.r_log(f"WARNING: An artifact [{artifact.get('name')}] was updated {(updated_at_ts-created_at_ts).days} days after being created: {artifact.get('url')}", rtype="artifacts")
103 | 
104 | 
105 |     gx_output.stdout("")
106 |     return True
107 | 


--------------------------------------------------------------------------------