├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       └── scorecard.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Maintainers.md
├── NOTICE
├── README.md
├── assets
│   ├── cred_detection_workflow.png
│   ├── keys_Token_Detection_workflow.png
│   └── xgitguard_workflow.png
├── requirements.txt
├── roadmap.md
└── xgitguard
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── configs_read.py
    │   ├── data_format.py
    │   ├── github_calls.py
    │   ├── logger.py
    │   └── ml_process.py
    ├── config
    │   ├── confidence_values.csv
    │   ├── dictionary_words.csv
    │   ├── enterprise_keywords.csv
    │   ├── extensions.csv
    │   ├── primary_keywords.csv
    │   ├── public_keywords.csv
    │   ├── secondary_creds.csv
    │   ├── secondary_keys.csv
    │   ├── stop_words.csv
    │   ├── xgg_configs.yaml
    │   └── xgg_search_paths.csv
    ├── custom keyword search
    │   ├── __init__.py
    │   ├── enterprise_keyword_search.py
    │   └── public_keyword_search.py
    ├── file-scanner
    │   ├── extension_search.py
    │   └── secret_detection.py
    ├── github-enterprise
    │   ├── __init__.py
    │   ├── enterprise_cred_detections.py
    │   └── enterprise_key_detections.py
    ├── github-public
    │   ├── __init__.py
    │   ├── public_cred_detections.py
    │   └── public_key_detections.py
    ├── logs
    │   └── .log_desc
    ├── ml_training
    │   ├── __init__.py
    │   ├── ml_data-collector
    │   │   ├── __init__.py
    │   │   ├── github-enterprise-ml-data_collector
    │   │   │   ├── __init__.py
    │   │   │   ├── enterprise_cred_data_collector.py
    │   │   │   └── enterprise_key_data_collector.py
    │   │   └── github-public-ml-data_collector
    │   │       ├── __init__.py
    │   │       ├── public_cred_data_collector.py
    │   │       └── public_key_data_collector.py
    │   ├── ml_feature_engineering.py
    │   └── model.py
    ├── output
    │   └── .output
    └── utilities
        ├── __init__.py
        ├── common_utilities.py
        ├── file_utilities.py
        └── query_length_validator.py

/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
 - OS: [e.g. iOS]
 - Browser [e.g. chrome, safari]
 - Version [e.g. 22]

**Smartphone (please complete the following information):**
 - Device: [e.g. iPhone6]
 - OS: [e.g. iOS8.1]
 - Browser [e.g. stock browser, safari]
 - Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | name: Scorecard analysis workflow 2 | on: 3 | push: 4 | # Only the default branch is supported. 5 | branches: 6 | - main 7 | schedule: 8 | # Weekly on Saturdays. 9 | - cron: '30 1 * * 6' 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | analysis: 15 | name: Scorecard analysis 16 | runs-on: ubuntu-latest 17 | permissions: 18 | # Needed for Code scanning upload 19 | security-events: write 20 | # Needed for GitHub OIDC token if publish_results is true 21 | id-token: write 22 | 23 | steps: 24 | - name: "Checkout code" 25 | uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 26 | with: 27 | fetch-depth: 0 28 | persist-credentials: false 29 | 30 | - name: "Run analysis" 31 | uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 32 | with: 33 | results_file: results.sarif 34 | results_format: sarif 35 | # Scorecard team runs a weekly scan of public GitHub repos, 36 | # see https://github.com/ossf/scorecard#public-data. 37 | # Setting `publish_results: true` helps us scale by leveraging your workflow to 38 | # extract the results instead of relying on our own infrastructure to run scans. 39 | # And it's free for you! 40 | publish_results: true 41 | 42 | # Upload the results as artifacts (optional). Commenting out will disable 43 | # uploads of run results in SARIF format to the repository Actions tab. 44 | # https://docs.github.com/en/actions/advanced-guides/storing-workflow-data-as-artifacts 45 | - name: "Upload artifact" 46 | uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 47 | with: 48 | name: SARIF file 49 | path: results.sarif 50 | retention-days: 5 51 | 52 | # Upload the results to GitHub's code scanning dashboard (optional). 53 | # Commenting out will disable upload of results to your repo's Code Scanning dashboard 54 | - name: "Upload to code-scanning" 55 | uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.16.4 56 | with: 57 | sarif_file: results.sarif 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # IDEs and editors 132 | /.idea 133 | .project 134 | .classpath 135 | .c9/ 136 | *.launch 137 | .settings/ 138 | *.sublime-workspace 139 | 140 | # IDE - VSCode 141 | .vscode/* 142 | !.vscode/settings.json 143 | !.vscode/tasks.json 144 | !.vscode/launch.json 145 | !.vscode/extensions.json 146 | .history/* 147 | 148 | # System Files 149 | .DS_Store 150 | Thumbs.db 151 | 152 | # Run time files 153 | xgitguard/logs/*.log 154 | xgitguard/output/*.csv 155 | xgitguard/output/*.pickle -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # xGitGuard Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series 85 | of actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. 
Violating these terms may lead to a temporary or 92 | permanent ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within 112 | the community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, available at 118 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 125 | at [https://www.contributor-covenant.org/translations][translations]. 126 | 127 | [homepage]: https://www.contributor-covenant.org 128 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 129 | [Mozilla CoC]: https://github.com/mozilla/diversity 130 | [FAQ]: https://www.contributor-covenant.org/faq 131 | [translations]: https://www.contributor-covenant.org/translations -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | If you would like to contribute code to this project you can do so through 4 | GitHub by forking the repository and sending a pull request. 5 | 6 | Before Comcast merges your code into the project you must sign the 7 | [Comcast Contributor License Agreement (CLA)](https://gist.github.com/ComcastOSS/a7b8933dd8e368535378cda25c92d19a). 8 | 9 | If you haven't previously signed a Comcast CLA, you'll automatically be asked 10 | to when you open a pull request. Alternatively, we can send you a PDF that 11 | you can sign and scan back to us. Please create a new GitHub issue to request 12 | a PDF version of the CLA. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Comcast Cable Communications Management, LLC 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Maintainers.md: -------------------------------------------------------------------------------- 1 | # 🛡️ xGitGuard Maintainers 2 | 3 | Thank you for your interest in contributing to xGitGuard! Below are the dedicated individuals who contribute to the maintenance and development of xGitGuard. 4 | 5 | ## Comcast Maintainers 6 | 7 | Comcast maintainers are responsible for overseeing the development and maintenance of xGitGuard within the Comcast organization. 8 | 9 | | Name | GitHub Handle Card | 10 | |--------------------|----------------------| 11 | | Dinesh Prakash | [![Dinesh Prakash](https://img.shields.io/badge/GitHub-dinpraka-blue?logo=github)](https://github.com/dinpraka) | 12 | | David Jayaseelan | [![David Jayaseelan](https://img.shields.io/badge/GitHub-davidjayaseelan-blue?logo=github)](https://github.com/jay6david) | 13 | | Adhithya Rajasekaran| [![Adhithya Rajasekaran](https://img.shields.io/badge/GitHub-radhi1991-blue?logo=github)](https://github.com/radhi1991) | 14 | | Gowtham Raj J | [![Gowtham Raj J](https://img.shields.io/badge/GitHub-jgowthamr-blue?logo=github)](https://github.com/jgowthamr) | 15 | | Sai Sundar | [![Sai Sundar](https://img.shields.io/badge/GitHub-sai100-blue?logo=github)](https://github.com/sai100) | 16 | | Nisha Balamurugan | [![nishabalamurugan](https://img.shields.io/badge/GitHub-nishabalamurugan-blue?logo=github)](https://github.com/nishabalamurugan) | 17 | 18 | 19 | ## External Maintainers 20 | 21 | External maintainers come from various organizations and institutions, contributing their expertise to xGitGuard's development. 22 | 23 | | Name | GitHub Handle Card | Affiliation | 24 | |--------------------|----------------------|-----------------------------------| 25 | | Himaja Nimmagadda | [![Himaja Nimmagadda](https://img.shields.io/badge/GitHub-hcn892-blue?logo=github)](https://github.com/hcn892) | [George Washington University](https://www.gwu.edu) | 26 | | Dinesh Paneerselvam | [![Dinesh Paneerselvam](https://img.shields.io/badge/GitHub-DineshPanneerselvam-blue?logo=github)](https://github.com/DineshPanneerselvam) | [Infosys](https://www.infosys.com) | 27 | | Preethi Manimaran | [![Preethi Manimaran](https://img.shields.io/badge/GitHub-preethid03-blue?logo=github)](https://github.com/preethid03) | [Gigamon](https://www.gigamon.com)| 28 | 29 | 30 | ## Contact 31 | 32 | If you have any questions, concerns, or suggestions regarding xGitGuard, feel free to reach out to any of the maintainers listed above. We welcome your feedback and contributions! 33 | 34 | ## Acknowledgments 35 | 36 | We extend our gratitude to all contributors, users, and organizations that have supported the xGitGuard project in various ways. 
37 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | xGitGuard 2 | 3 | Copyright 2021 Comcast Cable Communications Management, LLC 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | SPDX-License-Identifier: Apache-2.0 18 | 19 | This product includes software developed at Comcast (http://www.comcast.com/). -------------------------------------------------------------------------------- /assets/cred_detection_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/cred_detection_workflow.png -------------------------------------------------------------------------------- /assets/keys_Token_Detection_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/keys_Token_Detection_workflow.png -------------------------------------------------------------------------------- /assets/xgitguard_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/xgitguard_workflow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | pandas==1.3.5 3 | requests==2.32.0 4 | scipy==1.10.0 5 | scikit-learn==1.5.2 6 | urlextract==1.5.0 7 | PyYAML==6.0 -------------------------------------------------------------------------------- /roadmap.md: -------------------------------------------------------------------------------- 1 | 2 | # xGitGuard Roadmap 3 | 4 | ## How to Use This Roadmap 5 | This document serves as a comprehensive guide to the prioritized objectives of the xGitGuard project. It offers insight into the direction of the project, aiding contributors in understanding its trajectory. It also helps contributors determine whether their contributions align with the project's long-term goals. 6 | 7 | While a feature may not be listed here, it doesn't imply automatic refusal of a patch (except for "frozen features" mentioned below). We welcome patches for new features and encourage innovation. However, please be aware that such patches may take longer to review. 

---

## Feature Classification

### Adhoc Scan
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| [🎯 Targeted repository scanning](https://github.com/Comcast/xGitGuard/issues/24) | Scan user-specified repositories for secrets | ✅ Done | [preethid03](https://github.com/preethid03) |
| [🎯 Targeted organization scanning](https://github.com/Comcast/xGitGuard/issues/24) | Scan user-specified organizations for secrets | ✅ Done | [preethid03](https://github.com/preethid03) |

---

### File Scanner

| Feature | Description | Status | Developer (GitHub ID) |
|---------------------------|---------------------------------------------------------|--------|-----------------------|
| 📁 Directory scanning | Enable scanning user-specified directories for secrets | ⏳ WIP | [](https://github.com/developer6) |
| 📁 Individual file scanning | Enable scanning user-specified individual files for secrets | ⏳ WIP | [](https://github.com/developer7) |

---

### ML Integration ---> [GitHub Issues](https://github.com/Comcast/xGitGuard/issues/32)
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| 🤖 Training ML models using BERT | Train models for secret detection using BERT | 🚧 To Do | [](https://github.com/developer8) |
| 🤖 Integrating BERT into scanners | Integrate the BERT model into the xGitGuard scanner | 🚧 To Do | [](https://github.com/developer9) |

---

### Pre-commit Hook
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| 🔒 Detecting secrets pre-commit | Detect secrets before committing changes (see the sketch below the legend) | 🚧 To Do | [](https://github.com/) |

---

### Others
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| Custom keyword search | Search for specific keywords within repositories | 🚧 To Do | [](https://github.com/developer8) |
| Filtering archived repositories | Exclude archived repositories from scanning | 🚧 To Do | [](https://github.com/developer8) |
| Filtering forked repositories | Exclude forked repositories from scanning | 🚧 To Do | [](https://github.com/developer8) |

---

**Legend:**
- ✅ Done: Completed feature.
- ⏳ WIP: Feature in progress.
- 🚧 To Do: Feature yet to be started.
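The pre-commit hook row above is still open; as a rough illustration of the intended shape (hypothetical — xGitGuard does not ship a hook yet, and the scanner path and exit convention below are assumptions), a local `.git/hooks/pre-commit` script could hand the staged files to the file scanner:

```python
#!/usr/bin/env python3
# Hypothetical pre-commit sketch; the secret_detection.py CLI is an assumption,
# and paths containing whitespace are not handled.
import subprocess
import sys

# Files staged for this commit (added/copied/modified only).
staged = subprocess.run(
    ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
    capture_output=True, text=True, check=True,
).stdout.split()

if staged:
    # Assumed invocation: pass staged paths to the repo's file scanner and
    # treat a nonzero exit status as "secrets found".
    result = subprocess.run(
        [sys.executable, "xgitguard/file-scanner/secret_detection.py", *staged]
    )
    if result.returncode != 0:
        print("xGitGuard: possible secrets detected; commit aborted.")
        sys.exit(1)
```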
61 | 62 | --- 63 | 64 | ## Additional Issues and Contributions 65 | 66 | Contributors are welcome to explore and contribute to other issues on the xGitGuard repository: [xGitGuard GitHub Issues](https://github.com/Comcast/xGitGuard/issues) 67 | -------------------------------------------------------------------------------- /xgitguard/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = """Bahman Rashidi (Senior Security Architect) - Comcast Cable, 2 | Saravanakumar Ramasamy (Senior Lead Engineer) - Comcast Cable, 3 | Dinesh Prakash (Senior Lead Engineer) - Comcast Cable 4 | """ 5 | __version__ = "2.0" 6 | -------------------------------------------------------------------------------- /xgitguard/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/common/__init__.py -------------------------------------------------------------------------------- /xgitguard/common/configs_read.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import logging 20 | import os 21 | import sys 22 | 23 | import numpy as np 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | 26 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 27 | parent_dir = os.path.dirname(MODULE_DIR) 28 | sys.path.append(parent_dir) 29 | 30 | from utilities.file_utilities import read_yaml_file, read_csv_file 31 | 32 | 33 | logger = logging.getLogger("xgg_logger") 34 | 35 | 36 | class ConfigsData: 37 | """ 38 | Initialize and Read all the configuration files needed for the xGitGuard process 39 | """ 40 | 41 | def __init__(self): 42 | logger.debug("Initializing Configuration Data") 43 | self.config_dir = os.path.abspath( 44 | os.path.join(os.path.dirname(MODULE_DIR), ".", "config") 45 | ) 46 | self.output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | self.read_xgg_configs(file_name="xgg_configs.yaml") 50 | 51 | def read_xgg_configs(self, file_name): 52 | """ 53 | Read the given xgg_configs YAML file in the config path and set the class variable for further use. 54 | 55 | Args: 56 | file_name (str): The name of the configuration file. 
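        Example (illustrative; assumes the default repository layout where
        config/xgg_configs.yaml exists alongside this package):

            configs = ConfigsData()          # __init__ loads xgg_configs.yaml
            settings = configs.xgg_configs   # parsed YAML, available as a dict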
57 | """ 58 | logger.debug("<<<< 'Current Executing Function' >>>>") 59 | # Loading xgg_configs from xgg_configs_file 60 | self.xgg_configs_file = os.path.join(self.config_dir, file_name) 61 | if os.path.exists(self.xgg_configs_file): 62 | self.xgg_configs = read_yaml_file(self.xgg_configs_file) 63 | logger.debug(f"xgg_configs: {self.xgg_configs}") 64 | else: 65 | logger.error( 66 | f"Exiting as xGitGuard Configuration file not found: {self.xgg_configs_file}" 67 | ) 68 | raise Exception( 69 | f"Exiting as xGitGuard Configuration file not found: {self.xgg_configs_file}" 70 | ) 71 | 72 | def read_primary_keywords(self, file_name): 73 | """ 74 | Read the given primary keywords CSV file in the config path and set the class variable for further use. 75 | 76 | Args: 77 | file_name (str): The name of the CSV file. 78 | """ 79 | logger.debug("<<<< 'Current Executing Function' >>>>") 80 | 81 | # Loading primary keywords from primary keywords file 82 | self.primary_keywords_file = os.path.join(self.config_dir, file_name) 83 | self.primary_keywords = read_csv_file( 84 | self.primary_keywords_file, output="list", header=0 85 | ) 86 | self.primary_keywords = [ 87 | item for sublist in self.primary_keywords for item in sublist 88 | ] 89 | # logger.debug(f"primary_keywords: {self.primary_keywords}") 90 | 91 | def read_secondary_keywords(self, file_name): 92 | """ 93 | Read the given secondary keywords CSV file in the config directory and set the class variable for further use. 94 | 95 | Args: 96 | file_name (str): The name of the CSV file. 97 | """ 98 | logger.debug("<<<< 'Current Executing Function' >>>>") 99 | 100 | # Loading secondary keywords from secondary keywords file 101 | self.secondary_keywords_file = os.path.join(self.config_dir, file_name) 102 | self.secondary_keywords = read_csv_file( 103 | self.secondary_keywords_file, output="list", header=0 104 | ) 105 | self.secondary_keywords = [ 106 | item for sublist in self.secondary_keywords for item in sublist 107 | ] 108 | # logger.debug(f"secondary_keywords: {self.secondary_keywords}") 109 | 110 | def read_secondary_credentials(self, file_name): 111 | """ 112 | Read the given secondary credentials CSV file in the config directory and set the class variable for further use. 113 | 114 | Args: 115 | file_name (str): The name of the CSV file. 116 | """ 117 | logger.debug("<<<< 'Current Executing Function' >>>>") 118 | 119 | # Loading secondary Credentials from secondary credentials file 120 | self.secondary_credentials_file = os.path.join(self.config_dir, file_name) 121 | self.secondary_credentials = read_csv_file( 122 | self.secondary_credentials_file, output="list", header=0 123 | ) 124 | self.secondary_credentials = [ 125 | item for sublist in self.secondary_credentials for item in sublist 126 | ] 127 | # logger.debug(f"secondary_credentials: {self.secondary_credentials}") 128 | 129 | def read_extensions(self, file_name="extensions.csv"): 130 | """ 131 | Read the given extensions CSV file in the config path and set the class variable for further use. 132 | 133 | Args: 134 | file_name (str): The name of the CSV file. 
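        Example (illustrative): read_csv_file returns one list per CSV row, so a
        file with the rows "py" and "yaml" comes back as [["py"], ["yaml"]] and
        the comprehension below flattens it to ["py", "yaml"]. The other keyword
        and credential readers in this class rely on the same row-flattening
        pattern.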
135 | """ 136 | logger.debug("<<<< 'Current Executing Function' >>>>") 137 | 138 | # Get the extensions from extensions file 139 | self.extensions_file = os.path.join(self.config_dir, file_name) 140 | self.extensions = read_csv_file(self.extensions_file, output="list", header=0) 141 | self.extensions = [item for sublist in self.extensions for item in sublist] 142 | 143 | # logger.debug(f"Extensions: {self.extensions}") 144 | 145 | def read_hashed_url(self, file_name): 146 | """ 147 | Read the given hashed URL CSV file in the output path and set the class variable for further use. 148 | 149 | Args: 150 | file_name (str): The name of the CSV file. 151 | """ 152 | logger.debug("<<<< 'Current Executing Function' >>>>") 153 | 154 | # Loading Existing url hash detections 155 | self.hashed_url_file = os.path.join(self.output_dir, file_name) 156 | hashed_key_urls = read_csv_file(self.hashed_url_file, output="list", header=0) 157 | self.hashed_urls = [row[0] for row in hashed_key_urls] 158 | 159 | # logger.debug(f"hashed_urls: {self.hashed_urls}") 160 | 161 | def read_training_data(self, file_name): 162 | """ 163 | Read the given training data CSV file in the output path and set the class variable for further use. 164 | 165 | Args: 166 | file_name (str): The name of the CSV file. 167 | """ 168 | logger.debug("<<<< 'Current Executing Function' >>>>") 169 | self.training_data_file = os.path.join(self.output_dir, file_name) 170 | self.training_data = read_csv_file( 171 | self.training_data_file, output="dataframe", header=0 172 | ) 173 | if not self.training_data.empty: 174 | self.training_data = self.training_data.drop(columns="Label", axis=1) 175 | else: 176 | logger.error( 177 | f"Training Data is Empty. Add proper data and rerun: {self.training_data_file}" 178 | ) 179 | raise Exception( 180 | f"Training Data is Empty. Add proper data and rerun: {self.training_data_file}" 181 | ) 182 | 183 | def read_confidence_values(self, file_name="confidence_values.csv"): 184 | """ 185 | Read the given confidence values CSV file in the config path and set the key as index. 186 | 187 | This function sets the class variable for further use. 188 | 189 | Args: 190 | file_name (str): The name of the CSV file. 191 | """ 192 | logger.debug("<<<< 'Current Executing Function' >>>>") 193 | # Loading confidence levels from file 194 | self.confidence_values_file = os.path.join(self.config_dir, file_name) 195 | 196 | self.confidence_values = read_csv_file( 197 | self.confidence_values_file, output="dataframe", header=0 198 | ) 199 | if not self.confidence_values.empty: 200 | try: 201 | self.confidence_values = self.confidence_values.set_index("key") 202 | except Exception as e: 203 | logger.error(f"Confidence Values Setting Index Error: {e}") 204 | raise Exception(f"Confidence Values Setting Index Error: {e}") 205 | else: 206 | logger.error( 207 | f"confidence_values file is not present/readable: {self.confidence_values_file}" 208 | ) 209 | raise Exception( 210 | f"confidence_values file is not present/readable: {self.confidence_values_file}" 211 | ) 212 | 213 | def read_dictionary_words(self, file_name="dictionary_words.csv"): 214 | """ 215 | Read the given dictionary words CSV file in the config path. 216 | 217 | This function creates dictionary similarity values and sets the class variables for further use. 218 | 219 | Args: 220 | file_name (str): The name of the CSV file. 
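        Example (illustrative): with analyzer="char" and ngram_range=(3, 5), a
        dictionary word such as "password" contributes character n-grams like
        "pas", "assw", and "sword". dict_words_ct then stores log10 of each
        n-gram's total count, intended as a dictionary-similarity signal for
        the downstream ML features.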
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Creating dictionary similarity values
        self.dictionary_words_file = os.path.join(self.config_dir, file_name)
        self.dictionary_words = read_csv_file(
            self.dictionary_words_file, output="dataframe", header=0
        )
        # logger.debug("Dictionary_words file Read")
        # run Count Vectorizer
        if not self.dictionary_words.empty:
            try:
                self.dict_words_vc = CountVectorizer(
                    analyzer="char", ngram_range=(3, 5), min_df=1e-5, max_df=1.0
                )
                count = self.dict_words_vc.fit_transform(
                    self.dictionary_words["dic_word"].apply(
                        lambda word: np.str_(word)
                    )
                )
                self.dict_words_ct = np.log10(count.sum(axis=0).getA1())
                # logger.debug("Dictionary_words data Count Vectorized")
            except Exception as e:
                logger.error(f"Count Vectorizer Error: {e}")
                raise Exception(f"Count Vectorizer Error: {e}")
        else:
            logger.error(
                f"dictionary_words file is not present/readable: {self.dictionary_words_file}"
            )
            raise Exception(
                f"dictionary_words file is not present/readable: {self.dictionary_words_file}"
            )

    def read_stop_words(self, file_name="stop_words.csv"):
        """
        Read the given stop words CSV file in the config path and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Get the programming language stop words
        self.stop_words_file = os.path.join(self.config_dir, file_name)
        self.stop_words = read_csv_file(self.stop_words_file, output="list", header=0)
        self.stop_words = [item for sublist in self.stop_words for item in sublist]
        # logger.debug(f"Total Stop Words: {len(self.stop_words)}")

    def read_search_paths(self, file_name):
        """
        Read the given search paths CSV file in the config directory and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")

        # Loading the search paths file to retrieve the paths that need the extension filter applied
        self.search_paths_file = os.path.join(self.config_dir, file_name)
        self.search_paths = read_csv_file(
            self.search_paths_file, output="list", header=0
        )
        self.search_paths = [item for sublist in self.search_paths for item in sublist]
        # logger.debug(f"search_paths: {self.search_paths}")

    def read_search_files(self, file_name):
        """
        Read the given search files CSV file in the output directory and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")

        # Reading the paths of files to be searched after applying the extension filter
        self.target_paths_file = os.path.join(self.output_dir, file_name)
        self.search_files = read_csv_file(
            self.target_paths_file, output="list", header=0
        )
        self.search_files = [item for sublist in self.search_files for item in sublist]
        # logger.debug(f"search_files: {self.search_files}")

    def read_hashed_file(self, file_name):
        """
        Read the given hashed file CSV file in the output path and set the class variable for further use.
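        The CSV is assumed to carry the columns "hashed_files",
        "file_modification_hash", and "files" (matching the keys read below);
        when any of them is absent, the lists fall back to empty.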

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Loading existing file hash detections
        self.hashed_file = os.path.join(self.output_dir, file_name)
        hashed_key_files = read_csv_file(self.hashed_file, output="", header=0)
        try:
            self.hashed_files = (
                hashed_key_files.get("hashed_files").drop_duplicates().tolist()
            )
            self.hashed_file_modified_time = (
                hashed_key_files.get("file_modification_hash")
                .drop_duplicates()
                .tolist()
            )
            self.hash_file_path = (
                hashed_key_files.get("files").drop_duplicates().tolist()
            )
        except (AttributeError, KeyError):
            # Fall back to empty lists when the file or expected columns are missing.
            self.hashed_files = []
            self.hashed_file_modified_time = []
            self.hash_file_path = []
        # logger.debug(f"hashed_files: {self.hashed_files}")


if __name__ == "__main__":

    from datetime import datetime
    from common.logger import create_logger

    log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs"))
    log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    # Creates a logger
    logger = create_logger(
        log_level=10, console_logging=True, log_dir=log_dir, log_file_name=log_file_name
    )
    configs = ConfigsData()

--------------------------------------------------------------------------------
/xgitguard/common/data_format.py:
--------------------------------------------------------------------------------
"""
Copyright 2021 Comcast Cable Communications Management, LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

SPDX-License-Identifier: Apache-2.0
"""

import functools
import json
import re

from urlextract import URLExtract


def remove_url_from_keys(code_content):
    """
    Remove URLs, emails, and special characters from the given code content
    params: code_content - string - code data with urls
    returns: code_data - string - code data without urls/special chars
    """
    # Remove url address if present
    code_data = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        code_content,
    )
    # Remove email address characters if present
    code_data = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", code_data)

    special_chars = [
        "'",
        "(",
        ")",
        ",",
        ".",
        "/",
        "0x",
        ";",
        "<",
        "=",
        ">",
        "@",
        "[",
        "\\",
        "]",
        "_",
        "{",
        "}",
        '"',
    ]
    # Remove special characters if present
    for special_char in special_chars:
        code_data = code_data.replace(special_char, " ")
    return code_data


def remove_url_from_creds(code_content, key):
    """
    Remove URLs, emails, and special characters from the given code content
    params: code_content - string - code data with urls
    params: key - string - detected keyword (unused in the current logic)
    returns: codes_list - list - code tokens without urls/special chars
    """
    extractor = URLExtract()
    blacklisted_urls = extractor.find_urls(code_content)

    code_data = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        code_content,
    )
    code_data = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", code_data)

    for url in blacklisted_urls:
        code_data = code_data.replace(url, " ")

    special_chars = [
        "'",
        '"',
        "#",
        "%",
        "&",
        "(",
        ")",
        "*",
        "+",
        ",",
        "-",
        ".",
        "/",
        ":",
        ";",
        "<",
        "=",
        ">",
        "?",
        "[",
        "\\",
        "]",
        "`",
        "{",
        "|",
        "}",
        "~",
    ]
    # Remove special characters if present
    for special_char in special_chars:
        code_data = code_data.replace(special_char, " ")
    codes_list = code_data.split()
    return codes_list


def keys_extractor(code_content):
    """
    Extract keys from the given code content
    params: code_content - string
    returns: keys - List - List of secret keys
    """

    regexes = {
        "AWS Access Key ID": "(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}",
        "AWS Secret Access Key": "[0-9a-zA-Z/+=]{40}",
        "Google OAuth Secret": r"[0-9a-zA-Z\-_]{24}",
        "Google OAuth Auth Code": r"4/[0-9A-Za-z\-_]+",
        "Google OAuth Refresh Token": r"1/[0-9A-Za-z\-_]{43}|1/[0-9A-Za-z\-_]{64}",
        "Google OAuth Access Token": r"ya29\.[0-9A-Za-z\-_]+",
        "Google API Key": r"AIza[0-9A-Za-z\-_]{35}",
        "RSA Private Key": "BEGIN RSA PRIVATE KEY",
        "EC Private Key": "BEGIN EC PRIVATE KEY",
        "PGP Private Key": "BEGIN PGP PRIVATE KEY BLOCK",
        "General Private Key": "BEGIN PRIVATE KEY",
        "Google YouTube OAuth ID Gmail, GCloud": r"[0-9]+-[0-9A-Za-z_]{32}\.apps\.googleusercontent\.com",
        "Amazon MWS": r"amzn\.mws\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        "PayPal": r"access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}",
        "Slack Token": "(xox[pbaor]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})",
        "AWS": 
"(?:.*awsSecretKey|.*aws_secret|.*api-key|.*aws_account_secret).*" 146 | "(?=.*[A-Z])(?= 7 174 | and not (word in stop_words) 175 | and not (word.lower().startswith("u0")) 176 | and not (word.lower().startswith("0x")) 177 | and not (word.lower().startswith("rfc")) 178 | and not ("http" in word.lower()) 179 | and (bool(re.match("^(?=.*[0-9])(?=.*[a-zA-Z])", word))) 180 | ): 181 | creds.append(word) 182 | 183 | """creds = [word for word in code_content if len(word) >= 7] 184 | creds = [word for word in creds if not word in stop_words] 185 | creds = [word for word in creds if not word.lower().startswith('u0')] 186 | creds = [word for word in creds if not word.lower().startswith('0x')] 187 | creds = [word for word in creds if not word.lower().startswith('rfc')] 188 | creds = [word for word in creds if "http" not in word.lower()] 189 | creds = [word for word in creds if bool(re.match('^(?=.*[0-9])(?=.*[a-zA-Z])', word))]""" 190 | 191 | creds = list(set(creds)) 192 | creds = list(filter(None, creds)) 193 | return creds 194 | 195 | 196 | def format_commit_details(api_response_commit_data): 197 | """ 198 | Format the commit details from the api response 199 | params: api_response_commit_data - dict 200 | returns: commit_details - json dictionary 201 | """ 202 | try: 203 | response = api_response_commit_data 204 | if response.status_code == 200: 205 | commit_details = {} 206 | commit_data = [] 207 | commits_response = response.json() 208 | commit_details["status"] = response.status_code 209 | 210 | for commit in commits_response: 211 | commit_detail = {} 212 | 213 | try: 214 | commit_detail["commit_id"] = commit["sha"] 215 | except (IndexError, KeyError): 216 | commit_detail["commit_id"] = "" 217 | 218 | try: 219 | commit_detail["email"] = commit["commit"]["author"]["email"] 220 | except (IndexError, KeyError): 221 | commit_detail["email"] = "" 222 | 223 | try: 224 | commit_detail["commiter_name"] = commit["commit"]["author"]["name"] 225 | except (IndexError, KeyError): 226 | commit_detail["commiter_name"] = "" 227 | 228 | try: 229 | commit_detail["commit_date"] = commit["commit"]["author"]["date"] 230 | except (IndexError, KeyError): 231 | commit_detail["commit_date"] = "" 232 | 233 | try: 234 | if commit["author"] is not None: 235 | commit_detail["user_id"] = commit["author"]["login"] 236 | else: 237 | commit_detail["user_id"] = "" 238 | except (IndexError, KeyError): 239 | commit_detail["user_id"] = "" 240 | 241 | commit_data.append(commit_detail) 242 | commit_details["commits"] = commit_data 243 | else: 244 | commit_details = {} 245 | 246 | except (IndexError, KeyError): 247 | commit_details = {} 248 | 249 | commit_details = json.dumps(commit_details) 250 | return commit_details 251 | -------------------------------------------------------------------------------- /xgitguard/common/github_calls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import logging 20 | import os 21 | import sys 22 | import time 23 | 24 | import requests 25 | from utilities.query_length_validator import query_length_validator 26 | 27 | logger = logging.getLogger("xgg_logger") 28 | 29 | 30 | class GithubCalls: 31 | 32 | def __init__(self, base_url, token_env, commits_api_url, throttle_time=2): 33 | assert ( 34 | token_env == "public" or token_env == "enterprise" 35 | ), f"token_env must be either 'public' or 'enterprise'. current: {token_env}" 36 | self._base_url = base_url 37 | self._token_env = token_env 38 | self._commits_api_url = commits_api_url 39 | self._throttle_time = throttle_time 40 | 41 | def run_github_search(self, search_query, extension, org=[], repo=[]): 42 | """ 43 | Run the GitHub API search with given search query 44 | Get the items from the response content and Return 45 | params: search_query - string - Search keyword 46 | params: extension - string - Search extension 47 | params: org - list 48 | params: repo - list 49 | returns: search_response - list 50 | """ 51 | logger.debug("<<<< 'Current Executing Function' >>>>") 52 | 53 | org_qualifiers = [] 54 | repo_qualifiers = [] 55 | 56 | if len(org) > 0: 57 | # Checks if the length of additional qualifiers has exceeded the character limit of 170. 58 | org_qualifiers = query_length_validator(org, "user") 59 | if org_qualifiers == -1: 60 | logger.error( 61 | "Character Limit reached. Please consider limiting the number of characters in orgs." 62 | ) 63 | sys.exit(1) 64 | 65 | elif len(repo) > 0: 66 | # Checks if the length of additional qualifiers has exceeded the character limit of 170. 67 | repo_qualifiers = query_length_validator(repo, "repo") 68 | if repo_qualifiers == -1: 69 | logger.error( 70 | "Character Limit reached. Please consider limiting the number of characters in repo." 71 | ) 72 | sys.exit(1) 73 | 74 | if not extension or extension == "others" or len(extension) == 0: 75 | response = self.__github_api_get_params( 76 | search_query, org_qualifiers, repo_qualifiers 77 | ) 78 | elif self._token_env == "public": 79 | 80 | response = self.__github_api_get_params( 81 | (search_query + " extension:" + extension), 82 | org_qualifiers, 83 | repo_qualifiers, 84 | ) 85 | else: 86 | response = self.__github_api_get_params( 87 | (search_query + " extension:" + extension), 88 | org_qualifiers, 89 | repo_qualifiers, 90 | ) 91 | 92 | if response: 93 | return response 94 | 95 | return [] 96 | 97 | def __github_api_get_params( 98 | self, search_query, org_qualifiers="", repo_qualifiers="" 99 | ): 100 | """ 101 | For the given GITHUB API url and search query, call the api 102 | Get and return the response 103 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 104 | 105 | params: search_query - string 106 | params: org_qualifiers - string 107 | params: repo_qualifiers - string 108 | returns: response - dict 109 | """ 110 | logger.debug("<<<< 'Current Executing Function' >>>>") 111 | if self._token_env == "public": 112 | token_var = "GITHUB_TOKEN" 113 | time.sleep(self._throttle_time) 114 | else: 115 | time.sleep(self._throttle_time) 116 | token_var = "GITHUB_ENTERPRISE_TOKEN" 117 | if "<< Enterprise Name >>" in self._base_url: 118 | logger.error( 119 | f"GitHub API URL not set for Enterprise in xgg_configs.yaml file in config folder. API Search will fail/return no results. 
Please Setup and retry" 120 | ) 121 | sys.exit(1) 122 | 123 | if not os.getenv(token_var): 124 | logger.error( 125 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 126 | ) 127 | sys.exit(1) 128 | 129 | additional_qualifiers = "" 130 | if len(org_qualifiers) > 0: 131 | additional_qualifiers = org_qualifiers 132 | elif len(repo_qualifiers) > 0: 133 | additional_qualifiers = repo_qualifiers 134 | 135 | search_response = [] 136 | if additional_qualifiers: 137 | try: 138 | response = requests.get( 139 | self._base_url, 140 | params={ 141 | "q": f"{search_query} {additional_qualifiers}", 142 | "order": "desc", 143 | "sort": "indexed", 144 | "per_page": 100, 145 | }, 146 | auth=("token", os.getenv(token_var)), 147 | ) 148 | except Exception as e: 149 | logger.error(f"Github API call Error: {e}") 150 | else: 151 | try: 152 | response = requests.get( 153 | self._base_url, 154 | params={ 155 | "q": f"{search_query}", 156 | "order": "desc", 157 | "sort": "indexed", 158 | "per_page": 100, 159 | }, 160 | auth=("token", os.getenv(token_var)), 161 | ) 162 | except Exception as e: 163 | logger.error(f"Github API call Error: {e}") 164 | 165 | if response.status_code == 200: 166 | content = response.json() 167 | search_response.extend(content["items"]) 168 | try: 169 | while "next" in response.links.keys(): 170 | time.sleep(6) 171 | response = requests.get( 172 | response.links["next"]["url"], 173 | auth=("token", os.getenv(token_var)), 174 | ) 175 | 176 | if response.status_code == 200: 177 | content = response.json() 178 | if len(content["items"]) < 1: 179 | break 180 | search_response.extend(content["items"]) 181 | 182 | else: 183 | logger.info( 184 | f"Encountered an error in processing request.Response Status Code:{response.status_code}" 185 | ) 186 | break 187 | except Exception as e: 188 | logger.error( 189 | f"Error occured while iterating through file contents: {e}" 190 | ) 191 | else: 192 | logger.info( 193 | f"Encountered an error in processing request.Response Status Code:{response.status_code}" 194 | ) 195 | return search_response 196 | 197 | def public_url_content_get(self, file_url): 198 | """ 199 | For the given GitHub url, call the api 200 | Get and return the response 201 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 202 | 203 | params: api_url - string 204 | returns: response - string 205 | """ 206 | logger.debug("<<<< 'Current Executing Function' >>>>") 207 | 208 | token_key = "GITHUB_TOKEN" 209 | if not os.getenv(token_key): 210 | logger.error( 211 | f"GitHub API Token Environment variable '{token_key}' not set. API Search will fail/return no results. 
Please Setup and retry" 212 | ) 213 | sys.exit(1) 214 | 215 | try: 216 | time.sleep(self._throttle_time) 217 | response = requests.get( 218 | file_url, auth=("token", os.getenv(token_key)), timeout=10 219 | ) 220 | return response 221 | except Exception as e: 222 | logger.error(f"Github API file content get Error: {e}") 223 | 224 | return {} 225 | 226 | def enterprise_url_content_get(self, file_url, header): 227 | """ 228 | For the given GitHub url, call the api 229 | Get and return the response 230 | ### Need GitHub Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" 231 | 232 | params: api_url - string 233 | returns: response - string 234 | """ 235 | logger.debug("<<<< 'Current Executing Function' >>>>") 236 | 237 | token_key = "GITHUB_ENTERPRISE_TOKEN" 238 | if not os.getenv(token_key): 239 | logger.error( 240 | f"GitHub API Token Environment variable '{token_key}' not set. API Search will fail/return no results. Please Setup and retry" 241 | ) 242 | sys.exit(1) 243 | elif "<< Enterprise Name >>" in self._base_url: 244 | logger.error( 245 | f"GitHub API Content URL not set for Enterprise in xgg_configs.yaml file in config folder. API Search will fail/return no results. Please Setup and retry" 246 | ) 247 | sys.exit(1) 248 | 249 | try: 250 | time.sleep(self._throttle_time) 251 | response = requests.get( 252 | file_url, 253 | auth=("token", os.getenv(token_key)), 254 | headers=header, 255 | timeout=10, 256 | ) 257 | return response 258 | except Exception as e: 259 | logger.error(f"Github API file content get Error: {e}") 260 | 261 | return {} 262 | 263 | def get_github_public_commits(self, user_name, repo_name, file_path): 264 | """ 265 | For the given GitHub details, call the api and get commit details 266 | Get and return the response 267 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 268 | params: commits_api_url - string 269 | returns: response - string 270 | """ 271 | logger.debug("<<<< 'Current Executing Function' >>>>") 272 | full_commit_url = self._commits_api_url % (user_name, repo_name, file_path) 273 | token_var = "GITHUB_TOKEN" 274 | if not os.getenv(token_var): 275 | logger.error( 276 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 277 | ) 278 | sys.exit(1) 279 | 280 | try: 281 | time.sleep(self._throttle_time) 282 | response = requests.get( 283 | full_commit_url, auth=("token", os.getenv(token_var)), timeout=25 284 | ) 285 | return response 286 | except Exception as e: 287 | logger.error(f"Github API commit content get Error: {e}") 288 | return {} 289 | 290 | def get_github_enterprise_commits(self, user_name, repo_name, file_path, header): 291 | """ 292 | For the given GitHub details, call the api and get commit details 293 | Get and return the response 294 | ### Need GitHub Enterprise Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" 295 | params: commits_api_url - string 296 | params: header - dict 297 | returns: response - string 298 | """ 299 | logger.debug("<<<< 'Current Executing Function' >>>>") 300 | 301 | token_var = "GITHUB_ENTERPRISE_TOKEN" 302 | if not os.getenv(token_var): 303 | logger.error( 304 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 305 | ) 306 | sys.exit(1) 307 | elif "<< Enterprise Name >>" in self._commits_api_url: 308 | logger.error( 309 | f"GitHub API Commits URL not set for Enterprise in xgg_configs.yaml file in config folder. 
API Search will fail/return no results. Please Setup and retry"
310 | )
311 | sys.exit(1)
312 | 
313 | try:
314 | time.sleep(self._throttle_time)
315 | full_commit_url = self._commits_api_url.format(
316 | user_name=user_name, repo_name=repo_name, file_path=file_path
317 | )
318 | response = requests.get(
319 | full_commit_url,
320 | auth=("token", os.getenv(token_var)),
321 | headers=header,
322 | timeout=25,
323 | )
324 | return response
325 | except Exception as e:
326 | logger.error(f"Github API commit content get Error: {e}")
327 | return {}
328 | -------------------------------------------------------------------------------- /xgitguard/common/logger.py: --------------------------------------------------------------------------------
1 | """
2 | Copyright 2021 Comcast Cable Communications Management, LLC
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
16 | SPDX-License-Identifier: Apache-2.0
17 | """
18 | 
19 | import logging
20 | import os
21 | from datetime import datetime
22 | 
23 | 
24 | def create_logger(log_level=20, console_logging=True, log_dir=None, log_file_name=None):
25 | """
26 | Create the logger and return it
27 | params: log_level - int - Default - 20
28 | params: console_logging - Boolean - Default - True
29 | params: log_dir - string - optional
30 | params: log_file_name - string - optional
31 | returns: logger - logging.Logger
32 | """
33 | logger_name = "xgg_logger"
34 | # Gets or creates a logger
35 | logger = logging.getLogger(logger_name)
36 | 
37 | # set log level
38 | logger.setLevel(log_level)
39 | 
40 | formatter = logging.Formatter(
41 | "[%(asctime)s] [ %(levelname)8s ] [%(filename)40s:%(funcName)30s] : %(message)s"
42 | )
43 | 
44 | # add file handler to logger
45 | logger.addHandler(set_file_handler(logger_name, formatter, log_dir, log_file_name))
46 | 
47 | if console_logging:
48 | logger.addHandler(set_console_handler(formatter))
49 | 
50 | return logger
51 | 
52 | 
53 | def set_file_handler(logger_name, formatter, log_dir, log_file_name):
54 | """Set up the file logging handler"""
55 | # define file handler and set formatter
56 | if not (log_dir and os.path.exists(log_dir)):
57 | # Fall back to the repository's logs directory when no valid
58 | # log_dir is given
59 | module_dir = os.path.dirname(os.path.realpath(__file__))
60 | log_dir = os.path.abspath(
61 | os.path.join(os.path.dirname(module_dir), ".", "logs")
62 | )
63 | if not log_file_name:
64 | log_file_name = f"{logger_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
65 | log_file = os.path.join(log_dir, log_file_name)
66 | file_handler = logging.FileHandler(log_file)
67 | file_handler.setFormatter(formatter)
68 | print(f"Current run logs file: {log_file}")
69 | return file_handler
70 | 
71 | 
72 | def set_console_handler(formatter):
73 | """Set up the console logging handler"""
74 | # define console handler and set formatter
75 | console_handler = logging.StreamHandler()
76 | console_handler.setFormatter(formatter)
77 | return console_handler
78 | 
79 | 
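# Consumer sketch: because create_logger() registers its handlers on the fixed
# logger name "xgg_logger", any module can attach to the same file and console
# sinks after a single create_logger() call at process start. The module name
# below is hypothetical; the pattern matches github_calls.py and ml_process.py.
#
#     # some_scanner.py
#     import logging
#     logger = logging.getLogger("xgg_logger")
#     logger.info("goes to the handlers configured at startup")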
80 | if __name__ == "__main__":
81 | from configs_read import ConfigsData
82 | 
83 | configs = ConfigsData()
84 | module_dir = os.path.dirname(os.path.realpath(__file__))
85 | log_dir = os.path.abspath(os.path.join(os.path.dirname(module_dir), ".", "logs"))
86 | 
87 | logger = create_logger(
88 | log_level=10,
89 | console_logging=False,
90 | log_dir=log_dir,
91 | log_file_name=f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
92 | )
93 | 
94 | logger.debug("A debug message")
95 | logger.info("An info message")
96 | logger.warning("Something is not right.")
97 | logger.error("A Major error has happened.")
98 | logger.critical("Fatal error. Cannot continue")
99 | -------------------------------------------------------------------------------- /xgitguard/common/ml_process.py: --------------------------------------------------------------------------------
1 | """
2 | Copyright 2021 Comcast Cable Communications Management, LLC
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
16 | SPDX-License-Identifier: Apache-2.0
17 | """
18 | 
19 | import logging
20 | import os
21 | import sys
22 | import numpy as np
23 | import pandas as pd
24 | from scipy.stats import entropy
25 | 
26 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
27 | parent_dir = os.path.dirname(MODULE_DIR)
28 | sys.path.append(parent_dir)
29 | 
30 | from common.configs_read import ConfigsData
31 | from utilities.common_utilities import is_num_present, is_uppercase_present
32 | from utilities.file_utilities import read_pickle_file
33 | 
34 | logger = logging.getLogger("xgg_logger")
35 | 
36 | 
37 | def ml_prediction_process(model_name, training_data, detection_data, git_env=""):
38 | """
39 | For the given training data and detection data,
40 | format the detection and training data as the model needs,
41 | predict the detections using the pre-trained model and
42 | return the Dataframe of actual detections
43 | params: model_name - string; git_env - string - "public", "enterprise" or ""
44 | params: training_data - dataframe
45 | params: detection_data - dataframe - Detection Data
46 | returns: post_prediction_data - Dataframe - Actual detections
47 | """
48 | logger.debug("<<<< 'Current Executing Function' >>>>")
49 | pre_prediction_data = detection_data.copy()
50 | if git_env:
51 | if git_env == "public":
52 | detection_data = detection_data.drop(
53 | [
54 | "Source",
55 | "Primary_Key",
56 | "Commit_Details",
57 | "URL",
58 | "Owner",
59 | "Repo_Name",
60 | "Detected_Timestamp",
61 | "Year",
62 | "Month",
63 | "Day",
64 | ],
65 | axis=1,
66 | )
67 | else:
68 | detection_data = detection_data.drop(
69 | [
70 | "Source",
71 | "Commit_Details",
72 | "URL",
73 | "Owner",
74 | "Repo_Name",
75 | "Detected_Timestamp",
76 | "Year",
77 | "Month",
78 | "Day",
79 | ],
80 | axis=1,
81 | )
82 | else:
83 | detection_data = detection_data.drop(
84 | [
85 | "Source",
86 | "URL",
87 | "Detected_Timestamp",
88 | "Year",
89 | "Month",
90 | "Day",
91 | ],
92 | axis=1,
93 | )
94 | try:
95 | detection_data["Len_Key"] = detection_data.apply(
96 | lambda x: len(x["Secret"]), axis=1
97 | )
98 | detection_data["Len_Code"] = detection_data.apply(
99 | lambda x: len(x["Code"]), axis=1
100 | )
101 | detection_data["Has_Digit"] = detection_data.apply(
102 | lambda x: is_num_present(x["Secret"]), axis=1
103 | )
104 | detection_data["Has_Cap"] = detection_data.apply(
105 | lambda x: is_uppercase_present(x["Secret"]), axis=1
106 | )
107 | 
108 | detection_data = detection_data.drop(["Secret", "Code"], axis=1)
109 | train_dummies = pd.get_dummies(training_data)
110 | detection_dummies = pd.get_dummies(detection_data)
111 | train_dummies, detection_dummies = train_dummies.align(
112 | detection_dummies, join="left", axis=1
113 | )
114 | detection_dummies = detection_dummies.fillna(0)
115 | 
116 | output_dir = os.path.abspath(
117 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output")
118 | )
119 | model_file = os.path.join(output_dir, model_name)
120 | # Read the pre-trained model object
121 | rf = read_pickle_file(model_file)
122 | # Predict the current detection
123 | predictions = rf.predict(detection_dummies)
124 | indexes = [i for i, e in enumerate(predictions) if e != 0]
125 | post_prediction_data = pre_prediction_data.iloc[indexes, :]
126 | return post_prediction_data
127 | except Exception as e:
128 | logger.error(f"Error in predicting through model: {e}")
129 | post_prediction_data = pd.DataFrame()
130 | return post_prediction_data
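# Feature sketch (illustrative values, not project data): the four columns
# engineered above, computed for a single candidate detection. The helper
# functions are assumed to return truthy/falsy indicators.
#
#     secret, code = "Zx9abc123", 'aws_secret = "Zx9abc123"'
#     {"Len_Key": len(secret),                    # 9
#      "Len_Code": len(code),                     # 24
#      "Has_Digit": is_num_present(secret),       # truthy: contains 9, 1, 2, 3
#      "Has_Cap": is_uppercase_present(secret)}   # truthy: contains Z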
131 | 
132 | def entropy_calc(labels, base=None):
133 | """
134 | Calculates Shannon Entropy for the given labels
135 | params: labels - list
136 | params: base - Optional - logarithm base (natural log when None)
137 | returns: entropy value - float
138 | """
139 | # logger.debug("<<<< 'Current Executing Function' >>>>")
140 | _, counts = np.unique(labels, return_counts=True)
141 | return entropy(counts, base=base)
142 | -------------------------------------------------------------------------------- /xgitguard/config/confidence_values.csv: --------------------------------------------------------------------------------
1 | key,value
2 | --password,5
3 | --token,5
4 | ?access_token,5
5 | ?accesskeyid,5
6 | access_key,5
7 | access_key_id,5
8 | access_key_secret,5
9 | access_secret,5
10 | access_token,5
11 | account_sid,2
12 | algolia_api_key,3
13 | amazon_secret_access_key,5
14 | api_key,5
15 | api_key_secret,4
16 | api_key_sid,2
17 | app_token,1
18 | artifacts_bucket,1
19 | artifacts_secret,1
20 | ASPX,1
21 | atoken,1
22 | auth,4
23 | yml,5
24 | auth_token,5
25 | aws_access_key,5
26 | aws_access_key_id,2
27 | aws_secret_access_key,5
28 | aws_secret_key,5
29 | bintray_key,1
30 | codecov_token,2
31 | get_token,1
32 | mapbox_access_token,3
33 | agfa,1
34 | twig,2
35 | c,5
36 | csv,2
37 | aspx,3
38 | p12,4
39 | cf_password,2
40 | client_secret,5
41 | cloudflare_api_key,2
42 | conf,5
43 | config,5
44 | consumer_secret,5
45 | coveralls_repo_token,2
46 | coverity_scan_token,2
47 | cpp,5
48 | cred,1
49 | cs,1
50 | cshtml,1
51 | CSV,1
52 | customer_secret,2
53 | dat,1
54 | database_password,3
55 | datadog_api_key,1
56 | db_password,3
57 | db_pw,3
58 | deploy_password,3
59 | deploy_token,3
60 | docker_hub_password,3
61 | docker_key,2
62 | docker_pass,2
63 | docker_passwd,2
64 | docker_password,2
65 | dockerhubpassword,2
66 | ejs,2
67 | encryption_password,2
68 | erb,1
69 | fg,1
70 | file_password,1
71 | firebase_token,1
72 | ftp_password,5
73 | ftp_pw,5
74 | gh_token,2
75 | github_access_token,5
76 | github_api_key,5
77 | github_auth,5
78 | github_key,5
79 | github_oauth_token,5
80 | github_password,5
81 | github_pwd,5
82 | github_token,5
83 | gitignore,5
84 | go,5
85 | gpg_passphrase,2
86 | h,5
87 | heroku_api_key,2
88 | html,1
89 | ini,1
90 | ipynb,3
91 | java,5
92 | js,5
93 | json,3
94 | jsp,5
95 | 
jsx,5 96 | key,1 97 | keystore_pass,2 98 | log,1 99 | mysql_password,5 100 | npm_auth_token,4 101 | npm_token,2 102 | oauth_token,5 103 | os_password,3 104 | others,1 105 | ovpn,3 106 | pass,1 107 | passphrase,3 108 | password,4 109 | pem,5 110 | php,4 111 | phtml,1 112 | pkey,5 113 | plist,1 114 | ppk,4 115 | priv,2 116 | properties,5 117 | publish_key,3 118 | py,5 119 | pypi_password,5 120 | rb,1 121 | release_token,2 122 | repotoken,1 123 | rsa,4 124 | s3_access_key,5 125 | s3_access_key_id,5 126 | s3_key,5 127 | s3_secret_key,5 128 | sauce_access_key,2 129 | secret,4 130 | secret_key_base,1 131 | sh,5 132 | signing_key,1 133 | sonar_token,2 134 | sonatype_password,1 135 | sshpass,3 136 | sshpassword,4 137 | swift,1 138 | token,5 139 | ts,1 140 | txt,1 141 | user_secret,5 142 | private_key,5 143 | vue,1 144 | xhtml,1 145 | xml,2 146 | yaml,4 -------------------------------------------------------------------------------- /xgitguard/config/enterprise_keywords.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | -------------------------------------------------------------------------------- /xgitguard/config/extensions.csv: -------------------------------------------------------------------------------- 1 | type 2 | json 3 | py 4 | js 5 | java 6 | php 7 | xml 8 | others 9 | cpp 10 | cs 11 | cshtml 12 | ejs 13 | erb 14 | go 15 | h 16 | rb 17 | sh 18 | swift 19 | properties 20 | ovpn 21 | conf 22 | config 23 | ini 24 | plist 25 | yaml 26 | yml 27 | fg 28 | gitignore 29 | key 30 | p12 31 | pem 32 | pkey 33 | ppk 34 | priv 35 | rsa 36 | aspx 37 | c 38 | ts 39 | html 40 | ipynb 41 | jsp 42 | jsx 43 | phtml 44 | twig 45 | vue 46 | xhtml 47 | csv 48 | dat 49 | log 50 | txt -------------------------------------------------------------------------------- /xgitguard/config/primary_keywords.csv: -------------------------------------------------------------------------------- 1 | primary_keys 2 | 3 | -------------------------------------------------------------------------------- /xgitguard/config/public_keywords.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | -------------------------------------------------------------------------------- /xgitguard/config/secondary_creds.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | password 3 | --token 4 | ?access_token 5 | ?accesskeyid 6 | access_key_id 7 | access_key_secret 8 | access_key 9 | access_secret 10 | access_token 11 | account_sid 12 | api_key_secret 13 | api_key_sid 14 | api_key 15 | app_token 16 | artifacts_bucket 17 | artifacts_secret 18 | atoken 19 | auth_token 20 | auth 21 | cf_password 22 | ci_deploy_password 23 | cloudflare_api_key 24 | codecov_token 25 | coveralls_repo_token 26 | coverity_scan_token 27 | cred 28 | database_password 29 | datadog_api_key 30 | db_password 31 | db_pw 32 | deploy_password 33 | deploy_token 34 | docker_hub_password 35 | docker_key 36 | docker_pass 37 | docker_passwd 38 | docker_password 39 | dockerhubpassword 40 | encryption_password 41 | file_password 42 | firebase_token 43 | ftp_password 44 | ftp_pw 45 | github_password 46 | github_pwd 47 | gpg_passphrase 48 | key 49 | passphrase 50 | keystore_pass 51 | mapbox_access_token 52 | mysql_password 53 | npm_auth_token 54 | npm_token 55 | oauth_token 56 | os_password 57 | pass 58 | password 59 | publish_key 60 | pypi_password 61 | release_token 62 | repotoken 63 | sauce_access_key 64 | secret_key_base 65 | secret 
66 | signing_key 67 | sonar_token 68 | sonatype_password 69 | sshpass 70 | token 71 | twine_password 72 | customer_secret 73 | consumer_secret 74 | --password 75 | sshpassword 76 | private_key -------------------------------------------------------------------------------- /xgitguard/config/secondary_keys.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | token 3 | --token 4 | ?access_token 5 | ?accesskeyid 6 | access_key_id 7 | access_key_secret 8 | access_key 9 | access_secret 10 | access_token 11 | user_secret 12 | customer_secret 13 | consumer_secret 14 | client_secret 15 | account_sid 16 | agfa 17 | algolia_api_key 18 | amazon_secret_access_key 19 | api_key_secret 20 | api_key_sid 21 | api_key 22 | app_token 23 | artifacts_bucket 24 | artifacts_secret 25 | atoken 26 | auth_token 27 | auth 28 | aws_access_key_id 29 | aws_access_key 30 | aws_secret_access_key 31 | aws_secret_key 32 | bintray_key 33 | cloudflare_api_key 34 | codecov_token 35 | coveralls_repo_token 36 | coverity_scan_token 37 | cred 38 | datadog_api_key 39 | deploy_token 40 | docker_key 41 | firebase_token 42 | gh_token 43 | github_access_token 44 | get_token 45 | github_api_key 46 | github_auth 47 | github_key 48 | github_oauth_token 49 | github_token 50 | heroku_api_key 51 | key 52 | mapbox_access_token 53 | npm_auth_token 54 | npm_token 55 | oauth_token 56 | publish_key 57 | release_token 58 | repotoken 59 | s3_access_key_id 60 | s3_access_key 61 | s3_key 62 | s3_secret_key 63 | sauce_access_key 64 | secret_key_base 65 | secret 66 | signing_key 67 | sonar_token -------------------------------------------------------------------------------- /xgitguard/config/stop_words.csv: -------------------------------------------------------------------------------- 1 | Stop Words 2 | static 3 | static1 4 | static2 5 | static5 6 | static6 7 | static4 8 | static3 9 | server2 10 | images2 11 | secure1 12 | zipCode 13 | streetAddress 14 | forever 15 | secure 16 | tracking 17 | malloc 18 | calloc 19 | realloc 20 | memcpy 21 | int 22 | float 23 | char 24 | sizeof 25 | http 26 | https 27 | def 28 | class 29 | list 30 | str 31 | tuple 32 | dict 33 | collections 34 | this 35 | else 36 | if 37 | elif 38 | dynamic 39 | free 40 | pointer 41 | func 42 | are 43 | counters 44 | struct 45 | type 46 | memptr 47 | memcmp 48 | heap 49 | stack 50 | ptr 51 | apache 52 | nginx 53 | ram 54 | rom 55 | tcp 56 | ip 57 | ping 58 | icmp 59 | bgp 60 | header 61 | protocol -------------------------------------------------------------------------------- /xgitguard/config/xgg_configs.yaml: -------------------------------------------------------------------------------- 1 | # xGitGuard Input Configurations 2 | default: 3 | log_dir: None 4 | 5 | github: 6 | throttle_time: 10 7 | # GitHub Public 8 | public_api_url: "https://api.github.com/search/code" 9 | public_commits_url: "https://api.github.com/repos/%s/%s/commits?path=%s" 10 | 11 | # GitHub Enterprise - For Open Source 12 | enterprise_api_url: "https://github.<< Enterprise Name >>.com/api/v3/search/code" 13 | enterprise_pre_url: "https://github.<< Enterprise Name >>.com/api/v3/repos/" 14 | url_validator: "https://github.<< Enterprise Name >>.com/api/v3/search/code" 15 | enterprise_commits_url: "https://github.<< Enterprise Name >>.com/api/v3/repos/{user_name}/{repo_name}/commits?path={file_path}" 16 | enterprise_header: { "Accept": "application/vnd.github.v3.raw" } 17 | 18 | model: 19 | # Model Configurations 20 | 21 | # GitHub Public 22 | public: 23 | 
training_data_key: "public_key_train.csv" 24 | training_data_cred: "public_cred_train.csv" 25 | model_key_file: "public_xgg_key_rf_model_object.pickle" 26 | model_cred_file: "public_xgg_cred_rf_model_object.pickle" 27 | 28 | # GitHub Enterprise 29 | enterprise: 30 | training_data_key: "key_train.csv" 31 | training_data_cred: "cred_train.csv" 32 | model_key_file: "xgg_key_rf_model_object.pickle" 33 | model_cred_file: "xgg_cred_rf_model_object.pickle" 34 | 35 | secrets: 36 | public_data_columns: 37 | [ 38 | "Source", 39 | "Primary_Key", 40 | "Second_Key", 41 | "Extension", 42 | "URL", 43 | "Owner", 44 | "Repo_Name", 45 | "Commit_Details", 46 | "Secret", 47 | "Code", 48 | "Detected_Timestamp", 49 | "Key_Weight", 50 | "SKey_Count", 51 | "Entropy", 52 | "Dictionary_Similarity", 53 | "Score", 54 | "Year", 55 | "Month", 56 | "Day", 57 | "Hour", 58 | ] 59 | enterprise_data_columns: 60 | [ 61 | "Source", 62 | "Second_Key", 63 | "Extension", 64 | "URL", 65 | "Owner", 66 | "Repo_Name", 67 | "Commit_Details", 68 | "Secret", 69 | "Code", 70 | "Detected_Timestamp", 71 | "Key_Weight", 72 | "SKey_Count", 73 | "Entropy", 74 | "Dictionary_Similarity", 75 | "Score", 76 | "Year", 77 | "Month", 78 | "Day", 79 | "Hour", 80 | ] 81 | enterprise_data_collector_columns: 82 | [ 83 | "Source", 84 | "Second_Key", 85 | "Extension", 86 | "URL", 87 | "Owner", 88 | "Repo_Name", 89 | "Secret", 90 | "Code", 91 | "Detected_Timestamp", 92 | "Key_Weight", 93 | "SKey_Count", 94 | "Entropy", 95 | "Dictionary_Similarity", 96 | "Score", 97 | "Year", 98 | "Month", 99 | "Day", 100 | "Hour", 101 | ] 102 | public_data_collector_columns: 103 | [ 104 | "Source", 105 | "Primary_Key", 106 | "Second_Key", 107 | "Extension", 108 | "URL", 109 | "Owner", 110 | "Repo_Name", 111 | "Secret", 112 | "Code", 113 | "Detected_Timestamp", 114 | "Key_Weight", 115 | "SKey_Count", 116 | "Entropy", 117 | "Dictionary_Similarity", 118 | "Score", 119 | "Year", 120 | "Month", 121 | "Day", 122 | "Hour", 123 | ] 124 | 125 | file_scanner: 126 | local_file_scan_detection_columns: 127 | [ 128 | "Source", 129 | "Second_Key", 130 | "Extension", 131 | "URL", 132 | "Secret", 133 | "Code", 134 | "Detected_Timestamp", 135 | "Key_Weight", 136 | "SKey_Count", 137 | "Entropy", 138 | "Dictionary_Similarity", 139 | "Score", 140 | "Year", 141 | "Month", 142 | "Day", 143 | "Hour", 144 | ] 145 | unique_columns: ["Source", "Second_Key", "Extension", "URL", "Code"] 146 | 147 | keywords: 148 | public_data_columns: 149 | [ 150 | "Source", 151 | "Second_Key", 152 | "URL", 153 | "Owner", 154 | "Repo_Name", 155 | "Commit_Details", 156 | "Detected_Timestamp", 157 | "Year", 158 | "Month", 159 | "Day", 160 | "Hour", 161 | ] 162 | enterprise_data_columns: 163 | [ 164 | "Source", 165 | "Second_Key", 166 | "URL", 167 | "Owner", 168 | "Repo_Name", 169 | "Commit_Details", 170 | "Detected_Timestamp", 171 | "Year", 172 | "Month", 173 | "Day", 174 | "Hour", 175 | ] 176 | -------------------------------------------------------------------------------- /xgitguard/config/xgg_search_paths.csv: -------------------------------------------------------------------------------- 1 | scan_paths -------------------------------------------------------------------------------- /xgitguard/custom keyword search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/custom keyword search/__init__.py 
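As a quick orientation before the search drivers that follow: the URL templates in xgg_configs.yaml expand with plain "%"-interpolation on the public side and str.format on the enterprise side, matching how GithubCalls builds commit URLs. A sketch (paths and names are illustrative; assumes PyYAML is installed):

    import yaml

    with open("xgitguard/config/xgg_configs.yaml") as f:
        cfg = yaml.safe_load(f)

    public = cfg["github"]["public_commits_url"] % ("octo-user", "octo-repo", "src/app.py")
    enterprise = cfg["github"]["enterprise_commits_url"].format(
        user_name="octo-user", repo_name="octo-repo", file_path="src/app.py"
    )
    # public     -> https://api.github.com/repos/octo-user/octo-repo/commits?path=src/app.py
    # enterprise -> https://github.<< Enterprise Name >>.com/api/v3/repos/octo-user/octo-repo/commits?path=src/app.py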
-------------------------------------------------------------------------------- /xgitguard/custom keyword search/enterprise_keyword_search.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import hashlib
3 | import os
4 | import sys
5 | import pandas as pd
6 | import time
7 | from datetime import datetime
8 | 
9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
10 | 
11 | parent_dir = os.path.dirname(MODULE_DIR)
12 | sys.path.insert(0, parent_dir)
13 | 
14 | from common.configs_read import ConfigsData
15 | from common.data_format import (
16 | format_commit_details,
17 | )
18 | from common.github_calls import GithubCalls
19 | from common.logger import create_logger
20 | from utilities.common_utilities import check_github_token_env
21 | from utilities.file_utilities import write_to_csv_file
22 | 
23 | file_prefix = "xgg_"
24 | 
25 | 
26 | def format_detection(skeyword, org_url, url):
27 | """
28 | Format the data from the given content and other data
29 | params: skeyword - string - Secondary Keyword
30 | params: org_url - string - github url
31 | params: url - string - github url
32 | returns: secrets_data_list - list - List of formatted detections
33 | """
34 | logger.debug("<<<< 'Current Executing Function' >>>>")
35 | secrets_data_list = []
36 | secret_data = []
37 | 
38 | user_name = org_url.split("/")[3]
39 | repo_name = org_url.split("/")[4]
40 | 
41 | try:
42 | file_path = url.split("/contents/")[1]
43 | header = configs.xgg_configs["github"]["enterprise_header"]
44 | api_response_commit_data = githubCalls.get_github_enterprise_commits(
45 | user_name, repo_name, file_path, header
46 | )
47 | commit_details = format_commit_details(api_response_commit_data)
48 | except Exception as e:
49 | logger.warning(f"Github commit content formation error: {e}")
50 | commit_details = {}
51 | 
52 | secret_data.insert(0, commit_details)
53 | secret_data.insert(0, repo_name)
54 | secret_data.insert(0, user_name)
55 | secret_data.insert(0, org_url)
56 | secret_data.insert(0, skeyword)
57 | secret_data.insert(0, "xGG_Enterprise")
58 | valid_secret_row = [value for value in secret_data]
59 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
60 | now = datetime.now()
61 | valid_secret_row.append(now.year)
62 | valid_secret_row.append(now.month)
63 | valid_secret_row.append(now.day)
64 | valid_secret_row.append(now.hour)
65 | secrets_data_list.append(valid_secret_row)
66 | return secrets_data_list
67 | 
68 | 
69 | def process_search_urls(org_urls_list, url_list, search_query):
70 | """
71 | Process the search result urls and format the keyword detections
72 | params: org_urls_list - list - list of html urls to get code content
73 | params: url_list - list - list of html urls to get code content
74 | params: search_query - string
75 | returns: secrets_data_list - list - Detected secrets data
76 | """
77 | logger.debug("<<<< 'Current Executing Function' >>>>")
78 | # Processes search findings
79 | skeyword = search_query.split('"')[1].strip()
80 | secrets_data_list = []
81 | try:
82 | for url in url_list:
83 | org_url = org_urls_list[url_list.index(url)]
84 | secret_data_list = format_detection(skeyword, org_url, url)
85 | if secret_data_list:
86 | for secret_data in secret_data_list:
87 | secrets_data_list.append(secret_data)
88 | except Exception as e:
89 | logger.error(f"Total Process Search (Exception Error): {e}")
90 | return secrets_data_list
91 | 
92 | 
93 | def check_existing_detections(org_url_list, url_list, search_query):
94 | """
95 | Check whether the current urls were processed in
previous runs 96 | for each url in url list 97 | create hex hash value for the url 98 | check the url hash in previous detected urls 99 | if not present add them to further process 100 | skip if its already present in detected urls 101 | params:org_url_list - List - List of search org urls 102 | params: url_list - List - List of search result urls 103 | params: search_query - String - Search query string 104 | 105 | returns: new_urls_list - List - New url list 106 | returns: new_hashed_urls - List - New Url Hash detected 107 | """ 108 | logger.debug("<<<< 'Current Executing Function' >>>>") 109 | new_org_url_list, new_urls_list, new_hashed_urls = [], [], [] 110 | global file_prefix 111 | # Get the Already predicted hashed url list if present 112 | try: 113 | # for Reading training Data only one time 114 | if configs.hashed_urls: 115 | pass 116 | except: 117 | configs.read_hashed_url( 118 | file_name=file_prefix + "enterprise_hashed_url_custom_keywords.csv" 119 | ) 120 | 121 | if url_list: 122 | for url in url_list: 123 | url_to_hash = url + search_query 124 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 125 | new_hashed_url = [] 126 | if not hashed_url in configs.hashed_urls: 127 | new_org_url_list.append(org_url_list[url_list.index(url)]) 128 | new_urls_list.append(url) 129 | new_hashed_url.append(hashed_url) 130 | new_hashed_url.append(url) 131 | if new_hashed_url: 132 | new_hashed_urls.append(new_hashed_url) 133 | return new_org_url_list, new_urls_list, new_hashed_urls 134 | 135 | 136 | def process_search_results(search_response_lines, search_query): 137 | """ 138 | params: search_response_lines - list 139 | params: search_query - string 140 | 141 | returns: detection_writes_per_query - int - Total detections written to file 142 | returns: new_results_per_query - int - No of new urls per query 143 | returns: detections_per_query - int - No of detections per search 144 | """ 145 | logger.debug("<<<< 'Current Executing Function' >>>>") 146 | detection_writes_per_query = 0 147 | new_results_per_query = 0 148 | detections_per_query = 0 149 | new_hashed_urls = [] 150 | global file_prefix 151 | 152 | url_list, org_url_list = [], [] 153 | 154 | hashed_urls_file = os.path.join( 155 | configs.output_dir, file_prefix + "enterprise_hashed_url_custom_keywords.csv" 156 | ) 157 | for line in search_response_lines: 158 | html_url = line["html_url"] 159 | org_url_list.append(html_url) 160 | html_url = ( 161 | configs.xgg_configs["github"]["enterprise_pre_url"] 162 | + line["repository"]["full_name"] 163 | + "/contents/" 164 | + line["path"] 165 | ) 166 | url_list.append(html_url) 167 | 168 | if url_list: 169 | # Check if current url is processed in previous runs 170 | new_org_urls_list, new_urls_list, new_hashed_urls = check_existing_detections( 171 | org_url_list, url_list, search_query 172 | ) 173 | new_results_per_query = len(new_urls_list) 174 | if new_hashed_urls: 175 | secrets_detected = process_search_urls( 176 | new_org_urls_list, new_urls_list, search_query 177 | ) 178 | detections_per_query += len(secrets_detected) 179 | if secrets_detected: 180 | try: 181 | logger.debug( 182 | f"Current secrets_detected count: {len(secrets_detected)}" 183 | ) 184 | secrets_detected_df = pd.DataFrame( 185 | secrets_detected, 186 | columns=configs.xgg_configs["keywords"][ 187 | "enterprise_data_columns" 188 | ], 189 | ) 190 | detection_writes_per_query += secrets_detected_df.shape[0] 191 | try: 192 | secrets_detected_file = os.path.join( 193 | configs.output_dir, 194 | 
"xgg_enterprise_custom_keywords_detected.csv", 195 | ) 196 | write_to_csv_file(secrets_detected_df, secrets_detected_file) 197 | except Exception as e: 198 | logger.error(f"Process Error: {e}") 199 | except Exception as e: 200 | logger.error(f"keywords Dataframe creation failed. Error: {e}") 201 | secrets_detected_df = pd.DataFrame( 202 | columns=configs.xgg_configs["keywords"][ 203 | "enterprise_data_columns" 204 | ], 205 | ) 206 | 207 | else: 208 | logger.info("No keywords in current search results") 209 | 210 | try: 211 | new_hashed_urls_df = pd.DataFrame( 212 | new_hashed_urls, columns=["hashed_url", "url"] 213 | ) 214 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 215 | except Exception as e: 216 | logger.error(f"File Write error: {e}") 217 | sys.exit(1) 218 | else: 219 | logger.info( 220 | f"All {len(url_list)} urls in current search is already processed and hashed" 221 | ) 222 | else: 223 | logger.info(f"No valid html urls in the current search results to process.") 224 | return detection_writes_per_query, new_results_per_query, detections_per_query 225 | 226 | 227 | def format_search_query_list(secondary_keywords): 228 | """ 229 | Create the search query list using Secondary Keywords 230 | params: secondary_keywords - list 231 | returns: search_query_list - list 232 | """ 233 | logger.debug("<<<< 'Current Executing Function' >>>>") 234 | search_query_list = [] 235 | # Format GitHub Search Query 236 | for secondary_keyword in secondary_keywords: 237 | search_query_list.append('"' + secondary_keyword + '"') 238 | logger.info(f"Total number of items in search_query_list: {len(search_query_list)}") 239 | return search_query_list 240 | 241 | 242 | def run_detection(enterprise_keywords=[], org=[], repo=[]): 243 | """ 244 | Run GitHub search 245 | If a Enterprise keyword is provided, perform the search using the Enterprise keyword. 
242 | def run_detection(enterprise_keywords=[], org=[], repo=[]):
243 | """
244 | Run GitHub search
245 | If an Enterprise keyword is provided, perform the search using that keyword.
246 | params: enterprise_keywords - list - optional
247 | params: org - list - optional
248 | params: repo - list - optional
249 | returns: True or False
250 | 
251 | """
252 | if enterprise_keywords:
253 | if isinstance(enterprise_keywords, list):
254 | configs.secondary_keywords = enterprise_keywords
255 | else:
256 | logger.error(
257 | "Please pass enterprise_keywords as a list, e.g. ['password']"
258 | )
259 | sys.exit(1)
260 | else:
261 | # Get the enterprise_keywords from enterprise_keywords file
262 | configs.read_secondary_keywords(file_name="enterprise_keywords.csv")
263 | logger.info(f"Total Enterprise keywords : {len(configs.secondary_keywords)}")
264 | 
265 | total_search_pairs = len(configs.secondary_keywords)
266 | logger.info(f"Total Search Pairs: {total_search_pairs}")
267 | 
268 | total_processed_search, total_detection_writes = 0, 0
269 | search_query_list = []
270 | # Format GitHub Search Query List
271 | search_query_list = format_search_query_list(configs.secondary_keywords)
272 | logger.info(f"Total search_query_list count: {len(search_query_list)}")
273 | 
274 | # Loop over each search query
275 | for search_query in search_query_list:
276 | detection_writes_per_query = 0
277 | new_results_per_query = 0
278 | detections_per_query = 0
279 | logger.info(f"******* Processing Search Query: {search_query} *******")
280 | try:
281 | # Search GitHub and return the search response
282 | total_processed_search += 1
283 | # time.sleep(2)
284 | search_response_lines = githubCalls.run_github_search(
285 | search_query,
286 | "",
287 | org,
288 | repo,
289 | )
290 | # If search has detections, process the result urls else continue next search
291 | if search_response_lines:
292 | (
293 | detection_writes_per_query,
294 | new_results_per_query,
295 | detections_per_query,
296 | ) = process_search_results(
297 | search_response_lines,
298 | search_query,
299 | )
300 | logger.info(
301 | f"Detection writes in current search query: {detection_writes_per_query}"
302 | )
303 | total_detection_writes += detection_writes_per_query
304 | else:
305 | logger.info(
306 | f"Search '{search_query}' returns no results. Continuing..."
307 | ) 308 | continue 309 | except Exception as e: 310 | logger.error(f"Process Error: {e}") 311 | logger.info(f"Current Total Processed Search: {total_processed_search}") 312 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 313 | 314 | if new_results_per_query >= 0: 315 | logger.info( 316 | f"Total: {total_search_pairs} " + f"Processed: {total_processed_search} " 317 | ) 318 | 319 | return True 320 | 321 | 322 | def setup_logger(log_level=10, console_logging=True): 323 | """ 324 | Call logger create module and setup the logger for current run 325 | params: log_level - int - optional - Default - 20 - INFO 326 | params: console_logging - Boolean - optional - Enable console logging - default True 327 | """ 328 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 329 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 330 | global logger 331 | # Creates a logger 332 | logger = create_logger( 333 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 334 | ) 335 | 336 | 337 | def arg_parser(): 338 | """ 339 | Parse the command line Arguments and return the values 340 | params: None 341 | returns: enterprise_keywords - list 342 | returns: org - list 343 | returns: repo - list 344 | returns: log_level - int - Default - 20 - INFO 345 | returns: console_logging - Boolean - Default - True 346 | """ 347 | global file_prefix 348 | 349 | argparser = argparse.ArgumentParser() 350 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 351 | log_level_choices = [10, 20, 30, 40, 50] 352 | argparser.add_argument( 353 | "-e", 354 | "--enterprise_keywords", 355 | metavar="Enterprise Keywords", 356 | action="store", 357 | type=str, 358 | default="", 359 | help="Pass the Enterprise Keywords list as comma separated string", 360 | ) 361 | 362 | argparser.add_argument( 363 | "-o", 364 | "--org", 365 | metavar="Owner", 366 | action="store", 367 | type=str, 368 | default="", 369 | help="Pass the Org name list as comma separated string", 370 | ) 371 | 372 | argparser.add_argument( 373 | "-r", 374 | "--repo", 375 | metavar="Repo", 376 | action="store", 377 | type=str, 378 | default="", 379 | help="Pass the repo name list as comma separated string", 380 | ) 381 | 382 | argparser.add_argument( 383 | "-l", 384 | "--log_level", 385 | metavar="Logger Level", 386 | action="store", 387 | type=int, 388 | default=20, 389 | choices=log_level_choices, 390 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 391 | ) 392 | 393 | argparser.add_argument( 394 | "-c", 395 | "--console_logging", 396 | metavar="Console Logging", 397 | action="store", 398 | type=str, 399 | default="Yes", 400 | choices=flag_choices, 401 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 402 | ) 403 | 404 | args = argparser.parse_args() 405 | 406 | if args.enterprise_keywords: 407 | enterprise_keywords = args.enterprise_keywords.split(",") 408 | else: 409 | enterprise_keywords = [] 410 | 411 | if args.org: 412 | org = args.org.split(",") 413 | else: 414 | org = [] 415 | 416 | if args.repo: 417 | if len(org) <= 0: 418 | repo = args.repo.split(",") 419 | else: 420 | repo = [] 421 | else: 422 | repo = [] 423 | 424 | if args.log_level in log_level_choices: 425 | log_level = args.log_level 426 | else: 427 | log_level = 20 428 | if args.console_logging.lower() in flag_choices[:5]: 429 | console_logging = True 430 | else: 431 | console_logging = False 432 | 433 | return ( 434 | enterprise_keywords, 435 | org, 436 | repo, 437 | log_level, 438 | console_logging, 439 | ) 440 | 441 | 442 | if __name__ == "__main__": 443 | # Argument Parsing 444 | ( 445 | enterprise_keywords, 446 | org, 447 | repo, 448 | log_level, 449 | console_logging, 450 | ) = arg_parser() 451 | 452 | # Setting up Logger 453 | setup_logger(log_level, console_logging) 454 | 455 | logger.info("xGitGuard Custom keyword search Process Started") 456 | 457 | # Read and Setup Global Configuration Data to reference in all process 458 | configs = ConfigsData() 459 | githubCalls = GithubCalls( 460 | configs.xgg_configs["github"]["enterprise_api_url"], 461 | "enterprise", 462 | configs.xgg_configs["github"]["enterprise_commits_url"], 463 | ) 464 | 465 | # Check if the GitHub API token environment variable for "enterprise" is set 466 | valid_config, token_var = check_github_token_env("enterprise") 467 | if not valid_config: 468 | logger.error( 469 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 470 | ) 471 | sys.exit(1) 472 | 473 | run_detection(enterprise_keywords, org, repo) 474 | logger.info("xGitGuard Custom keyword search Process Completed") 475 | -------------------------------------------------------------------------------- /xgitguard/custom keyword search/public_keyword_search.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import os 4 | import sys 5 | import pandas as pd 6 | import time 7 | from datetime import datetime 8 | 9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 10 | 11 | parent_dir = os.path.dirname(MODULE_DIR) 12 | sys.path.insert(0, parent_dir) 13 | 14 | from common.configs_read import ConfigsData 15 | from common.data_format import ( 16 | format_commit_details, 17 | ) 18 | from common.github_calls import GithubCalls 19 | from common.logger import create_logger 20 | from utilities.common_utilities import check_github_token_env 21 | from utilities.file_utilities import write_to_csv_file 22 | 23 | file_prefix = "xgg_" 24 | 25 | 26 | def format_search_query_list(secondary_keywords): 27 | """ 28 | Create the search query list using Secondary Keywords 29 | params: secondary_keywords - list 30 | returns: search_query_list - list 31 | """ 32 | logger.debug("<<<< 'Current Executing Function' >>>>") 33 | search_query_list = [] 34 | # Format GitHub Search Query 35 | for secondary_keyword in secondary_keywords: 36 | search_query_list.append('"' + secondary_keyword + '"') 37 | logger.info(f"Total number of items in search_query_list: {len(search_query_list)}") 38 | return search_query_list 39 | 40 | 41 | def format_detection(skeyword, url): 42 | """ 43 | Format the data from the given content and other data 44 | params: skeyword 
- string - Secondary Keyword 45 | params: url - string - github url 46 | returns: secrets_data_list - list - List of formatted detections 47 | """ 48 | logger.debug("<<<< 'Current Executing Function' >>>>") 49 | secrets_data_list = [] 50 | secret_data = [] 51 | user_name = url.split("/")[3] 52 | repo_name = url.split("/")[4] 53 | raw_url = url.replace("raw.githubusercontent.com", "github.com") 54 | raw_url_splits = raw_url.split(repo_name) 55 | raw_url = raw_url_splits[0] + repo_name + "/blob" + raw_url_splits[1] 56 | try: 57 | file_path = "/".join(raw_url_splits[1].split("/")[2:]) 58 | api_response_commit_data = githubCalls.get_github_public_commits( 59 | user_name, repo_name, file_path 60 | ) 61 | commit_details = format_commit_details(api_response_commit_data) 62 | except Exception as e: 63 | logger.warning(f"Github commit content formation error: {e}") 64 | commit_details = {} 65 | secret_data.insert(0, commit_details) 66 | secret_data.insert(0, repo_name) 67 | secret_data.insert(0, user_name) 68 | secret_data.insert(0, raw_url) 69 | secret_data.insert(0, skeyword) 70 | secret_data.insert(0, "xGG_Public") 71 | valid_secret_row = [value for value in secret_data] 72 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 73 | now = datetime.now() 74 | valid_secret_row.append(now.year) 75 | valid_secret_row.append(now.month) 76 | valid_secret_row.append(now.day) 77 | valid_secret_row.append(now.hour) 78 | secrets_data_list.append(valid_secret_row) 79 | logger.debug(f"Current formatted secrets_data_list count: {len(secrets_data_list)}") 80 | 81 | return secrets_data_list 82 | 83 | 84 | def process_search_urls(url_list, search_query): 85 | """ 86 | params: url_list - list - list of html urls to get code content 87 | params: search_query - string 88 | returns: secrets_data_list - list - Detected secrets data 89 | """ 90 | logger.debug("<<<< 'Current Executing Function' >>>>") 91 | # Processes search findings 92 | skeyword = search_query 93 | secrets_data_list = [] 94 | try: 95 | for url in url_list: 96 | secret_data_list = format_detection( 97 | skeyword, 98 | url, 99 | ) 100 | if secret_data_list: 101 | for secret_data in secret_data_list: 102 | secrets_data_list.append(secret_data) 103 | except Exception as e: 104 | logger.error(f"Total Process Search (Exception Error): {e}") 105 | return secrets_data_list 106 | 107 | 108 | def check_existing_detections(url_list, search_query): 109 | """ 110 | Check whether the current urs where processed in previous runs 111 | for each url in url list 112 | create hex hash value for the url 113 | check the url hash in previous detected urls 114 | if not present add them to further process 115 | skip if its already present in detected urls 116 | params: url_list - List - List of search result urls 117 | params: search_query - String - Search query string 118 | returns: new_urls_list - List - New url list 119 | returns: new_hashed_urls - List - New Url Hash detected 120 | """ 121 | logger.debug("<<<< 'Current Executing Function' >>>>") 122 | new_urls_list, new_hashed_urls = [], [] 123 | global file_prefix 124 | # Get the Already predicted hashed url list if present 125 | try: 126 | # for Reading training Data only one time 127 | if configs.hashed_urls: 128 | pass 129 | except: 130 | configs.read_hashed_url( 131 | file_name=file_prefix + "public_hashed_url_custom_keywords.csv" 132 | ) 133 | if url_list: 134 | for url in url_list: 135 | url_to_hash = url + search_query 136 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 137 | 
new_hashed_url = [] 138 | if not hashed_url in configs.hashed_urls: 139 | new_urls_list.append(url) 140 | new_hashed_url.append(hashed_url) 141 | new_hashed_url.append(url) 142 | if new_hashed_url: 143 | new_hashed_urls.append(new_hashed_url) 144 | return new_urls_list, new_hashed_urls 145 | 146 | 147 | def process_search_results(search_response_lines, search_query): 148 | """ 149 | params: search_response_lines - list 150 | params: search_query - string 151 | 152 | returns: detection_writes_per_query - int - Total detections written to file 153 | returns: new_results_per_query - int - No of new urls per query 154 | returns: detections_per_query - int - No of detections per search 155 | """ 156 | logger.debug("<<<< 'Current Executing Function' >>>>") 157 | detection_writes_per_query = 0 158 | new_results_per_query = 0 159 | detections_per_query = 0 160 | new_hashed_urls = [] 161 | global file_prefix 162 | url_list = [] 163 | hashed_urls_file = os.path.join( 164 | configs.output_dir, file_prefix + "public_hashed_url_custom_keywords.csv" 165 | ) 166 | for line in search_response_lines: 167 | html_url = line["html_url"] 168 | html_url = html_url.replace("blob/", "") 169 | html_url = html_url.replace( 170 | "https://github.com", "https://raw.githubusercontent.com" 171 | ) 172 | url_list.append(html_url) 173 | if url_list: 174 | # Check if current url is processed in previous runs 175 | new_urls_list, new_hashed_urls = check_existing_detections( 176 | url_list, search_query 177 | ) 178 | new_results_per_query = len(new_urls_list) 179 | if new_hashed_urls: 180 | secrets_detected = process_search_urls(new_urls_list, search_query) 181 | detections_per_query += len(secrets_detected) 182 | if secrets_detected: 183 | try: 184 | logger.debug( 185 | f"Current secrets_detected count: {len(secrets_detected)}" 186 | ) 187 | secrets_detected_df = pd.DataFrame( 188 | secrets_detected, 189 | columns=configs.xgg_configs["keywords"]["public_data_columns"], 190 | ) 191 | detection_writes_per_query += secrets_detected_df.shape[0] 192 | try: 193 | secrets_detected_file = os.path.join( 194 | configs.output_dir, 195 | "xgg_public_custom_keywords_detected.csv", 196 | ) 197 | write_to_csv_file(secrets_detected_df, secrets_detected_file) 198 | except Exception as e: 199 | logger.error(f"Process Error: {e}") 200 | except Exception as e: 201 | logger.error(f"Keywords Dataframe creation failed. Error: {e}") 202 | secrets_detected_df = pd.DataFrame( 203 | columns=configs.xgg_configs["keywords"]["public_data_columns"], 204 | ) 205 | else: 206 | logger.info("No keywords in current search results") 207 | try: 208 | new_hashed_urls_df = pd.DataFrame( 209 | new_hashed_urls, columns=["hashed_url", "url"] 210 | ) 211 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 212 | except Exception as e: 213 | logger.error(f"File Write error: {e}") 214 | sys.exit(1) 215 | else: 216 | logger.info( 217 | f"All {len(url_list)} urls in current search is already processed and hashed" 218 | ) 219 | else: 220 | logger.info(f"No valid html urls in the current search results to process.") 221 | return detection_writes_per_query, new_results_per_query, detections_per_query 222 | 223 | 224 | def run_detection(public_keywords=[], org=[], repo=[]): 225 | """ 226 | Run GitHub search 227 | If a primary keyword is provided, perform the search using the primary keyword. 
228 | params: public_keywords - list - optional 229 | params: org - list - optional 230 | params: repo - list - optional 231 | returns: True or False 232 | """ 233 | if public_keywords: 234 | if isinstance(public_keywords, list): 235 | configs.secondary_keywords = public_keywords 236 | else: 237 | logger.error(f"Please pass public_keywords in List like '['password',]'") 238 | sys.exit(1) 239 | else: 240 | # Get the secondary_keywords from secondary_keywords file 241 | configs.read_secondary_keywords(file_name="public_keywords.csv") 242 | logger.info(f"Total Public keywords : {len(configs.secondary_keywords)}") 243 | 244 | total_search_pairs = len(configs.secondary_keywords) 245 | logger.info(f"Total Search Pairs: {total_search_pairs}") 246 | 247 | total_processed_search, total_detection_writes = 0, 0 248 | search_query_list = [] 249 | # Format GitHub Search Query List 250 | search_query_list = format_search_query_list(configs.secondary_keywords) 251 | logger.info(f"Total No.of search queries: {len(search_query_list)}") 252 | 253 | # Loop over each search query 254 | for search_query in search_query_list: 255 | detection_writes_per_query = 0 256 | new_results_per_query = 0 257 | logger.info(f"******* Processing Search Query: {search_query} *******") 258 | try: 259 | # Search GitHub and return search response confidence_score 260 | total_processed_search += 1 261 | time.sleep(2) 262 | search_response_lines = githubCalls.run_github_search( 263 | search_query, 264 | "", 265 | org, 266 | repo, 267 | ) 268 | # If search has detections, process the result urls else continue next search 269 | if search_response_lines: 270 | ( 271 | detection_writes_per_query, 272 | new_results_per_query, 273 | detections_per_query, 274 | ) = process_search_results( 275 | search_response_lines, 276 | search_query, 277 | ) 278 | logger.info( 279 | f"Detection writes in current search query: {detection_writes_per_query}" 280 | ) 281 | total_detection_writes += detection_writes_per_query 282 | else: 283 | logger.info( 284 | f"Search '{search_query}' returns no results. Continuing..." 
285 | ) 286 | continue 287 | except Exception as e: 288 | logger.error(f"Process Error: {e}") 289 | logger.info(f"Current Total Processed Search: {total_processed_search}") 290 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 291 | if new_results_per_query >= 0: 292 | logger.info( 293 | f"Total: {total_search_pairs} " + f"Processed: {total_processed_search} " 294 | ) 295 | return True 296 | 297 | 298 | def setup_logger(log_level=10, console_logging=True): 299 | """ 300 | Call logger create module and setup the logger for current run 301 | params: log_level - int - optional - Default - 20 - INFO 302 | params: console_logging - Boolean - optional - Enable console logging - default True 303 | """ 304 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 305 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 306 | global logger 307 | # Creates a logger 308 | logger = create_logger( 309 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 310 | ) 311 | 312 | 313 | def arg_parser(): 314 | """ 315 | Parse the command line Arguments and return the values 316 | params: None 317 | returns: public_keywords - list 318 | returns: org - list 319 | returns: repo - list 320 | returns: log_level - int - Default - 20 - INFO 321 | returns: console_logging - Boolean - Default - True 322 | """ 323 | global file_prefix 324 | global ml_prediction 325 | global unmask_secret 326 | argparser = argparse.ArgumentParser() 327 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 328 | log_level_choices = [10, 20, 30, 40, 50] 329 | argparser.add_argument( 330 | "-p", 331 | "--public_keywords", 332 | metavar="public keywords", 333 | action="store", 334 | type=str, 335 | default="", 336 | help="Pass the Primary Keywords list as comma separated string", 337 | ) 338 | argparser.add_argument( 339 | "-o", 340 | "--org", 341 | metavar="Owner", 342 | action="store", 343 | type=str, 344 | default="", 345 | help="Pass the Org name list as comma separated string", 346 | ) 347 | argparser.add_argument( 348 | "-r", 349 | "--repo", 350 | metavar="Repo", 351 | action="store", 352 | type=str, 353 | default="", 354 | help="Pass the repo name list as comma separated string", 355 | ) 356 | argparser.add_argument( 357 | "-l", 358 | "--log_level", 359 | metavar="Logger Level", 360 | action="store", 361 | type=int, 362 | default=20, 363 | choices=log_level_choices, 364 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 365 | ) 366 | argparser.add_argument( 367 | "-c", 368 | "--console_logging", 369 | metavar="Console Logging", 370 | action="store", 371 | type=str, 372 | default="Yes", 373 | choices=flag_choices, 374 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 375 | ) 376 | args = argparser.parse_args() 377 | if args.public_keywords: 378 | public_keywords = args.public_keywords.split(",") 379 | else: 380 | public_keywords = [] 381 | if args.org: 382 | org = args.org.split(",") 383 | else: 384 | org = [] 385 | if args.repo: 386 | if len(org) <= 0: 387 | repo = args.repo.split(",") 388 | else: 389 | repo = [] 390 | else: 391 | repo = [] 392 | if args.log_level in log_level_choices: 393 | log_level = args.log_level 394 | else: 395 | log_level = 20 396 | if args.console_logging.lower() in flag_choices[:5]: 397 | console_logging = True 398 | else: 399 | console_logging = False 400 | return ( 401 | public_keywords, 402 | org, 403 | repo, 404 | log_level, 405 | console_logging, 406 | ) 407 | 408 | 409 | if __name__ == "__main__": 410 | # Argument Parsing 411 | ( 412 | public_keywords, 413 | org, 414 | repo, 415 | log_level, 416 | console_logging, 417 | ) = arg_parser() 418 | 419 | # Setting up Logger 420 | setup_logger(log_level, console_logging) 421 | logger.info("xGitGuard Custom keyword search Process Started") 422 | 423 | # Read and Setup Global Configuration Data to reference in all process 424 | configs = ConfigsData() 425 | githubCalls = GithubCalls( 426 | configs.xgg_configs["github"]["public_api_url"], 427 | "public", 428 | configs.xgg_configs["github"]["public_commits_url"], 429 | configs.xgg_configs["github"]["throttle_time"], 430 | ) 431 | 432 | # Check if the GitHub API token environment variable for "public" is set 433 | valid_config, token_var = check_github_token_env("public") 434 | if not valid_config: 435 | logger.error( 436 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 437 | ) 438 | sys.exit(1) 439 | run_detection(public_keywords, org, repo) 440 | logger.info("xGitGuard custom keyword search Process Completed") 441 | -------------------------------------------------------------------------------- /xgitguard/file-scanner/extension_search.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | from datetime import datetime 4 | import logging 5 | import os 6 | import sys 7 | from pathlib import Path 8 | 9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 10 | parent_dir = os.path.dirname(MODULE_DIR) 11 | sys.path.insert(0, parent_dir) 12 | 13 | from common.configs_read import ConfigsData 14 | from common.logger import create_logger 15 | 16 | logger = logging.getLogger("xgg_logger") 17 | new_search = 0 18 | 19 | 20 | def write_data(data): 21 | """ 22 | Write the searched data for a given extension. 23 | 24 | Args: 25 | data (str): The file path. 26 | 27 | Returns: 28 | bool: Indicates whether the operation was successful. 29 | """ 30 | global new_search 31 | try: 32 | detected_file = os.path.join( 33 | configs.output_dir, 34 | "xgg_search_files.csv", 35 | ) 36 | if new_search != 0: 37 | with open(detected_file, "a") as f: 38 | writer = csv.writer(f) 39 | writer.writerow([data]) 40 | else: 41 | with open(detected_file, "w") as f: 42 | writer = csv.writer(f) 43 | writer.writerow(["target_file_paths"]) 44 | writer.writerow([data]) 45 | new_search = 1 46 | except Exception as e: 47 | logger.error(f"Content File Write error: {e}") 48 | return False 49 | return True 50 | 51 | 52 | def find_files(extensions=[], search_path=""): 53 | """ 54 | Run search for the given directory using extensions and return file paths where these extensions are present. 
55 | 56 | Args: 57 | extensions (list): The list of file extensions to search for. 58 | search_path (str): The file or directory path. 59 | 60 | Returns: 61 | list: A list of file paths where the specified extensions are present. 62 | """ 63 | if os.path.isfile(search_path): 64 | write_data([search_path]) 65 | return True 66 | 67 | if extensions: 68 | if isinstance(extensions, list): 69 | configs.extensions = extensions 70 | else: 71 | logger.error(f"Please pass extensions in List like '['py',]'") 72 | sys.exit() 73 | else: 74 | # Get the extensions from extensions file 75 | configs.read_extensions(file_name="extensions.csv") 76 | logger.info(f"Total Extensions: {len(configs.extensions)}") 77 | 78 | try: 79 | BASE_DIR = Path(search_path) 80 | for path in BASE_DIR.glob(r"**/*"): 81 | if path.suffix[1:] in configs.extensions: 82 | if os.path.isfile(path): 83 | write_data(path) 84 | except: 85 | logger.error(f"File search exception") 86 | 87 | 88 | def setup_logger(log_level=10, console_logging=True): 89 | """ 90 | Call the logger creation module and set up the logger for the current run. 91 | 92 | Args: 93 | log_level (int, optional): The logging level. Default is 20 (INFO). 94 | console_logging (bool, optional): Enable console logging. Default is True. 95 | """ 96 | global logger 97 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 98 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 99 | # Creates a logger 100 | logger = create_logger( 101 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 102 | ) 103 | 104 | 105 | def arg_parser(): 106 | """ 107 | Parse the command line arguments and return the values. 108 | 109 | Args: 110 | None 111 | 112 | Returns: 113 | extensions (list): The list of file extensions to search for. 114 | search_path (str): The file or directory path. 115 | log_level (int): The logging level. Default is 20 (INFO). 116 | console_logging (bool): Enable console logging. Default is True. 117 | """ 118 | argparser = argparse.ArgumentParser() 119 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 120 | log_level_choices = [10, 20, 30, 40, 50] 121 | 122 | argparser.add_argument( 123 | "-e", 124 | "--extensions", 125 | metavar="Extensions", 126 | action="store", 127 | type=str, 128 | default="", 129 | help="Pass the Extensions list as comma separated string", 130 | ) 131 | 132 | argparser.add_argument( 133 | "-p", 134 | "--search_path", 135 | metavar="Search path", 136 | action="store", 137 | type=str, 138 | default="", 139 | help="Pass the Search Path for scanner", 140 | ) 141 | 142 | argparser.add_argument( 143 | "-l", 144 | "--log_level", 145 | metavar="Logger Level", 146 | action="store", 147 | type=int, 148 | default=20, 149 | choices=log_level_choices, 150 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 151 | ) 152 | 153 | argparser.add_argument( 154 | "-c", 155 | "--console_logging", 156 | metavar="Console Logging", 157 | action="store", 158 | type=str, 159 | default="Yes", 160 | choices=flag_choices, 161 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 162 | ) 163 | 164 | args = argparser.parse_args() 165 | 166 | if args.extensions: 167 | extensions = args.extensions.split(",") 168 | else: 169 | extensions = [] 170 | 171 | if args.search_path: 172 | search_path = args.search_path 173 | else: 174 | search_path = "" 175 | 176 | if args.log_level in log_level_choices: 177 | log_level = args.log_level 178 | else: 179 | log_level = 20 180 | if args.console_logging.lower() in flag_choices[:5]: 181 | console_logging = True 182 | else: 183 | console_logging = False 184 | 185 | return ( 186 | extensions, 187 | search_path, 188 | log_level, 189 | console_logging, 190 | ) 191 | 192 | 193 | if __name__ == "__main__": 194 | # Argument Parsing 195 | ( 196 | extensions, 197 | search_path, 198 | log_level, 199 | console_logging, 200 | ) = arg_parser() 201 | 202 | try: 203 | # Setting up Logger 204 | setup_logger(log_level, console_logging) 205 | 206 | logger.info("xGitGuard File Extension Process Started") 207 | # Read and Setup Global Configuration Data to reference in all process 208 | configs = ConfigsData() 209 | 210 | if search_path: 211 | find_files(extensions, search_path) 212 | else: 213 | configs.read_search_paths(file_name="xgg_search_paths.csv") 214 | search_paths = configs.search_paths 215 | if search_paths: 216 | for search_path in search_paths: 217 | find_files(extensions, search_path) 218 | else: 219 | logger.info(f"No Search paths to process from config file. Ending.") 220 | sys.exit(1) 221 | 222 | logger.info("xGitGuard File Extension Process Completed") 223 | except Exception as e: 224 | logger.error( 225 | f"xGitGuard Secret detection process encountered an exception: {e}" 226 | ) 227 | sys.exit(1) 228 | -------------------------------------------------------------------------------- /xgitguard/github-enterprise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/github-enterprise/__init__.py -------------------------------------------------------------------------------- /xgitguard/github-public/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/github-public/__init__.py -------------------------------------------------------------------------------- /xgitguard/logs/.log_desc: -------------------------------------------------------------------------------- 1 | #directory for collecting app logs -------------------------------------------------------------------------------- /xgitguard/ml_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/enterprise_key_data_collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | """ 19 | xGitGuard Enterprise GitHub Keys and Token ML Data Collection process 20 | Steps: 21 | Get Secondary Keywords and Extension file data from config path 22 | Prepare the search query list with each Secondary Keyword 23 | Loop over each extension for each search query 24 | Search GitHub and get response data 25 | Process the response urls 26 | If url is already processed in previous runs, skip the same 27 | Get the code content for the html urls 28 | Clean the code content and extract Secrets 29 | Detect the Secrets using RegEx and format Secret records 30 | Write the cleaned and detected url data 31 | calling Examples: 32 | By default the all configuration keys will be taken from config files 33 | 34 | # Run with Secondary Keywords and extensions from config files 35 | python enterprise_key_data_collector.py 36 | """ 37 | 38 | import argparse 39 | import hashlib 40 | import math 41 | import os 42 | import re 43 | import sys 44 | import time 45 | from datetime import datetime 46 | 47 | import pandas as pd 48 | from urlextract import URLExtract 49 | 50 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 51 | parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(MODULE_DIR))) 52 | sys.path.insert(0, parent_dir) 53 | 54 | from common.github_calls import GithubCalls 55 | from common.configs_read import ConfigsData 56 | from common.data_format import keys_extractor, remove_url_from_keys 57 | from common.logger import create_logger 58 | from common.ml_process import entropy_calc 59 | from utilities.common_utilities import check_github_token_env 60 | from utilities.file_utilities import write_to_csv_file 61 | 62 | 63 | def calculate_confidence(secondary_keyword, extension, secret): 64 | """ 65 | Calculates confidence scores for given Keywords 66 | params: secondary_keyword - string 67 | params: extension - string 68 | params: secret - string - Detected secret 69 | returns: confidence score 70 | """ 71 | # logger.debug("<<<< 'Current Executing Function' >>>>") 72 | try: 73 | if not configs.confidence_values.empty: 74 | pass 75 | except: 76 | configs.read_confidence_values(file_name="confidence_values.csv") 77 | 78 | try: 79 | if not configs.dictionary_words.empty: 80 | pass 81 | except: 82 | # Get the dictionary_words from dictionary words file 83 | configs.read_dictionary_words(file_name="dictionary_words.csv") 84 | logger.info( 85 | "Reading 
dictionary_words.csv file completed. Proceeding for search result processing" 86 | ) 87 | 88 | secondary_keyword_value = int( 89 | configs.confidence_values.loc[secondary_keyword]["value"] 90 | ) 91 | 92 | try: 93 | extension_value = int(configs.confidence_values.loc[extension]["value"]) 94 | except: 95 | extension = 0 96 | extension_value = 0 97 | 98 | entro = entropy_calc(list(secret)) 99 | d_match = configs.dict_words_ct * configs.dict_words_vc.transform([secret]).T 100 | 101 | return [sum([secondary_keyword_value, extension_value]), entro, d_match[0]] 102 | 103 | 104 | def format_detection(skeyword, org_url, url, code_content, secrets, skeyword_count): 105 | """ 106 | Format the secret data from the given code content and other data 107 | Format the secrets data in the required format 108 | Calculate the secrets confidence values 109 | Return the final formatted detections 110 | 111 | params: skeyword - string - Secondary Keyword 112 | params: org_url - string - github url 113 | params: url - string - github url 114 | params: code_content - list - User code content 115 | params: secrets - list - Detected secrets list 116 | params: skeyword_count - int - secondary keyword count 117 | returns: secrets_data_list - list - List of formatted detections 118 | """ 119 | logger.debug("<<<< 'Current Executing Function' >>>>") 120 | valid_secret = False 121 | secrets_data_list = [] 122 | secret_data = [] 123 | 124 | extension = org_url.split(".")[-1] 125 | user_name = org_url.split("/")[3] 126 | repo_name = org_url.split("/")[4] 127 | 128 | secret_data.insert(0, repo_name) 129 | secret_data.insert(0, user_name) 130 | secret_data.insert(0, org_url) 131 | secret_data.insert(0, extension) 132 | secret_data.insert(0, skeyword) 133 | secret_data.insert(0, "xGG_Enterprise_Key & Token") 134 | logger.debug("<<<< 'Current Executing Function calculate_confidence loop' >>>>") 135 | for secret in secrets: 136 | # Calculate confidence values for detected secrets 137 | confidence_score = calculate_confidence(skeyword, extension, secret) 138 | 139 | if confidence_score[1] > 1.5: 140 | valid_secret_row = [value for value in secret_data] 141 | secret_lines = re.findall(".*" + secret + ".*$", code_content, re.MULTILINE) 142 | code_line = secret 143 | for secret_line in secret_lines: 144 | if ( 145 | (skeyword in secret_line) 146 | and (secret_line != secret) 147 | and not ( 148 | [ 149 | element 150 | for element in ["http", "www", "uuid"] 151 | if (element in secret_line) 152 | ] 153 | ) 154 | and (secret_line.find(skeyword) < secret_line.find(secret)) 155 | ): 156 | if len(secret_line) < 300: 157 | code_line = secret_line 158 | valid_secret_row.append(secret) 159 | valid_secret = True 160 | break 161 | if valid_secret: 162 | valid_secret_row.append(code_line) 163 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 164 | valid_secret_row.append(confidence_score[0]) 165 | count_score = math.log2(50) / (math.log2(skeyword_count + 1) + 1) 166 | valid_secret_row.append(count_score) 167 | valid_secret_row.append(confidence_score[1]) 168 | d_match = math.log2(100) / (math.log2(confidence_score[2] + 1) + 1) 169 | valid_secret_row.append(d_match) 170 | valid_secret_row.append( 171 | confidence_score[0] + confidence_score[1] + count_score + d_match 172 | ) 173 | now = datetime.now() 174 | valid_secret_row.append(now.year) 175 | valid_secret_row.append(now.month) 176 | valid_secret_row.append(now.day) 177 | valid_secret_row.append(now.hour) 178 | secrets_data_list.append(valid_secret_row) 179 | 
valid_secret = False 180 | logger.debug(f"Current formatted secrets_data_list count: {len(secrets_data_list)}") 181 | # logger.debug(f"secrets_data_list: {secrets_data_list}") 182 | return secrets_data_list 183 | 184 | 185 | def process_search_urls(org_urls_list, url_list, search_query): 186 | """ 187 | Process the search html urls as below 188 | Get code content from GitHub for the html url 189 | Remove Url data from code content 190 | Extract secret values using regex 191 | Format the secrets detected 192 | Return the secrets detected 193 | 194 | params: org_urls_list - list - list of original html urls 195 | params: url_list - list - list of content api urls to get code content 196 | params: search_query - string 197 | returns: secrets_data_list - list - Detected secrets data 198 | """ 199 | logger.debug("<<<< 'Current Executing Function' >>>>") 200 | # Processes search findings 201 | skeyword = search_query.split('"')[1].strip() 202 | secrets_data_list = [] 203 | extractor = URLExtract() 204 | try: 205 | for url in url_list: 206 | header = configs.xgg_configs["github"]["enterprise_header"] 207 | code_content_response = githubCalls.enterprise_url_content_get(url, header) 208 | if code_content_response: 209 | code_content = code_content_response.text 210 | else: 211 | logger.debug("No response for url content get call") 212 | continue 213 | 214 | try: 215 | url_file_extension = url.split(".")[-1] 216 | url_counts = extractor.find_urls(code_content) 217 | if len(url_counts) > 30 or url_file_extension == "md": 218 | logger.debug( 219 | f"Skipping URL extraction from code content as url count is beyond 30 or file is markdown: {len(url_counts)}" 220 | ) 221 | continue 222 | except Exception as e: 223 | logger.debug( 224 | "Skipping URL extraction from code content due to extraction error/limits" 225 | ) 226 | continue 227 | 228 | lines = code_content.split("\n") 229 | if len(lines) <= 2: 230 | logger.debug( 231 | f"Skipping processing as code content has fewer than 3 lines: {len(lines)}" 232 | ) 233 | continue 234 | 235 | code_contents = remove_url_from_keys(code_content) 236 | secrets_data = keys_extractor(code_contents) 237 | 238 | skeyword_count = code_content.lower().count(skeyword.lower()) 239 | 240 | if len(secrets_data) >= 1 and len(secrets_data) <= 20: 241 | org_url = org_urls_list[url_list.index(url)] 242 | secret_data_list = format_detection( 243 | skeyword, org_url, url, code_content, secrets_data, skeyword_count 244 | ) 245 | if secret_data_list: 246 | for secret_data in secret_data_list: 247 | secrets_data_list.append(secret_data) 248 | else: 249 | logger.debug( 250 | f"Skipping secrets_data as length is not between 1 and 20. 
Length: {len(secrets_data)}" 251 | ) 252 | except Exception as e: 253 | logger.error(f"Total Process Search (Exception Error): {e}") 254 | return secrets_data_list 255 | 256 | 257 | def check_existing_detections(org_url_list, url_list, search_query): 258 | """ 259 | Check whether the current urls were processed in previous runs 260 | for each url in url list 261 | create hex hash value for the url 262 | check the url hash in previous detected urls 263 | if not present add them to further process 264 | skip if it's already present in detected urls 265 | 266 | params: org_url_list - List - List of original result urls 267 | params: url_list - List - List of search result urls 268 | params: search_query - String - Search query string 269 | 270 | returns: new_org_url_list - List - New original url list 271 | returns: new_urls_list - List - New url list 272 | returns: new_hashed_urls - List - New Url Hash detected 273 | """ 274 | logger.debug("<<<< 'Current Executing Function' >>>>") 275 | 276 | new_org_url_list, new_urls_list, new_hashed_urls = [], [], [] 277 | 278 | # Get the Already predicted hashed url list if present 279 | try: 280 | # Read training data only one time 281 | if configs.hashed_urls: 282 | pass 283 | except: 284 | configs.read_hashed_url(file_name="train_enterprise_hashed_url_keys.csv") 285 | 286 | if url_list: 287 | for url in url_list: 288 | url_to_hash = url + search_query 289 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 290 | new_hashed_url = [] 291 | if hashed_url not in configs.hashed_urls: 292 | new_org_url_list.append(org_url_list[url_list.index(url)]) 293 | new_urls_list.append(url) 294 | new_hashed_url.append(hashed_url) 295 | new_hashed_url.append(url) 296 | if new_hashed_url: 297 | new_hashed_urls.append(new_hashed_url) 298 | return new_org_url_list, new_urls_list, new_hashed_urls 299 | 300 | 301 | def process_search_results(search_response_lines, search_query): 302 | """ 303 | For each search response item, process as below 304 | Get the html urls from the search response 305 | Check if the current url is already processed 306 | if not processed, continue processing, 
else skip the url and proceed 307 | Get the user code content for the html url 308 | Format and clean the code content 309 | Find the secrets 310 | Format and write data 311 | Write the hashed urls to file 312 | 313 | params: search_response_lines - list 314 | params: search_query - string 315 | 316 | returns: detection_writes_per_query - int - Total detections written to file 317 | returns: new_results_per_query - int - No of new urls per query 318 | returns: detections_per_query - int - No of detections per search 319 | """ 320 | logger.debug("<<<< 'Current Executing Function' >>>>") 321 | detection_writes_per_query = 0 322 | new_results_per_query = 0 323 | detections_per_query = 0 324 | new_hashed_urls = [] 325 | 326 | url_list, org_url_list = [], [] 327 | 328 | hashed_urls_file = os.path.join( 329 | configs.output_dir, "train_enterprise_hashed_url_keys.csv" 330 | ) 331 | for line in search_response_lines: 332 | html_url = line["html_url"] 333 | org_url_list.append(html_url) 334 | html_url = ( 335 | configs.xgg_configs["github"]["enterprise_pre_url"] 336 | + line["repository"]["full_name"] 337 | + "/contents/" 338 | + line["path"] 339 | ) 340 | url_list.append(html_url) 341 | 342 | if url_list: 343 | # Check if current url is processed in previous runs 344 | new_org_urls_list, new_urls_list, new_hashed_urls = check_existing_detections( 345 | org_url_list, url_list, search_query 346 | ) 347 | new_results_per_query = len(new_urls_list) 348 | if new_hashed_urls: 349 | secrets_detected = process_search_urls( 350 | new_org_urls_list, new_urls_list, search_query 351 | ) 352 | detections_per_query += len(secrets_detected) 353 | if secrets_detected: 354 | try: 355 | logger.debug( 356 | f"Current secrets_detected count: {len(secrets_detected)}" 357 | ) 358 | # logger.debug(f"secrets_detected: {secrets_detected}") 359 | secrets_detected_df = pd.DataFrame( 360 | secrets_detected, 361 | columns=configs.xgg_configs["secrets"][ 362 | "enterprise_data_collector_columns" 363 | ], 364 | ) 365 | except Exception as e: 366 | logger.error( 367 | f"secrets_detected Dataframe creation failed. Error: {e}" 368 | ) 369 | secrets_detected_df = pd.DataFrame( 370 | columns=configs.xgg_configs["secrets"][ 371 | "enterprise_data_collector_columns" 372 | ], 373 | ) 374 | if not secrets_detected_df.empty: 375 | secrets_detected_df = secrets_detected_df[ 376 | [ 377 | "Secret", 378 | "Second_Key", 379 | "Extension", 380 | "Code", 381 | "Key_Weight", 382 | "SKey_Count", 383 | "Entropy", 384 | "Dictionary_Similarity", 385 | "Score", 386 | ] 387 | ] 388 | secrets_detected_df["Label"] = 1 389 | if not secrets_detected_df.empty: 390 | detection_writes_per_query += secrets_detected_df.shape[0] 391 | logger.debug( 392 | f"Current secrets_detected_df count: {secrets_detected_df.shape[0]}" 393 | ) 394 | try: 395 | secrets_detected_file = os.path.join( 396 | configs.output_dir, "key_train_source.csv" 397 | ) 398 | write_to_csv_file( 399 | secrets_detected_df, secrets_detected_file 400 | ) 401 | except Exception as e: 402 | logger.error(f"Process Error: {e}") 403 | 404 | else: 405 | logger.debug( 406 | "secrets_detected_df is empty. So skipping collection." 
407 | ) 408 | else: 409 | logger.info("No Secrets in current search results") 410 | 411 | try: 412 | new_hashed_urls_df = pd.DataFrame( 413 | new_hashed_urls, columns=["hashed_url", "url"] 414 | ) 415 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 416 | except Exception as e: 417 | logger.error(f"File Write error: {e}") 418 | sys.exit(1) 419 | else: 420 | logger.info( 421 | f"All {len(url_list)} urls in current search are already processed and hashed" 422 | ) 423 | else: 424 | logger.info("No valid html urls in the current search results to process.") 425 | return detection_writes_per_query, new_results_per_query, detections_per_query 426 | 427 | 428 | def format_search_query_list(secondary_keywords): 429 | """ 430 | Create the search query list using Secondary Keywords 431 | params: secondary_keywords - list 432 | returns: search_query_list - list 433 | """ 434 | logger.debug("<<<< 'Current Executing Function' >>>>") 435 | search_query_list = [] 436 | # Format GitHub Search Query 437 | for secondary_keyword in secondary_keywords: 438 | search_query_list.append('"' + secondary_keyword + '"') 439 | logger.info(f"Total search_query_list count: {len(search_query_list)}") 440 | return search_query_list 441 | 442 | 443 | def run_data_collector(secondary_keywords=[], extensions=[]): 444 | """ 445 | Run GitHub detections 446 | Run search with Secondary Keywords and extension combination 447 | Steps: 448 | Get Secondary Keywords and Extension file data from config path 449 | Prepare the search query list with each Secondary Keyword 450 | Loop over each extension for each search query 451 | Search GitHub and get response data 452 | Process the response urls 453 | If url is already processed in previous runs, skip the same 454 | Get the code content for the html urls 455 | Clean the code content and extract secrets 456 | Detect the secrets using RegEx and format secret records 457 | Write the cleaned and detected secret data 458 | 459 | params: secondary_keywords - list - optional 460 | params: extensions - list - optional 461 | returns: True or False 462 | 463 | Examples: 464 | Run for Data collection for preparing model Features 465 | run_data_collector() 466 | 467 | Run for given Secondary Keyword and extension 468 | run_data_collector(secondary_keywords=["auth"], extensions=["py"]) 469 | 470 | Run with Secondary Keywords from the config file and a given list of extensions 471 | run_data_collector(extensions=["py", "txt"]) 472 | """ 473 | logger.debug("<<<< 'Current Executing Function' >>>>") 474 | 475 | if secondary_keywords: 476 | if isinstance(secondary_keywords, list): 477 | configs.secondary_keywords = secondary_keywords 478 | else: 479 | logger.error("Please pass secondary_keywords as a list like ['token']") 480 | sys.exit(1) 481 | else: 482 | # Get the secondary_keywords from secondary_keywords file 483 | configs.read_secondary_keywords(file_name="secondary_keys.csv") 484 | logger.info(f"Total Secondary Keywords: {len(configs.secondary_keywords)}") 485 | 486 | if extensions: 487 | if isinstance(extensions, list): 488 | configs.extensions = extensions 489 | else: 490 | logger.error("Please pass extensions as a list like ['py']") 491 | sys.exit(1) 492 | else: 493 | # Get the extensions from extensions file 494 | configs.read_extensions(file_name="extensions.csv") 495 | logger.info(f"Total Extensions: {len(configs.extensions)}") 496 | 497 | total_search_pairs = len(configs.secondary_keywords) * len(configs.extensions) 498 | logger.info(f"Total 
Search Pairs: {total_search_pairs}") 499 | 500 | total_processed_search, total_detection_writes = 0, 0 501 | search_query_list = [] 502 | # Format GitHub Search Query List 503 | search_query_list = format_search_query_list(configs.secondary_keywords) 504 | if not search_query_list: 505 | logger.info(f"No Search query to process. Ending.") 506 | sys.exit(1) 507 | 508 | # Loop over each extension for each search query 509 | for extension in configs.extensions: 510 | for search_query in search_query_list: 511 | detection_writes_per_query = 0 512 | new_results_per_query = 0 513 | detections_per_query = 0 514 | logger.info( 515 | f"******* Processing Search Query: '{search_query} extension:{extension}' *******" 516 | ) 517 | try: 518 | # Search GitHub and return search response confidence_score 519 | total_processed_search += 1 520 | # time.sleep(2) 521 | search_response_lines = githubCalls.run_github_search( 522 | search_query, 523 | extension, 524 | ) 525 | # If search has detections, process the result urls else continue next search 526 | if search_response_lines: 527 | ( 528 | detection_writes_per_query, 529 | new_results_per_query, 530 | detections_per_query, 531 | ) = process_search_results(search_response_lines, search_query) 532 | logger.info( 533 | f"Detection writes in current search query: {detection_writes_per_query}" 534 | ) 535 | total_detection_writes += detection_writes_per_query 536 | else: 537 | # time.sleep(2) 538 | logger.info( 539 | f"Search '{search_query}' returns no results. Continuing..." 540 | ) 541 | continue 542 | except Exception as e: 543 | logger.error(f"Process Error: {e}") 544 | logger.info(f"Current Total Processed Search: {total_processed_search}") 545 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 546 | 547 | if new_results_per_query >= 0: 548 | logger.info( 549 | f"Total: {total_search_pairs} " 550 | + f"Processed: {total_processed_search} " 551 | + f"Detected: {detections_per_query} " 552 | + f"Total Writes: {detection_writes_per_query} " 553 | + f"Count URL: {new_results_per_query}" 554 | ) 555 | 556 | logger.info(f"Total Processed Search: {total_processed_search}") 557 | logger.info(f"Total Detections Write: {total_detection_writes}") 558 | return True 559 | 560 | 561 | def setup_logger(log_level=10, console_logging=True): 562 | """ 563 | Call logger create module and setup the logger for current run 564 | params: log_level - int - optional - Default - 20 - INFO 565 | params: console_logging - Boolean - optional - Enable console logging - default True 566 | """ 567 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 568 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 569 | global logger 570 | # Creates a logger 571 | logger = create_logger( 572 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 573 | ) 574 | 575 | 576 | def arg_parser(): 577 | """ 578 | Parse the command line Arguments and return the values 579 | params: None 580 | returns: secondary_keywords - list 581 | returns: extensions - list 582 | returns: log_level - int - Default - 20 - INFO 583 | returns: console_logging - Boolean - Default - True 584 | """ 585 | 586 | argparser = argparse.ArgumentParser() 587 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 588 | log_level_choices = [10, 20, 30, 40, 50] 589 | argparser.add_argument( 590 | "-s", 591 | "--secondary_keywords", 592 | metavar="Secondary Keywords", 593 | 
action="store", 594 | type=str, 595 | default="", 596 | help="Pass the Secondary Keywords list as comma separated string", 597 | ) 598 | argparser.add_argument( 599 | "-e", 600 | "--extensions", 601 | metavar="Extensions", 602 | action="store", 603 | type=str, 604 | default="", 605 | help="Pass the Extensions list as comma separated string", 606 | ) 607 | 608 | argparser.add_argument( 609 | "-l", 610 | "--log_level", 611 | metavar="Logger Level", 612 | action="store", 613 | type=int, 614 | default=20, 615 | choices=log_level_choices, 616 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 617 | ) 618 | 619 | argparser.add_argument( 620 | "-c", 621 | "--console_logging", 622 | metavar="Console Logging", 623 | action="store", 624 | type=str, 625 | default="Yes", 626 | choices=flag_choices, 627 | help="Pass the Console Logging as Yes or No. Default is Yes", 628 | ) 629 | 630 | args = argparser.parse_args() 631 | 632 | if args.secondary_keywords: 633 | secondary_keywords = args.secondary_keywords.split(",") 634 | else: 635 | secondary_keywords = [] 636 | if args.extensions: 637 | extensions = args.extensions.split(",") 638 | else: 639 | extensions = [] 640 | 641 | if args.log_level in log_level_choices: 642 | log_level = args.log_level 643 | else: 644 | log_level = 20 645 | if args.console_logging.lower() in flag_choices[:5]: 646 | console_logging = True 647 | else: 648 | console_logging = False 649 | 650 | return secondary_keywords, extensions, log_level, console_logging 651 | 652 | 653 | if __name__ == "__main__": 654 | # Argument Parsing 655 | ( 656 | secondary_keywords, 657 | extensions, 658 | log_level, 659 | console_logging, 660 | ) = arg_parser() 661 | 662 | # Setting up Logger 663 | setup_logger(log_level, console_logging) 664 | 665 | logger.info("xGitGuard Enterprise Keys and Token Data Collection Process Started") 666 | 667 | configs = ConfigsData() 668 | githubCalls = GithubCalls( 669 | configs.xgg_configs["github"]["enterprise_api_url"], 670 | "enterprise", 671 | configs.xgg_configs["github"]["enterprise_commits_url"], 672 | ) 673 | 674 | valid_config, token_var = check_github_token_env("enterprise") 675 | if not valid_config: 676 | logger.error( 677 | f"GitHub API Token Environment variable '{token_var}' is not set. API Search will fail/return no results. Please Setup and retry" 678 | ) 679 | sys.exit(1) 680 | 681 | run_data_collector(secondary_keywords, extensions) 682 | 683 | logger.info("xGitGuard Enterprise Keys and Token Data Collection Process Completed") 684 | -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-public-ml-data_collector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/github-public-ml-data_collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_feature_engineering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import argparse 20 | import logging 21 | import os 22 | import sys 23 | from datetime import datetime 24 | 25 | import pandas as pd 26 | 27 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 28 | parent_dir = os.path.dirname(MODULE_DIR) 29 | sys.path.append(parent_dir) 30 | 31 | from common.logger import create_logger 32 | from utilities.common_utilities import is_num_present, is_uppercase_present 33 | from utilities.file_utilities import read_csv_file, write_to_csv_file 34 | 35 | logger = logging.getLogger("xgg_logger") 36 | 37 | 38 | def get_training_data(file_name): 39 | """ 40 | Read the given training data file or default training data file and return the training data 41 | params: file_name - string - Training data file name 42 | returns: training_data - Dataframe 43 | """ 44 | logger.debug("<<<< 'Current Executing Function' >>>>") 45 | if file_name: 46 | output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | training_data_file = os.path.join(output_dir, file_name) 50 | if os.path.exists(training_data_file): 51 | logger.debug(f"Reading Training data from file: {training_data_file}") 52 | training_data = read_csv_file(training_data_file, output="dataframe") 53 | else: 54 | logger.error( 55 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 56 | ) 57 | raise Exception( 58 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 59 | ) 60 | else: 61 | logger.error( 62 | "Training data file is not given. Please pass the input training Data file" 63 | ) 64 | raise Exception( 65 | "Training data file is not given. Please pass the input training Data file" 66 | ) 67 | 68 | return training_data 69 | 70 | 71 | def xgg_engineer_model(training_source_data_file, training_data_file=""): 72 | """ 73 | Get clean data and engineer the model features. 
74 | params: training_source_data_file - string - file path 75 | params: training_data_file - string - file path - optional 76 | returns: None 77 | """ 78 | logger.debug("<<<< 'Current Executing Function' >>>>") 79 | logger.info("xGitGuard Feature Engineering started") 80 | train_data = get_training_data(training_source_data_file) 81 | train_data["Len_Key"] = train_data.apply(lambda x: len(x["Secret"]), axis=1) 82 | train_data["Len_Code"] = train_data.apply(lambda x: len(x["Code"]), axis=1) 83 | train_data["Has_Digit"] = train_data.apply( 84 | lambda x: is_num_present(x["Secret"]), axis=1 85 | ) 86 | train_data["Has_Cap"] = train_data.apply( 87 | lambda x: is_uppercase_present(x["Secret"]), axis=1 88 | ) 89 | train_data = train_data.drop(["Secret", "Code"], axis=1) 90 | 91 | train_data = pd.get_dummies(train_data) 92 | if not train_data.empty: 93 | try: 94 | output_dir = os.path.abspath( 95 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 96 | ) 97 | training_src_file = os.path.join(output_dir, training_data_file) 98 | write_to_csv_file(train_data, training_src_file, write_mode="overwrite") 99 | except Exception as e: 100 | logger.error(f"Process Error: {e}") 101 | else: 102 | logger.error(f"Empty Training source data") 103 | logger.info("xGitGuard Feature Engineering Ended") 104 | 105 | 106 | def setup_logger(run_mode="training", log_level=10, console_logging=True): 107 | """ 108 | Call logger create module and setup the logger for current run 109 | params: run_mode - str - optional - Default - training 110 | params: log_level - int - optional - Default - 20 - INFO 111 | params: console_logging - Boolean - optional - Enable console logging - default True 112 | """ 113 | 114 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 115 | log_file_name = f"{run_mode}_{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 116 | 117 | global logger 118 | # Creates a logger 119 | logger = create_logger( 120 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 121 | ) 122 | 123 | 124 | def arg_parser(): 125 | """ 126 | Parse the command line Arguments and return the values 127 | params: None 128 | returns: data_type - string 129 | returns: source_data - string - Default - enterprise 130 | returns: log_level - int - Default - 20 - INFO 131 | returns: console_logging - Boolean - Default - True 132 | """ 133 | 134 | argparser = argparse.ArgumentParser() 135 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 136 | log_level_choices = [10, 20, 30, 40, 50] 137 | argparser.add_argument( 138 | "data_type", 139 | metavar="Data_Type", 140 | action="store", 141 | type=str, 142 | choices=["key", "cred"], 143 | help="Pass the Data_Type as cred or key", 144 | ) 145 | argparser.add_argument( 146 | "-s", 147 | "--source_data", 148 | metavar="Source Data", 149 | action="store", 150 | type=str, 151 | default="enterprise", 152 | choices=["enterprise", "public"], 153 | help="Pass the source of data as public or enterprise. Default is enterprise", 154 | ) 155 | 156 | argparser.add_argument( 157 | "-l", 158 | "--log_level", 159 | metavar="Logger Level", 160 | action="store", 161 | type=int, 162 | default=20, 163 | choices=log_level_choices, 164 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. 
Default is 20", 165 | ) 166 | argparser.add_argument( 167 | "-c", 168 | "--console_logging", 169 | metavar="Console Logging", 170 | action="store", 171 | type=str, 172 | default="Yes", 173 | choices=flag_choices, 174 | help="Pass the Console Logging as Yes or No. Default is Yes", 175 | ) 176 | 177 | args = argparser.parse_args() 178 | 179 | if args.data_type: 180 | data_type = args.data_type.lower() 181 | else: 182 | logger.error(f"No Data Type is passed in comand line.") 183 | sys.exit(1) 184 | 185 | if args.source_data: 186 | source_data = args.source_data.lower() 187 | else: 188 | logger.error(f"No Source Data is passed in command line.") 189 | sys.exit(1) 190 | 191 | if args.log_level in log_level_choices: 192 | log_level = args.log_level 193 | else: 194 | log_level = 20 195 | if args.console_logging.lower() in flag_choices[:5]: 196 | console_logging = True 197 | else: 198 | console_logging = False 199 | 200 | return data_type, source_data, log_level, console_logging 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | ( 206 | data_type, 207 | source_data, 208 | log_level, 209 | console_logging, 210 | ) = arg_parser() 211 | 212 | run_mode = source_data + "_" + data_type 213 | setup_logger(run_mode, log_level, console_logging) 214 | 215 | logger.info(f"{run_mode.upper()} Feature Engineering process Started") 216 | output_dir = os.path.abspath( 217 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 218 | ) 219 | training_source_data_file, training_data_file = "", "" 220 | if source_data == "public": 221 | if data_type == "cred": 222 | 223 | public_cred_src_file = os.path.join( 224 | output_dir, "public_cred_train_source.csv" 225 | ) 226 | if os.path.exists(public_cred_src_file): 227 | logger.info( 228 | "Using public cred source data to engineer for public model" 229 | ) 230 | training_source_data_file = "public_cred_train_source.csv" 231 | training_data_file = "public_cred_train.csv" 232 | 233 | else: 234 | logger.error( 235 | f"Cred Training source data file for engineering not found" 236 | ) 237 | elif data_type == "key": 238 | public_key_src_file = os.path.join( 239 | output_dir, "public_key_train_source.csv" 240 | ) 241 | if os.path.exists(public_key_src_file): 242 | logger.info("Using public key source data to engineer for public model") 243 | 244 | training_source_data_file = "public_key_train_source.csv" 245 | training_data_file = "public_key_train.csv" 246 | else: 247 | logger.error(f"Key Training source data file for engineering not found") 248 | else: 249 | if data_type == "cred": 250 | logger.info( 251 | "Using enterprise cred source data to engineer for enterprise model" 252 | ) 253 | training_source_data_file = "cred_train_source.csv" 254 | training_data_file = "cred_train.csv" 255 | 256 | elif data_type == "key": 257 | logger.info( 258 | "Using enterprise key source data to engineer for enterprise model" 259 | ) 260 | training_source_data_file = "key_train_source.csv" 261 | training_data_file = "key_train.csv" 262 | 263 | if training_source_data_file and training_data_file: 264 | xgg_engineer_model( 265 | training_source_data_file=training_source_data_file, 266 | training_data_file=training_data_file, 267 | ) 268 | -------------------------------------------------------------------------------- /xgitguard/ml_training/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may 
not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | import argparse 19 | import logging 20 | import os 21 | import sys 22 | from datetime import datetime 23 | 24 | from sklearn import metrics 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.model_selection import train_test_split 27 | 28 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 29 | parent_dir = os.path.dirname(MODULE_DIR) 30 | sys.path.append(parent_dir) 31 | 32 | from common.logger import create_logger 33 | from utilities.file_utilities import read_csv_file, write_pickle_file 34 | 35 | logger = logging.getLogger("xgg_logger") 36 | 37 | 38 | def get_training_data(file_name): 39 | """ 40 | Read the given training data file or default training data file and return the training data 41 | params: file_name - string - Training data file name 42 | returns: training_data - Dataframe 43 | """ 44 | logger.debug("<<<< 'Current Executing Function' >>>>") 45 | if file_name: 46 | output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | training_data_file = os.path.join(output_dir, file_name) 50 | if os.path.exists(training_data_file): 51 | logger.debug(f"Reading Training data from file: {training_data_file}") 52 | training_data = read_csv_file(training_data_file, output="dataframe") 53 | else: 54 | logger.error( 55 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 56 | ) 57 | raise Exception( 58 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 59 | ) 60 | else: 61 | logger.error( 62 | "Training data file is not given. Please pass the input training Data file" 63 | ) 64 | raise Exception( 65 | "Training data file is not given. Please pass the input training Data file" 66 | ) 67 | 68 | return training_data 69 | 70 | 71 | def train_and_test_model(training_data): 72 | """ 73 | Train the model with training data and test the model. 74 | params: training_data - dataframe - Training Data 75 | returns: rf - object - Trained model 76 | """ 77 | logger.debug("<<<< 'Current Executing Function' >>>>") 78 | # Get Training Data 79 | x = training_data.drop(columns="Label", axis=1) 80 | # target variable 81 | y = training_data["Label"] 82 | 83 | if training_data.shape[0] < 2: 84 | logger.error( 85 | "Collect and add more training data for model creation. Minimum 2 rows required" 86 | ) 87 | raise Exception( 88 | "Collect and add more training data for model creation. 
Minimum 2 rows required" 89 | ) 90 | 91 | x_train, x_test, y_train, y_test = train_test_split( 92 | x, y, test_size=0.3, random_state=123 93 | ) 94 | 95 | rf = RandomForestClassifier(n_estimators=500, max_depth=3) 96 | rf.fit(x_train, y_train) 97 | 98 | y_pred = rf.predict(x_test) 99 | 100 | logger.debug("Detection Validation model is trained.") 101 | logger.debug(f"Random Forest Accuracy:{metrics.accuracy_score(y_test, y_pred)}") 102 | logger.debug(f"Precision: {metrics.precision_score(y_test, y_pred)}") 103 | logger.debug(f"Recall: {metrics.recall_score(y_test, y_pred)}") 104 | logger.debug(f"F1 Score: {metrics.f1_score(y_test, y_pred)}") 105 | 106 | return rf 107 | 108 | 109 | def xgg_train_model(training_data_file, model_name=""): 110 | """ 111 | Get trainind data and Train the Model. Test and persist the model 112 | params: training_data_file - string - file path 113 | returns: None 114 | """ 115 | logger.debug("<<<< 'Current Executing Function' >>>>") 116 | logger.info("xGitGuard Model Training started") 117 | training_data = get_training_data(training_data_file) 118 | ml_model = train_and_test_model(training_data) 119 | output_dir = os.path.abspath( 120 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 121 | ) 122 | model_file = os.path.join(output_dir, model_name + "model_object.pickle") 123 | write_pickle_file(object=ml_model, object_file=model_file) 124 | logger.info("xGitGuard Model Training Ended") 125 | 126 | 127 | def setup_logger(run_mode="training", log_level=10, console_logging=True): 128 | """ 129 | Call logger create module and setup the logger for current run 130 | params: run_mode - str - optional - Default - training 131 | params: log_level - int - optional - Default - 20 - INFO 132 | params: console_logging - Boolean - optional - Enable console logging - default True 133 | """ 134 | 135 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 136 | log_file_name = f"{run_mode}_{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 137 | 138 | global logger 139 | # Creates a logger 140 | logger = create_logger( 141 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 142 | ) 143 | 144 | 145 | def arg_parser(): 146 | """ 147 | Parse the command line Arguments and return the values 148 | params: None 149 | returns: data_type - string 150 | returns: source_data - string - Default - enterprise 151 | returns: log_level - int - Default - 20 - INFO 152 | returns: console_logging - Boolean - Default - True 153 | """ 154 | 155 | argparser = argparse.ArgumentParser() 156 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 157 | log_level_choices = [10, 20, 30, 40, 50] 158 | argparser.add_argument( 159 | "data_type", 160 | metavar="Data_Type", 161 | action="store", 162 | type=str, 163 | choices=["key", "cred"], 164 | help="Pass the Data_Type as cred or key", 165 | ) 166 | argparser.add_argument( 167 | "-s", 168 | "--source_data", 169 | metavar="Source Data", 170 | action="store", 171 | type=str, 172 | default="enterprise", 173 | choices=["enterprise", "public"], 174 | help="Pass the source of data as public or enterprise. Default is enterprise", 175 | ) 176 | 177 | argparser.add_argument( 178 | "-l", 179 | "--log_level", 180 | metavar="Logger Level", 181 | action="store", 182 | type=int, 183 | default=20, 184 | choices=log_level_choices, 185 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. 
Default is 20", 186 | ) 187 | argparser.add_argument( 188 | "-c", 189 | "--console_logging", 190 | metavar="Console Logging", 191 | action="store", 192 | type=str, 193 | default="Yes", 194 | choices=flag_choices, 195 | help="Pass the Console Logging as Yes or No. Default is Yes", 196 | ) 197 | 198 | args = argparser.parse_args() 199 | 200 | if args.data_type: 201 | data_type = args.data_type.lower() 202 | else: 203 | logger.error(f"No Data Type is passed in command line.") 204 | sys.exit(1) 205 | 206 | if args.source_data: 207 | source_data = args.source_data.lower() 208 | else: 209 | logger.error(f"No Source Data is passed in command line.") 210 | sys.exit(1) 211 | 212 | if args.log_level in log_level_choices: 213 | log_level = args.log_level 214 | else: 215 | log_level = 20 216 | if args.console_logging.lower() in flag_choices[:5]: 217 | console_logging = True 218 | else: 219 | console_logging = False 220 | 221 | return data_type, source_data, log_level, console_logging 222 | 223 | 224 | if __name__ == "__main__": 225 | 226 | data_type, source_data, log_level, console_logging = arg_parser() 227 | 228 | run_mode = source_data + "_" + data_type 229 | setup_logger(run_mode, log_level, console_logging) 230 | 231 | logger.info(f"{run_mode.upper()} Training Model process Started") 232 | output_dir = os.path.abspath( 233 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 234 | ) 235 | training_data_file, model_name = "", "" 236 | if source_data == "public": 237 | if data_type == "cred": 238 | public_cred_training_data_file = os.path.join( 239 | output_dir, "public_cred_train.csv" 240 | ) 241 | enterprise_cred_training_data_file = os.path.join( 242 | output_dir, "cred_train.csv" 243 | ) 244 | if os.path.exists(public_cred_training_data_file): 245 | logger.info("Using public cred training data to train the public model") 246 | 247 | training_data_file = "public_cred_train.csv" 248 | model_name = "public_xgg_cred_rf_" 249 | 250 | elif os.path.exists(enterprise_cred_training_data_file): 251 | logger.info( 252 | "Using enterprise cred training data to train the public model" 253 | ) 254 | training_data_file = "cred_train.csv" 255 | model_name = "public_xgg_cred_rf_" 256 | else: 257 | logger.error( 258 | f"Cred Training data file not found for cred ml model creation" 259 | ) 260 | elif data_type == "key": 261 | public_key_training_data_file = os.path.join( 262 | output_dir, "public_key_train.csv" 263 | ) 264 | enterprise_key_training_data_file = os.path.join( 265 | output_dir, "key_train.csv" 266 | ) 267 | if os.path.exists(public_key_training_data_file): 268 | logger.info("Using public key training data to train the public model") 269 | training_data_file = "public_key_train.csv" 270 | model_name = "public_xgg_key_rf_" 271 | elif os.path.exists(enterprise_key_training_data_file): 272 | logger.info( 273 | "Using enterprise key training data to train the public model" 274 | ) 275 | training_data_file = "key_train.csv" 276 | model_name = "public_xgg_key_rf_" 277 | 278 | else: 279 | logger.error( 280 | f"Key Training data file not found for key ml model creation" 281 | ) 282 | else: 283 | if data_type == "cred": 284 | logger.info( 285 | "Using enterprise cred training data to train the enterprise model" 286 | ) 287 | training_data_file = "cred_train.csv" 288 | model_name = "xgg_cred_rf_" 289 | 290 | elif data_type == "key": 291 | logger.info( 292 | "Using enterprise key training data to train the enterprise model" 293 | ) 294 | training_data_file = "key_train.csv" 295 | model_name = 
"xgg_key_rf_" 296 | 297 | if training_data_file and model_name: 298 | xgg_train_model( 299 | training_data_file=training_data_file, 300 | model_name=model_name, 301 | ) 302 | logger.info("Training, Testing and Persisting the xgg Model Completed") 303 | -------------------------------------------------------------------------------- /xgitguard/output/.output: -------------------------------------------------------------------------------- 1 | #output directory -------------------------------------------------------------------------------- /xgitguard/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/utilities/__init__.py -------------------------------------------------------------------------------- /xgitguard/utilities/common_utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | import re 19 | import os 20 | 21 | 22 | def check_github_token_env(token_env): 23 | """ 24 | For the Given GITHUB Type, check whether ENV and URL set properly or not 25 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" for Public 26 | ### Need GitHub Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" for Enterprise 27 | params: token_env - string 28 | 29 | returns: (1,0), token_var 30 | """ 31 | 32 | if token_env == "public": 33 | token_var = "GITHUB_TOKEN" 34 | else: 35 | token_var = "GITHUB_ENTERPRISE_TOKEN" 36 | 37 | if os.getenv(token_var): 38 | return 1, token_var 39 | 40 | return 0, token_var 41 | 42 | 43 | def is_num_present(word): 44 | """ 45 | Check if any number present in Given String 46 | params: word - string 47 | returns: 0 or 1 48 | """ 49 | check = any(letter.isdigit() for letter in word) 50 | return 1 if check else 0 51 | 52 | 53 | def is_uppercase_present(word): 54 | """ 55 | Check if any Upper Case Letter present in Given String 56 | params: word - string 57 | returns: 0 or 1 58 | """ 59 | check = any(letter.isupper() for letter in word) 60 | return 1 if check else 0 61 | 62 | 63 | def is_special_chars_present(word): 64 | """ 65 | Check if any special characters present in Given String 66 | params: word - string 67 | returns: 0 or 1 68 | """ 69 | regex = re.compile("[@_!#$%^&*()<>?/\|}{~:]") 70 | check = regex.search(word) 71 | return 1 if check else 0 72 | 73 | 74 | def mask_data(code, secret): 75 | """ 76 | Mask the letters except first 4 chars 77 | params: code - string - full key line 78 | params: secret - string - Secret 79 | returns: masked_code - string 80 | """ 81 | try: 82 | match_group = re.search("(?<=:|=).*$", code) 83 | if match_group: 84 | match = match_group.group(0).strip() 85 | masked_code = re.sub(r"(?<=:|=).*$", "", code) 86 | if match[len(match) - 1] == '"': 87 | 
74 | def mask_data(code, secret):
75 |     """
76 |     Mask the secret in the given code line, keeping only the first 4 characters visible
77 |     params: code - string - full key line
78 |     params: secret - string - Secret
79 |     returns: masked_code - string
80 |     """
81 |     try:
82 |         match_group = re.search("(?<=:|=).*$", code)
83 |         if match_group:
84 |             match = match_group.group(0).strip()
85 |             masked_code = re.sub(r"(?<=:|=).*$", "", code)
86 |             if match[-1] == '"':
87 |                 masked_code = masked_code + match[0:4] + "#" * 10 + '"'
88 |             else:
89 |                 masked_code = masked_code + match[0:4] + "#" * 10
90 |         else:
91 |             masked_code = re.sub(secret, "##########", code)
92 |     except Exception:
93 |         masked_code = re.sub(secret, "##########", code)
94 |     return masked_code
95 | 
--------------------------------------------------------------------------------
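To make the masking behavior concrete, here is an illustrative call (not part of the module): only the first characters after the ":" or "=" separator survive, and the rest become "#".

from utilities.common_utilities import mask_data

line = 'api_key = "abcd1234efgh5678"'
print(mask_data(line, "abcd1234efgh5678"))
# Prints something like: api_key ="abc##########"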
/xgitguard/utilities/file_utilities.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright 2021 Comcast Cable Communications Management, LLC
  3 | 
  4 | Licensed under the Apache License, Version 2.0 (the "License");
  5 | you may not use this file except in compliance with the License.
  6 | You may obtain a copy of the License at
  7 | 
  8 |     http://www.apache.org/licenses/LICENSE-2.0
  9 | 
 10 | Unless required by applicable law or agreed to in writing, software
 11 | distributed under the License is distributed on an "AS IS" BASIS,
 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | See the License for the specific language governing permissions and
 14 | limitations under the License.
 15 | 
 16 | SPDX-License-Identifier: Apache-2.0
 17 | """
 18 | 
 19 | import logging
 20 | import os
 21 | import pickle
 22 | 
 23 | import pandas as pd
 24 | import yaml
 25 | 
 26 | logger = logging.getLogger("xgg_logger")
 27 | 
 28 | 
 29 | def read_text_file(file_path):
 30 |     """
 31 |     Read text file utility.
 32 | 
 33 |     This function performs the following steps:
 34 |     - Read the text file from the given path.
 35 |     - If the file is not present, return an empty list.
 36 | 
 37 |     Args:
 38 |         file_path (str): The path to the text file.
 39 | 
 40 |     Returns:
 41 |         list: The content of the file as a list of lines.
 42 |     """
 43 |     logger.debug("<<<< 'Current Executing Function' >>>>")
 44 |     if os.path.exists(file_path):
 45 |         logger.info(f"Reading text data from file path: {file_path}")
 46 |         try:
 47 |             with open(file_path, "r") as infile:
 48 |                 file_data = infile.readlines()
 49 |                 return file_data or []
 50 |         except Exception as e:
 51 |             logger.error(f"File Read Error: {e}")
 52 |             return []
 53 |     else:
 54 |         logger.warning(f"File not present at: {file_path}")
 55 |         return []
 56 | 
 57 | 
 58 | def read_yaml_file(file_path):
 59 |     """
 60 |     Read YAML file utility.
 61 | 
 62 |     This function performs the following steps:
 63 |     - Read the YAML file from the given path.
 64 |     - If the file is not present, return empty data.
 65 | 
 66 |     Args:
 67 |         file_path (str): The path to the YAML file.
 68 | 
 69 |     Returns:
 70 |         dict: The content of the YAML file as a dictionary, or an empty dictionary if the file is not present.
 71 |     """
 72 |     logger.debug("<<<< 'Current Executing Function' >>>>")
 73 |     if os.path.exists(file_path):
 74 |         logger.info(f"Reading yaml data from file path: {file_path}")
 75 |         try:
 76 |             with open(file_path, "r") as infile:
 77 |                 file_data = yaml.safe_load(infile)
 78 |                 return file_data or {}
 79 |         except Exception as e:
 80 |             logger.error(f"File Read Error: {e}")
 81 |             return {}
 82 |     else:
 83 |         logger.warning(f"File not present at: {file_path}")
 84 |         return {}
 85 | 
 86 | 
 87 | def read_csv_file(file_path, output="list", header=0):
 88 |     """
 89 |     Read CSV file utility.
 90 | 
 91 |     This function performs the following steps:
 92 |     - Read the CSV file from the given path.
 93 |     - If the file is not present, return empty data.
 94 | 
 95 |     Args:
 96 |         file_path (str): The path to the CSV file.
 97 |         output (str): The format of the output, either "dataframe" or "list". Default is "list".
 98 |         header (int, optional): The row number to use as the column names. Default is 0.
 99 |     Returns:
100 |         file_data: The content of the CSV file as a DataFrame or a list, or an empty DataFrame or list if the file is not present.
101 |     """
102 |     logger.debug("<<<< 'Current Executing Function' >>>>")
103 |     if os.path.exists(file_path):
104 |         logger.info(f"Reading CSV data from file path: {file_path}")
105 |         try:
106 |             file_dataframe = pd.read_csv(file_path, header=header)
107 |             if output == "list":
108 |                 file_data = file_dataframe.values.tolist()
109 |                 # file_data = [item for sublist in file_data for item in sublist]
110 |                 return file_data
111 |             else:
112 |                 return file_dataframe
113 |         except Exception as e:
114 |             logger.error(f"Reading CSV file Error: {e}")
115 |             file_dataframe = pd.DataFrame()
116 |             return [] if output == "list" else file_dataframe
117 |     else:
118 |         logger.warning(f"File not present at: {file_path}")
119 |         file_dataframe = pd.DataFrame()
120 |         return [] if output == "list" else file_dataframe
121 | 
122 | 
123 | def write_to_csv_file(dataframe, csv_file_path, sep=",", write_mode="append"):
124 |     """
125 |     Write to CSV file utility.
126 | 
127 |     This function performs the following steps:
128 |     - Write the DataFrame to the given path if the file is not present.
129 |     - Raise an exception if the column order and counts do not match.
130 |     - Append to the existing file if the file is already present.
131 | 
132 |     Args:
133 |         dataframe (pd.DataFrame): The Pandas DataFrame to write.
134 |         csv_file_path (str): The path to the CSV file.
135 |         sep (str, optional): The separator to use. Default is ",".
136 |         write_mode (str, optional): Either "append" or "overwrite". Default is "append".
137 |     Returns:
138 |         bool: True if the operation was successful, False otherwise.
139 |     """
140 |     logger.debug("<<<< 'Current Executing Function' >>>>")
141 |     logger.info(f"Write Called on: {csv_file_path}")
142 |     if not os.path.isfile(csv_file_path):
143 |         dataframe.to_csv(csv_file_path, mode="a", index=False, sep=sep)
144 |         return True
145 |     try:
146 |         if write_mode == "overwrite":
147 |             dataframe.to_csv(csv_file_path, mode="w", index=False, sep=sep)
148 |             return True
149 |         elif len(dataframe.columns) != len(
150 |             pd.read_csv(csv_file_path, nrows=1, sep=sep).columns
151 |         ):
152 |             logger.error(
153 |                 f"Columns do not match!! \
154 |                 Dataframe has {len(dataframe.columns)} columns. \
155 |                 CSV file has {len(pd.read_csv(csv_file_path, nrows=1, sep=sep).columns)} columns."
156 |             )
157 |             raise Exception(
158 |                 f"Columns do not match!! \
159 |                 Dataframe has {len(dataframe.columns)} columns. \
160 |                 CSV file has {len(pd.read_csv(csv_file_path, nrows=1, sep=sep).columns)} columns."
161 |             )
162 |         elif not (
163 |             dataframe.columns == pd.read_csv(csv_file_path, nrows=1, sep=sep).columns
164 |         ).all():
165 |             logger.error(
166 |                 "Columns and column order of dataframe and csv file do not match!!"
167 |             )
168 |             raise Exception(
169 |                 "Columns and column order of dataframe and csv file do not match!!"
170 |             )
171 |         else:
172 |             dataframe.to_csv(
173 |                 csv_file_path, mode="a", index=False, sep=sep, header=False
174 |             )
175 |             logger.debug("CSV file Write Successful")
176 |             return True
177 |     except pd.errors.EmptyDataError as e:
178 |         logger.error(f"CSV file is Empty. So writing like a new file. Error: {e}")
179 |         dataframe.to_csv(csv_file_path, mode="a", index=False, sep=sep, header=False)
180 |         logger.debug("CSV file Write Successful")
181 |         return True
182 | 
183 | 
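A short usage sketch for the two CSV helpers above; the path and columns are illustrative, not part of the module:

import pandas as pd

from utilities.file_utilities import read_csv_file, write_to_csv_file

detections = pd.DataFrame(
    {"url": ["https://example.com/repo/file.py"], "secret": ["abcd##########"]}
)

# The first call creates the file; later calls with matching columns append,
# and write_mode="overwrite" replaces the file instead.
write_to_csv_file(detections, "example_detections.csv")

# Read back as a list of rows; pass output="dataframe" for a DataFrame.
rows = read_csv_file("example_detections.csv", output="list")
print(rows)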
184 | def write_pickle_file(object, object_file):
185 |     """
186 |     Write the given object as a pickle file.
187 | 
188 |     Args:
189 |         object (object): The object to write.
190 |         object_file (str): The path to the pickle file.
191 | 
192 |     Returns:
193 |         bool: True if the operation was successful.
194 |     """
195 |     logger.debug("<<<< 'Current Executing Function' >>>>")
196 |     logger.info(f"Writing object as pickle file: {object_file}")
197 |     try:
198 |         with open(object_file, "wb") as out_file:
199 |             pickle.dump(object, out_file)
200 |         logger.debug(f"Given object written to file as: {object_file}")
201 |     except Exception as e:
202 |         logger.error(f"Given object Write Failed. Error: {e}")
203 |         raise Exception(f"Given object Write Failed. Error: {e}")
204 |     return True
205 | 
206 | 
207 | def read_pickle_file(object_file=""):
208 |     """
209 |     Read the pickle object file and return the object.
210 | 
211 |     Args:
212 |         object_file (str): The path to the pickle file.
213 | 
214 |     Returns:
215 |         object: The deserialized object from the pickle file.
216 |     """
217 |     logger.debug("<<<< 'Current Executing Function' >>>>")
218 |     if object_file:
219 |         logger.info(f"Reading pickle file object: {object_file}")
220 |         try:
221 |             with open(object_file, "rb") as in_file:
222 |                 object = pickle.load(in_file)
223 |         except Exception as e:
224 |             logger.error(f"Error in reading Model object: {e}")
225 |             raise Exception(f"Error in reading Model object: {e}")
226 |     else:
227 |         logger.error("No pickle object file path provided.")
228 |         raise Exception("No pickle object file path provided.")
229 |     return object
230 | 
231 | 
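These two helpers are what model.py uses to persist and reload the trained classifier. A minimal round trip, with an illustrative path:

from utilities.file_utilities import read_pickle_file, write_pickle_file

model_state = {"n_estimators": 500, "max_depth": 3}

# Persist any picklable Python object, then load it back.
write_pickle_file(object=model_state, object_file="example_object.pickle")
restored = read_pickle_file(object_file="example_object.pickle")
assert restored == model_state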
246 | """ 247 | logger.debug("<<<< 'Current Executing Function' >>>>") 248 | if os.path.exists(file_path): 249 | logger.debug(f"Reading data from file path: {file_path}") 250 | try: 251 | with open(file_path, "r") as file: 252 | if output == "list": 253 | file_data = file.readlines() 254 | else: 255 | file_data = file.read() 256 | return file_data or ([] if output == "list" else "") 257 | except Exception as e: 258 | logger.error(f"File Read Error: {e} for file:{file_path}") 259 | return [] if output == "list" else "" 260 | else: 261 | logger.warning(f"File not present in : {file_path}") 262 | return [] if output == "list" else "" 263 | -------------------------------------------------------------------------------- /xgitguard/utilities/query_length_validator.py: -------------------------------------------------------------------------------- 1 | from common.logger import create_logger 2 | 3 | 4 | def query_length_validator( 5 | search_qualifier, query, limit=170, max_search_qualifier_per_query=10 6 | ): 7 | qualifier_query = "" 8 | qualifier_query_length = 0 9 | qualifiers_in_query = 0 10 | qualifier_string = "" 11 | for qualifier in search_qualifier: 12 | if qualifier_query_length + len(qualifier) + 1 <= limit and ( 13 | max_search_qualifier_per_query is None 14 | or qualifiers_in_query < max_search_qualifier_per_query 15 | ): 16 | qualifier_query += f""" {query}:{str(qualifier)}""" 17 | qualifier_query_length += len(qualifier) + 1 18 | qualifiers_in_query += 1 19 | else: 20 | return -1 21 | 22 | if qualifier_query: 23 | qualifier_string = qualifier_query.strip() 24 | 25 | return qualifier_string 26 | --------------------------------------------------------------------------------