├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       └── scorecard.yml
├── .gitignore
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Maintainers.md
├── NOTICE
├── README.md
├── assets
│   ├── cred_detection_workflow.png
│   ├── keys_Token_Detection_workflow.png
│   └── xgitguard_workflow.png
├── requirements.txt
├── roadmap.md
└── xgitguard
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── configs_read.py
    │   ├── data_format.py
    │   ├── github_calls.py
    │   ├── logger.py
    │   └── ml_process.py
    ├── config
    │   ├── confidence_values.csv
    │   ├── dictionary_words.csv
    │   ├── enterprise_keywords.csv
    │   ├── extensions.csv
    │   ├── primary_keywords.csv
    │   ├── public_keywords.csv
    │   ├── secondary_creds.csv
    │   ├── secondary_keys.csv
    │   ├── stop_words.csv
    │   ├── xgg_configs.yaml
    │   └── xgg_search_paths.csv
    ├── custom keyword search
    │   ├── __init__.py
    │   ├── enterprise_keyword_search.py
    │   └── public_keyword_search.py
    ├── file-scanner
    │   ├── extension_search.py
    │   └── secret_detection.py
    ├── github-enterprise
    │   ├── __init__.py
    │   ├── enterprise_cred_detections.py
    │   └── enterprise_key_detections.py
    ├── github-public
    │   ├── __init__.py
    │   ├── public_cred_detections.py
    │   └── public_key_detections.py
    ├── logs
    │   └── .log_desc
    ├── ml_training
    │   ├── __init__.py
    │   ├── ml_data-collector
    │   │   ├── __init__.py
    │   │   ├── github-enterprise-ml-data_collector
    │   │   │   ├── __init__.py
    │   │   │   ├── enterprise_cred_data_collector.py
    │   │   │   └── enterprise_key_data_collector.py
    │   │   └── github-public-ml-data_collector
    │   │       ├── __init__.py
    │   │       ├── public_cred_data_collector.py
    │   │       └── public_key_data_collector.py
    │   ├── ml_feature_engineering.py
    │   └── model.py
    ├── output
    │   └── .output
    └── utilities
        ├── __init__.py
        ├── common_utilities.py
        ├── file_utilities.py
        └── query_length_validator.py

/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
 - OS: [e.g. iOS]
 - Browser [e.g. chrome, safari]
 - Version [e.g. 22]

**Smartphone (please complete the following information):**
 - Device: [e.g. iPhone6]
 - OS: [e.g. iOS8.1]
 - Browser [e.g. stock browser, safari]
 - Version [e.g. 22]

**Additional context**
Add any other context about the problem here.

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | name: Scorecard analysis workflow 2 | on: 3 | push: 4 | # Only the default branch is supported. 5 | branches: 6 | - main 7 | schedule: 8 | # Weekly on Saturdays. 9 | - cron: '30 1 * * 6' 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | analysis: 15 | name: Scorecard analysis 16 | runs-on: ubuntu-latest 17 | permissions: 18 | # Needed for Code scanning upload 19 | security-events: write 20 | # Needed for GitHub OIDC token if publish_results is true 21 | id-token: write 22 | 23 | steps: 24 | - name: "Checkout code" 25 | uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 26 | with: 27 | fetch-depth: 0 28 | persist-credentials: false 29 | 30 | - name: "Run analysis" 31 | uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 32 | with: 33 | results_file: results.sarif 34 | results_format: sarif 35 | # Scorecard team runs a weekly scan of public GitHub repos, 36 | # see https://github.com/ossf/scorecard#public-data. 37 | # Setting `publish_results: true` helps us scale by leveraging your workflow to 38 | # extract the results instead of relying on our own infrastructure to run scans. 39 | # And it's free for you! 40 | publish_results: true 41 | 42 | # Upload the results as artifacts (optional). Commenting out will disable 43 | # uploads of run results in SARIF format to the repository Actions tab. 44 | # https://docs.github.com/en/actions/advanced-guides/storing-workflow-data-as-artifacts 45 | - name: "Upload artifact" 46 | uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 47 | with: 48 | name: SARIF file 49 | path: results.sarif 50 | retention-days: 5 51 | 52 | # Upload the results to GitHub's code scanning dashboard (optional). 53 | # Commenting out will disable upload of results to your repo's Code Scanning dashboard 54 | - name: "Upload to code-scanning" 55 | uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.16.4 56 | with: 57 | sarif_file: results.sarif 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # IDEs and editors 132 | /.idea 133 | .project 134 | .classpath 135 | .c9/ 136 | *.launch 137 | .settings/ 138 | *.sublime-workspace 139 | 140 | # IDE - VSCode 141 | .vscode/* 142 | !.vscode/settings.json 143 | !.vscode/tasks.json 144 | !.vscode/launch.json 145 | !.vscode/extensions.json 146 | .history/* 147 | 148 | # System Files 149 | .DS_Store 150 | Thumbs.db 151 | 152 | # Run time files 153 | xgitguard/logs/*.log 154 | xgitguard/output/*.csv 155 | xgitguard/output/*.pickle -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # xGitGuard Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, caste, color, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series 85 | of actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. 
Violating these terms may lead to a temporary or 92 | permanent ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within 112 | the community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, available at 118 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 119 | 120 | Community Impact Guidelines were inspired by 121 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 122 | 123 | For answers to common questions about this code of conduct, see the FAQ at 124 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 125 | at [https://www.contributor-covenant.org/translations][translations]. 126 | 127 | [homepage]: https://www.contributor-covenant.org 128 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 129 | [Mozilla CoC]: https://github.com/mozilla/diversity 130 | [FAQ]: https://www.contributor-covenant.org/faq 131 | [translations]: https://www.contributor-covenant.org/translations -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | If you would like to contribute code to this project you can do so through 4 | GitHub by forking the repository and sending a pull request. 5 | 6 | Before Comcast merges your code into the project you must sign the 7 | [Comcast Contributor License Agreement (CLA)](https://gist.github.com/ComcastOSS/a7b8933dd8e368535378cda25c92d19a). 8 | 9 | If you haven't previously signed a Comcast CLA, you'll automatically be asked 10 | to when you open a pull request. Alternatively, we can send you a PDF that 11 | you can sign and scan back to us. Please create a new GitHub issue to request 12 | a PDF version of the CLA. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2019 Comcast Cable Communications Management, LLC 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /Maintainers.md: -------------------------------------------------------------------------------- 1 | # 🛡️ xGitGuard Maintainers 2 | 3 | Thank you for your interest in contributing to xGitGuard! Below are the dedicated individuals who contribute to the maintenance and development of xGitGuard. 4 | 5 | ## Comcast Maintainers 6 | 7 | Comcast maintainers are responsible for overseeing the development and maintenance of xGitGuard within the Comcast organization. 8 | 9 | | Name | GitHub Handle Card | 10 | |--------------------|----------------------| 11 | | Dinesh Prakash | [![Dinesh Prakash](https://img.shields.io/badge/GitHub-dinpraka-blue?logo=github)](https://github.com/dinpraka) | 12 | | David Jayaseelan | [![David Jayaseelan](https://img.shields.io/badge/GitHub-davidjayaseelan-blue?logo=github)](https://github.com/jay6david) | 13 | | Adhithya Rajasekaran| [![Adhithya Rajasekaran](https://img.shields.io/badge/GitHub-radhi1991-blue?logo=github)](https://github.com/radhi1991) | 14 | | Gowtham Raj J | [![Gowtham Raj J](https://img.shields.io/badge/GitHub-jgowthamr-blue?logo=github)](https://github.com/jgowthamr) | 15 | | Sai Sundar | [![Sai Sundar](https://img.shields.io/badge/GitHub-sai100-blue?logo=github)](https://github.com/sai100) | 16 | | Nisha Balamurugan | [![nishabalamurugan](https://img.shields.io/badge/GitHub-nishabalamurugan-blue?logo=github)](https://github.com/nishabalamurugan) | 17 | 18 | 19 | ## External Maintainers 20 | 21 | External maintainers come from various organizations and institutions, contributing their expertise to xGitGuard's development. 22 | 23 | | Name | GitHub Handle Card | Affiliation | 24 | |--------------------|----------------------|-----------------------------------| 25 | | Himaja Nimmagadda | [![Himaja Nimmagadda](https://img.shields.io/badge/GitHub-hcn892-blue?logo=github)](https://github.com/hcn892) | [George Washington University](https://www.gwu.edu) | 26 | | Dinesh Paneerselvam | [![Dinesh Paneerselvam](https://img.shields.io/badge/GitHub-DineshPanneerselvam-blue?logo=github)](https://github.com/DineshPanneerselvam) | [Infosys](https://www.infosys.com) | 27 | | Preethi Manimaran | [![Preethi Manimaran](https://img.shields.io/badge/GitHub-preethid03-blue?logo=github)](https://github.com/preethid03) | [Gigamon](https://www.gigamon.com)| 28 | 29 | 30 | ## Contact 31 | 32 | If you have any questions, concerns, or suggestions regarding xGitGuard, feel free to reach out to any of the maintainers listed above. We welcome your feedback and contributions! 33 | 34 | ## Acknowledgments 35 | 36 | We extend our gratitude to all contributors, users, and organizations that have supported the xGitGuard project in various ways. 
37 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | xGitGuard 2 | 3 | Copyright 2021 Comcast Cable Communications Management, LLC 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | 17 | SPDX-License-Identifier: Apache-2.0 18 | 19 | This product includes software developed at Comcast (http://www.comcast.com/). -------------------------------------------------------------------------------- /assets/cred_detection_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/cred_detection_workflow.png -------------------------------------------------------------------------------- /assets/keys_Token_Detection_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/keys_Token_Detection_workflow.png -------------------------------------------------------------------------------- /assets/xgitguard_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/assets/xgitguard_workflow.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | pandas==1.3.5 3 | requests==2.32.0 4 | scipy==1.10.0 5 | scikit-learn==1.5.2 6 | urlextract==1.5.0 7 | PyYAML==6.0 -------------------------------------------------------------------------------- /roadmap.md: -------------------------------------------------------------------------------- 1 | 2 | # xGitGuard Roadmap 3 | 4 | ## How to Use This Roadmap 5 | This document serves as a comprehensive guide to the prioritized objectives of the xGitGuard project. It offers insight into the direction of the project, aiding contributors in understanding its trajectory. It also helps contributors determine whether their contributions align with the project's long-term goals. 6 | 7 | While a feature may not be listed here, it doesn't imply automatic refusal of a patch (except for "frozen features" mentioned below). We welcome patches for new features and encourage innovation. However, please be aware that such patches may take longer to review. 

---

## Feature Classification

### Adhoc Scan
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| [🎯 Targeted repository scanning](https://github.com/Comcast/xGitGuard/issues/24) | Scan user-specified repositories for secrets | ✅ Done | [preethid03](https://github.com/preethid03) |
| [🎯 Targeted organization scanning](https://github.com/Comcast/xGitGuard/issues/24) | Scan user-specified organizations for secrets | ✅ Done | [preethid03](https://github.com/preethid03) |

---

### File Scanner

| Feature | Description | Status | Developer (GitHub ID) |
|---------------------------|---------------------------------------------------------|--------|-----------------------|
| 📁 Directory scanning | Enable scanning user-specified directories for secrets | ⏳ WIP | [](https://github.com/developer6) |
| 📁 Individual file scanning | Enable scanning user-specified individual files for secrets | ⏳ WIP | [](https://github.com/developer7) |

---

### ML Integration ---> [GitHub Issues](https://github.com/Comcast/xGitGuard/issues/32)
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| 🤖 Training ML models using BERT | Train models for secret detection using BERT | 🚧 To Do | [](https://github.com/developer8) |
| 🤖 Integrating BERT into scanners | Integrate the BERT model into the xGitGuard scanner | 🚧 To Do | [](https://github.com/developer9) |

---

### Pre-commit Hook
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| 🔒 Detecting secrets pre-commit | Detect secrets before committing changes (see the sketch below the legend) | 🚧 To Do | [](https://github.com/) |

---

### Others
| Feature | Description | Status | Developer (GitHub ID) |
|----------------------------------|--------------------------------------------------|-----------|------------------------------|
| Custom keyword search | Search for specific keywords within repositories | 🚧 To Do | [](https://github.com/developer8) |
| Filtering archived repositories | Exclude archived repositories from scanning | 🚧 To Do | [](https://github.com/developer8) |
| Filtering forked repositories | Exclude forked repositories from scanning | 🚧 To Do | [](https://github.com/developer8) |

---

**Legend:**
- ✅ Done: Completed feature.
- ⏳ WIP: Feature in progress.
- 🚧 To Do: Feature yet to be started.
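The pre-commit hook row above is still open; as a rough illustration of the intended shape (hypothetical — xGitGuard does not ship a hook yet, and the scanner path and exit convention below are assumptions), a local `.git/hooks/pre-commit` script could hand the staged files to the file scanner:

```python
#!/usr/bin/env python3
# Hypothetical pre-commit sketch; the secret_detection.py CLI is an assumption,
# and paths containing whitespace are not handled.
import subprocess
import sys

# Files staged for this commit (added/copied/modified only).
staged = subprocess.run(
    ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
    capture_output=True, text=True, check=True,
).stdout.split()

if staged:
    # Assumed invocation: pass staged paths to the repo's file scanner and
    # treat a nonzero exit status as "secrets found".
    result = subprocess.run(
        [sys.executable, "xgitguard/file-scanner/secret_detection.py", *staged]
    )
    if result.returncode != 0:
        print("xGitGuard: possible secrets detected; commit aborted.")
        sys.exit(1)
```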
61 | 62 | --- 63 | 64 | ## Additional Issues and Contributions 65 | 66 | Contributors are welcome to explore and contribute to other issues on the xGitGuard repository: [xGitGuard GitHub Issues](https://github.com/Comcast/xGitGuard/issues) 67 | -------------------------------------------------------------------------------- /xgitguard/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = """Bahman Rashidi (Senior Security Architect) - Comcast Cable, 2 | Saravanakumar Ramasamy (Senior Lead Engineer) - Comcast Cable, 3 | Dinesh Prakash (Senior Lead Engineer) - Comcast Cable 4 | """ 5 | __version__ = "2.0" 6 | -------------------------------------------------------------------------------- /xgitguard/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/common/__init__.py -------------------------------------------------------------------------------- /xgitguard/common/configs_read.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import logging 20 | import os 21 | import sys 22 | 23 | import numpy as np 24 | from sklearn.feature_extraction.text import CountVectorizer 25 | 26 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 27 | parent_dir = os.path.dirname(MODULE_DIR) 28 | sys.path.append(parent_dir) 29 | 30 | from utilities.file_utilities import read_yaml_file, read_csv_file 31 | 32 | 33 | logger = logging.getLogger("xgg_logger") 34 | 35 | 36 | class ConfigsData: 37 | """ 38 | Initialize and Read all the configuration files needed for the xGitGuard process 39 | """ 40 | 41 | def __init__(self): 42 | logger.debug("Initializing Configuration Data") 43 | self.config_dir = os.path.abspath( 44 | os.path.join(os.path.dirname(MODULE_DIR), ".", "config") 45 | ) 46 | self.output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | self.read_xgg_configs(file_name="xgg_configs.yaml") 50 | 51 | def read_xgg_configs(self, file_name): 52 | """ 53 | Read the given xgg_configs YAML file in the config path and set the class variable for further use. 54 | 55 | Args: 56 | file_name (str): The name of the configuration file. 
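        Example (illustrative; assumes the default repository layout where
        config/xgg_configs.yaml exists alongside this package):

            configs = ConfigsData()          # __init__ loads xgg_configs.yaml
            settings = configs.xgg_configs   # parsed YAML, available as a dict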
57 | """ 58 | logger.debug("<<<< 'Current Executing Function' >>>>") 59 | # Loading xgg_configs from xgg_configs_file 60 | self.xgg_configs_file = os.path.join(self.config_dir, file_name) 61 | if os.path.exists(self.xgg_configs_file): 62 | self.xgg_configs = read_yaml_file(self.xgg_configs_file) 63 | logger.debug(f"xgg_configs: {self.xgg_configs}") 64 | else: 65 | logger.error( 66 | f"Exiting as xGitGuard Configuration file not found: {self.xgg_configs_file}" 67 | ) 68 | raise Exception( 69 | f"Exiting as xGitGuard Configuration file not found: {self.xgg_configs_file}" 70 | ) 71 | 72 | def read_primary_keywords(self, file_name): 73 | """ 74 | Read the given primary keywords CSV file in the config path and set the class variable for further use. 75 | 76 | Args: 77 | file_name (str): The name of the CSV file. 78 | """ 79 | logger.debug("<<<< 'Current Executing Function' >>>>") 80 | 81 | # Loading primary keywords from primary keywords file 82 | self.primary_keywords_file = os.path.join(self.config_dir, file_name) 83 | self.primary_keywords = read_csv_file( 84 | self.primary_keywords_file, output="list", header=0 85 | ) 86 | self.primary_keywords = [ 87 | item for sublist in self.primary_keywords for item in sublist 88 | ] 89 | # logger.debug(f"primary_keywords: {self.primary_keywords}") 90 | 91 | def read_secondary_keywords(self, file_name): 92 | """ 93 | Read the given secondary keywords CSV file in the config directory and set the class variable for further use. 94 | 95 | Args: 96 | file_name (str): The name of the CSV file. 97 | """ 98 | logger.debug("<<<< 'Current Executing Function' >>>>") 99 | 100 | # Loading secondary keywords from secondary keywords file 101 | self.secondary_keywords_file = os.path.join(self.config_dir, file_name) 102 | self.secondary_keywords = read_csv_file( 103 | self.secondary_keywords_file, output="list", header=0 104 | ) 105 | self.secondary_keywords = [ 106 | item for sublist in self.secondary_keywords for item in sublist 107 | ] 108 | # logger.debug(f"secondary_keywords: {self.secondary_keywords}") 109 | 110 | def read_secondary_credentials(self, file_name): 111 | """ 112 | Read the given secondary credentials CSV file in the config directory and set the class variable for further use. 113 | 114 | Args: 115 | file_name (str): The name of the CSV file. 116 | """ 117 | logger.debug("<<<< 'Current Executing Function' >>>>") 118 | 119 | # Loading secondary Credentials from secondary credentials file 120 | self.secondary_credentials_file = os.path.join(self.config_dir, file_name) 121 | self.secondary_credentials = read_csv_file( 122 | self.secondary_credentials_file, output="list", header=0 123 | ) 124 | self.secondary_credentials = [ 125 | item for sublist in self.secondary_credentials for item in sublist 126 | ] 127 | # logger.debug(f"secondary_credentials: {self.secondary_credentials}") 128 | 129 | def read_extensions(self, file_name="extensions.csv"): 130 | """ 131 | Read the given extensions CSV file in the config path and set the class variable for further use. 132 | 133 | Args: 134 | file_name (str): The name of the CSV file. 
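        Example (illustrative): read_csv_file returns one list per CSV row, so a
        file with the rows "py" and "yaml" comes back as [["py"], ["yaml"]] and
        the comprehension below flattens it to ["py", "yaml"]. The other keyword
        and credential readers in this class rely on the same row-flattening
        pattern.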
135 | """ 136 | logger.debug("<<<< 'Current Executing Function' >>>>") 137 | 138 | # Get the extensions from extensions file 139 | self.extensions_file = os.path.join(self.config_dir, file_name) 140 | self.extensions = read_csv_file(self.extensions_file, output="list", header=0) 141 | self.extensions = [item for sublist in self.extensions for item in sublist] 142 | 143 | # logger.debug(f"Extensions: {self.extensions}") 144 | 145 | def read_hashed_url(self, file_name): 146 | """ 147 | Read the given hashed URL CSV file in the output path and set the class variable for further use. 148 | 149 | Args: 150 | file_name (str): The name of the CSV file. 151 | """ 152 | logger.debug("<<<< 'Current Executing Function' >>>>") 153 | 154 | # Loading Existing url hash detections 155 | self.hashed_url_file = os.path.join(self.output_dir, file_name) 156 | hashed_key_urls = read_csv_file(self.hashed_url_file, output="list", header=0) 157 | self.hashed_urls = [row[0] for row in hashed_key_urls] 158 | 159 | # logger.debug(f"hashed_urls: {self.hashed_urls}") 160 | 161 | def read_training_data(self, file_name): 162 | """ 163 | Read the given training data CSV file in the output path and set the class variable for further use. 164 | 165 | Args: 166 | file_name (str): The name of the CSV file. 167 | """ 168 | logger.debug("<<<< 'Current Executing Function' >>>>") 169 | self.training_data_file = os.path.join(self.output_dir, file_name) 170 | self.training_data = read_csv_file( 171 | self.training_data_file, output="dataframe", header=0 172 | ) 173 | if not self.training_data.empty: 174 | self.training_data = self.training_data.drop(columns="Label", axis=1) 175 | else: 176 | logger.error( 177 | f"Training Data is Empty. Add proper data and rerun: {self.training_data_file}" 178 | ) 179 | raise Exception( 180 | f"Training Data is Empty. Add proper data and rerun: {self.training_data_file}" 181 | ) 182 | 183 | def read_confidence_values(self, file_name="confidence_values.csv"): 184 | """ 185 | Read the given confidence values CSV file in the config path and set the key as index. 186 | 187 | This function sets the class variable for further use. 188 | 189 | Args: 190 | file_name (str): The name of the CSV file. 191 | """ 192 | logger.debug("<<<< 'Current Executing Function' >>>>") 193 | # Loading confidence levels from file 194 | self.confidence_values_file = os.path.join(self.config_dir, file_name) 195 | 196 | self.confidence_values = read_csv_file( 197 | self.confidence_values_file, output="dataframe", header=0 198 | ) 199 | if not self.confidence_values.empty: 200 | try: 201 | self.confidence_values = self.confidence_values.set_index("key") 202 | except Exception as e: 203 | logger.error(f"Confidence Values Setting Index Error: {e}") 204 | raise Exception(f"Confidence Values Setting Index Error: {e}") 205 | else: 206 | logger.error( 207 | f"confidence_values file is not present/readable: {self.confidence_values_file}" 208 | ) 209 | raise Exception( 210 | f"confidence_values file is not present/readable: {self.confidence_values_file}" 211 | ) 212 | 213 | def read_dictionary_words(self, file_name="dictionary_words.csv"): 214 | """ 215 | Read the given dictionary words CSV file in the config path. 216 | 217 | This function creates dictionary similarity values and sets the class variables for further use. 218 | 219 | Args: 220 | file_name (str): The name of the CSV file. 
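        Example (illustrative): with analyzer="char" and ngram_range=(3, 5), a
        dictionary word such as "password" contributes character n-grams like
        "pas", "assw", and "sword". dict_words_ct then stores log10 of each
        n-gram's total count, intended as a dictionary-similarity signal for
        the downstream ML features.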
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Creating dictionary similarity values
        self.dictionary_words_file = os.path.join(self.config_dir, file_name)
        self.dictionary_words = read_csv_file(
            self.dictionary_words_file, output="dataframe", header=0
        )
        # logger.debug("Dictionary_words file Read")
        # run Count Vectorizer
        if not self.dictionary_words.empty:
            try:
                self.dict_words_vc = CountVectorizer(
                    analyzer="char", ngram_range=(3, 5), min_df=1e-5, max_df=1.0
                )
                count = self.dict_words_vc.fit_transform(
                    self.dictionary_words["dic_word"].apply(
                        lambda word: np.str_(word)
                    )
                )
                self.dict_words_ct = np.log10(count.sum(axis=0).getA1())
                # logger.debug("Dictionary_words data Count Vectorized")
            except Exception as e:
                logger.error(f"Count Vectorizer Error: {e}")
                raise Exception(f"Count Vectorizer Error: {e}")
        else:
            logger.error(
                f"dictionary_words file is not present/readable: {self.dictionary_words_file}"
            )
            raise Exception(
                f"dictionary_words file is not present/readable: {self.dictionary_words_file}"
            )

    def read_stop_words(self, file_name="stop_words.csv"):
        """
        Read the given stop words CSV file in the config path and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Get the programming language stop words
        self.stop_words_file = os.path.join(self.config_dir, file_name)
        self.stop_words = read_csv_file(self.stop_words_file, output="list", header=0)
        self.stop_words = [item for sublist in self.stop_words for item in sublist]
        # logger.debug(f"Total Stop Words: {len(self.stop_words)}")

    def read_search_paths(self, file_name):
        """
        Read the given search paths CSV file in the config directory and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")

        # Loading the search paths file to retrieve the paths that need the extension filter applied
        self.search_paths_file = os.path.join(self.config_dir, file_name)
        self.search_paths = read_csv_file(
            self.search_paths_file, output="list", header=0
        )
        self.search_paths = [item for sublist in self.search_paths for item in sublist]
        # logger.debug(f"search_paths: {self.search_paths}")

    def read_search_files(self, file_name):
        """
        Read the given search files CSV file in the output directory and set the class variable for further use.

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")

        # Reading the paths of files to be searched after applying the extension filter
        self.target_paths_file = os.path.join(self.output_dir, file_name)
        self.search_files = read_csv_file(
            self.target_paths_file, output="list", header=0
        )
        self.search_files = [item for sublist in self.search_files for item in sublist]
        # logger.debug(f"search_files: {self.search_files}")

    def read_hashed_file(self, file_name):
        """
        Read the given hashed file CSV file in the output path and set the class variable for further use.
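        The CSV is assumed to carry the columns "hashed_files",
        "file_modification_hash", and "files" (matching the keys read below);
        when any of them is absent, the lists fall back to empty.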

        Args:
            file_name (str): The name of the CSV file.
        """
        logger.debug("<<<< 'Current Executing Function' >>>>")
        # Loading existing file hash detections
        self.hashed_file = os.path.join(self.output_dir, file_name)
        hashed_key_files = read_csv_file(self.hashed_file, output="", header=0)
        try:
            self.hashed_files = (
                hashed_key_files.get("hashed_files").drop_duplicates().tolist()
            )
            self.hashed_file_modified_time = (
                hashed_key_files.get("file_modification_hash")
                .drop_duplicates()
                .tolist()
            )
            self.hash_file_path = (
                hashed_key_files.get("files").drop_duplicates().tolist()
            )
        except (AttributeError, KeyError):
            # Fall back to empty lists when the file or expected columns are missing.
            self.hashed_files = []
            self.hashed_file_modified_time = []
            self.hash_file_path = []
        # logger.debug(f"hashed_files: {self.hashed_files}")


if __name__ == "__main__":

    from datetime import datetime
    from common.logger import create_logger

    log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs"))
    log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    # Creates a logger
    logger = create_logger(
        log_level=10, console_logging=True, log_dir=log_dir, log_file_name=log_file_name
    )
    configs = ConfigsData()

--------------------------------------------------------------------------------
/xgitguard/common/data_format.py:
--------------------------------------------------------------------------------
"""
Copyright 2021 Comcast Cable Communications Management, LLC

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

SPDX-License-Identifier: Apache-2.0
"""

import functools
import json
import re

from urlextract import URLExtract


def remove_url_from_keys(code_content):
    """
    Remove URLs, emails, and special characters from the given code content
    params: code_content - string - code data with urls
    returns: code_data - string - code data without urls/special chars
    """
    # Remove url address if present
    code_data = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        code_content,
    )
    # Remove email address characters if present
    code_data = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", code_data)

    special_chars = [
        "'",
        "(",
        ")",
        ",",
        ".",
        "/",
        "0x",
        ";",
        "<",
        "=",
        ">",
        "@",
        "[",
        "\\",
        "]",
        "_",
        "{",
        "}",
        '"',
    ]
    # Remove special characters if present
    for special_char in special_chars:
        code_data = code_data.replace(special_char, " ")
    return code_data


def remove_url_from_creds(code_content, key):
    """
    Remove URLs, emails, and special characters from the given code content
    params: code_content - string - code data with urls
    params: key - string - detected keyword (unused in the current logic)
    returns: codes_list - list - code tokens without urls/special chars
    """
    extractor = URLExtract()
    blacklisted_urls = extractor.find_urls(code_content)

    code_data = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        code_content,
    )
    code_data = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", " ", code_data)

    for url in blacklisted_urls:
        code_data = code_data.replace(url, " ")

    special_chars = [
        "'",
        '"',
        "#",
        "%",
        "&",
        "(",
        ")",
        "*",
        "+",
        ",",
        "-",
        ".",
        "/",
        ":",
        ";",
        "<",
        "=",
        ">",
        "?",
        "[",
        "\\",
        "]",
        "`",
        "{",
        "|",
        "}",
        "~",
    ]
    # Remove special characters if present
    for special_char in special_chars:
        code_data = code_data.replace(special_char, " ")
    codes_list = code_data.split()
    return codes_list


def keys_extractor(code_content):
    """
    Extract keys from the given code content
    params: code_content - string
    returns: keys - List - List of secret keys
    """

    regexes = {
        "AWS Access Key ID": "(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}",
        "AWS Secret Access Key": "[0-9a-zA-Z/+=]{40}",
        "Google OAuth Secret": r"[0-9a-zA-Z\-_]{24}",
        "Google OAuth Auth Code": r"4/[0-9A-Za-z\-_]+",
        "Google OAuth Refresh Token": r"1/[0-9A-Za-z\-_]{43}|1/[0-9A-Za-z\-_]{64}",
        "Google OAuth Access Token": r"ya29\.[0-9A-Za-z\-_]+",
        "Google API Key": r"AIza[0-9A-Za-z\-_]{35}",
        "RSA Private Key": "BEGIN RSA PRIVATE KEY",
        "EC Private Key": "BEGIN EC PRIVATE KEY",
        "PGP Private Key": "BEGIN PGP PRIVATE KEY BLOCK",
        "General Private Key": "BEGIN PRIVATE KEY",
        "Google YouTube OAuth ID Gmail, GCloud": r"[0-9]+-[0-9A-Za-z_]{32}\.apps\.googleusercontent\.com",
        "Amazon MWS": r"amzn\.mws\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        "PayPal": r"access_token\$production\$[0-9a-z]{16}\$[0-9a-f]{32}",
        "Slack Token": "(xox[pbaor]-[0-9]{12}-[0-9]{12}-[0-9]{12}-[a-z0-9]{32})",
        "AWS": 
"(?:.*awsSecretKey|.*aws_secret|.*api-key|.*aws_account_secret).*" 146 | "(?=.*[A-Z])(?= 7 174 | and not (word in stop_words) 175 | and not (word.lower().startswith("u0")) 176 | and not (word.lower().startswith("0x")) 177 | and not (word.lower().startswith("rfc")) 178 | and not ("http" in word.lower()) 179 | and (bool(re.match("^(?=.*[0-9])(?=.*[a-zA-Z])", word))) 180 | ): 181 | creds.append(word) 182 | 183 | """creds = [word for word in code_content if len(word) >= 7] 184 | creds = [word for word in creds if not word in stop_words] 185 | creds = [word for word in creds if not word.lower().startswith('u0')] 186 | creds = [word for word in creds if not word.lower().startswith('0x')] 187 | creds = [word for word in creds if not word.lower().startswith('rfc')] 188 | creds = [word for word in creds if "http" not in word.lower()] 189 | creds = [word for word in creds if bool(re.match('^(?=.*[0-9])(?=.*[a-zA-Z])', word))]""" 190 | 191 | creds = list(set(creds)) 192 | creds = list(filter(None, creds)) 193 | return creds 194 | 195 | 196 | def format_commit_details(api_response_commit_data): 197 | """ 198 | Format the commit details from the api response 199 | params: api_response_commit_data - dict 200 | returns: commit_details - json dictionary 201 | """ 202 | try: 203 | response = api_response_commit_data 204 | if response.status_code == 200: 205 | commit_details = {} 206 | commit_data = [] 207 | commits_response = response.json() 208 | commit_details["status"] = response.status_code 209 | 210 | for commit in commits_response: 211 | commit_detail = {} 212 | 213 | try: 214 | commit_detail["commit_id"] = commit["sha"] 215 | except (IndexError, KeyError): 216 | commit_detail["commit_id"] = "" 217 | 218 | try: 219 | commit_detail["email"] = commit["commit"]["author"]["email"] 220 | except (IndexError, KeyError): 221 | commit_detail["email"] = "" 222 | 223 | try: 224 | commit_detail["commiter_name"] = commit["commit"]["author"]["name"] 225 | except (IndexError, KeyError): 226 | commit_detail["commiter_name"] = "" 227 | 228 | try: 229 | commit_detail["commit_date"] = commit["commit"]["author"]["date"] 230 | except (IndexError, KeyError): 231 | commit_detail["commit_date"] = "" 232 | 233 | try: 234 | if commit["author"] is not None: 235 | commit_detail["user_id"] = commit["author"]["login"] 236 | else: 237 | commit_detail["user_id"] = "" 238 | except (IndexError, KeyError): 239 | commit_detail["user_id"] = "" 240 | 241 | commit_data.append(commit_detail) 242 | commit_details["commits"] = commit_data 243 | else: 244 | commit_details = {} 245 | 246 | except (IndexError, KeyError): 247 | commit_details = {} 248 | 249 | commit_details = json.dumps(commit_details) 250 | return commit_details 251 | -------------------------------------------------------------------------------- /xgitguard/common/github_calls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import logging 20 | import os 21 | import sys 22 | import time 23 | 24 | import requests 25 | from utilities.query_length_validator import query_length_validator 26 | 27 | logger = logging.getLogger("xgg_logger") 28 | 29 | 30 | class GithubCalls: 31 | 32 | def __init__(self, base_url, token_env, commits_api_url, throttle_time=2): 33 | assert ( 34 | token_env == "public" or token_env == "enterprise" 35 | ), f"token_env must be either 'public' or 'enterprise'. current: {token_env}" 36 | self._base_url = base_url 37 | self._token_env = token_env 38 | self._commits_api_url = commits_api_url 39 | self._throttle_time = throttle_time 40 | 41 | def run_github_search(self, search_query, extension, org=[], repo=[]): 42 | """ 43 | Run the GitHub API search with given search query 44 | Get the items from the response content and Return 45 | params: search_query - string - Search keyword 46 | params: extension - string - Search extension 47 | params: org - list 48 | params: repo - list 49 | returns: search_response - list 50 | """ 51 | logger.debug("<<<< 'Current Executing Function' >>>>") 52 | 53 | org_qualifiers = [] 54 | repo_qualifiers = [] 55 | 56 | if len(org) > 0: 57 | # Checks if the length of additional qualifiers has exceeded the character limit of 170. 58 | org_qualifiers = query_length_validator(org, "user") 59 | if org_qualifiers == -1: 60 | logger.error( 61 | "Character Limit reached. Please consider limiting the number of characters in orgs." 62 | ) 63 | sys.exit(1) 64 | 65 | elif len(repo) > 0: 66 | # Checks if the length of additional qualifiers has exceeded the character limit of 170. 67 | repo_qualifiers = query_length_validator(repo, "repo") 68 | if repo_qualifiers == -1: 69 | logger.error( 70 | "Character Limit reached. Please consider limiting the number of characters in repo." 71 | ) 72 | sys.exit(1) 73 | 74 | if not extension or extension == "others" or len(extension) == 0: 75 | response = self.__github_api_get_params( 76 | search_query, org_qualifiers, repo_qualifiers 77 | ) 78 | elif self._token_env == "public": 79 | 80 | response = self.__github_api_get_params( 81 | (search_query + " extension:" + extension), 82 | org_qualifiers, 83 | repo_qualifiers, 84 | ) 85 | else: 86 | response = self.__github_api_get_params( 87 | (search_query + " extension:" + extension), 88 | org_qualifiers, 89 | repo_qualifiers, 90 | ) 91 | 92 | if response: 93 | return response 94 | 95 | return [] 96 | 97 | def __github_api_get_params( 98 | self, search_query, org_qualifiers="", repo_qualifiers="" 99 | ): 100 | """ 101 | For the given GITHUB API url and search query, call the api 102 | Get and return the response 103 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 104 | 105 | params: search_query - string 106 | params: org_qualifiers - string 107 | params: repo_qualifiers - string 108 | returns: response - dict 109 | """ 110 | logger.debug("<<<< 'Current Executing Function' >>>>") 111 | if self._token_env == "public": 112 | token_var = "GITHUB_TOKEN" 113 | time.sleep(self._throttle_time) 114 | else: 115 | time.sleep(self._throttle_time) 116 | token_var = "GITHUB_ENTERPRISE_TOKEN" 117 | if "<< Enterprise Name >>" in self._base_url: 118 | logger.error( 119 | f"GitHub API URL not set for Enterprise in xgg_configs.yaml file in config folder. API Search will fail/return no results. 
Please Setup and retry" 120 | ) 121 | sys.exit(1) 122 | 123 | if not os.getenv(token_var): 124 | logger.error( 125 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 126 | ) 127 | sys.exit(1) 128 | 129 | additional_qualifiers = "" 130 | if len(org_qualifiers) > 0: 131 | additional_qualifiers = org_qualifiers 132 | elif len(repo_qualifiers) > 0: 133 | additional_qualifiers = repo_qualifiers 134 | 135 | search_response = [] 136 | if additional_qualifiers: 137 | try: 138 | response = requests.get( 139 | self._base_url, 140 | params={ 141 | "q": f"{search_query} {additional_qualifiers}", 142 | "order": "desc", 143 | "sort": "indexed", 144 | "per_page": 100, 145 | }, 146 | auth=("token", os.getenv(token_var)), 147 | ) 148 | except Exception as e: 149 | logger.error(f"Github API call Error: {e}") 150 | else: 151 | try: 152 | response = requests.get( 153 | self._base_url, 154 | params={ 155 | "q": f"{search_query}", 156 | "order": "desc", 157 | "sort": "indexed", 158 | "per_page": 100, 159 | }, 160 | auth=("token", os.getenv(token_var)), 161 | ) 162 | except Exception as e: 163 | logger.error(f"Github API call Error: {e}") 164 | 165 | if response.status_code == 200: 166 | content = response.json() 167 | search_response.extend(content["items"]) 168 | try: 169 | while "next" in response.links.keys(): 170 | time.sleep(6) 171 | response = requests.get( 172 | response.links["next"]["url"], 173 | auth=("token", os.getenv(token_var)), 174 | ) 175 | 176 | if response.status_code == 200: 177 | content = response.json() 178 | if len(content["items"]) < 1: 179 | break 180 | search_response.extend(content["items"]) 181 | 182 | else: 183 | logger.info( 184 | f"Encountered an error in processing request.Response Status Code:{response.status_code}" 185 | ) 186 | break 187 | except Exception as e: 188 | logger.error( 189 | f"Error occured while iterating through file contents: {e}" 190 | ) 191 | else: 192 | logger.info( 193 | f"Encountered an error in processing request.Response Status Code:{response.status_code}" 194 | ) 195 | return search_response 196 | 197 | def public_url_content_get(self, file_url): 198 | """ 199 | For the given GitHub url, call the api 200 | Get and return the response 201 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 202 | 203 | params: api_url - string 204 | returns: response - string 205 | """ 206 | logger.debug("<<<< 'Current Executing Function' >>>>") 207 | 208 | token_key = "GITHUB_TOKEN" 209 | if not os.getenv(token_key): 210 | logger.error( 211 | f"GitHub API Token Environment variable '{token_key}' not set. API Search will fail/return no results. 
Please Setup and retry" 212 | ) 213 | sys.exit(1) 214 | 215 | try: 216 | time.sleep(self._throttle_time) 217 | response = requests.get( 218 | file_url, auth=("token", os.getenv(token_key)), timeout=10 219 | ) 220 | return response 221 | except Exception as e: 222 | logger.error(f"Github API file content get Error: {e}") 223 | 224 | return {} 225 | 226 | def enterprise_url_content_get(self, file_url, header): 227 | """ 228 | For the given GitHub url, call the api 229 | Get and return the response 230 | ### Need GitHub Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" 231 | 232 | params: api_url - string 233 | returns: response - string 234 | """ 235 | logger.debug("<<<< 'Current Executing Function' >>>>") 236 | 237 | token_key = "GITHUB_ENTERPRISE_TOKEN" 238 | if not os.getenv(token_key): 239 | logger.error( 240 | f"GitHub API Token Environment variable '{token_key}' not set. API Search will fail/return no results. Please Setup and retry" 241 | ) 242 | sys.exit(1) 243 | elif "<< Enterprise Name >>" in self._base_url: 244 | logger.error( 245 | f"GitHub API Content URL not set for Enterprise in xgg_configs.yaml file in config folder. API Search will fail/return no results. Please Setup and retry" 246 | ) 247 | sys.exit(1) 248 | 249 | try: 250 | time.sleep(self._throttle_time) 251 | response = requests.get( 252 | file_url, 253 | auth=("token", os.getenv(token_key)), 254 | headers=header, 255 | timeout=10, 256 | ) 257 | return response 258 | except Exception as e: 259 | logger.error(f"Github API file content get Error: {e}") 260 | 261 | return {} 262 | 263 | def get_github_public_commits(self, user_name, repo_name, file_path): 264 | """ 265 | For the given GitHub details, call the api and get commit details 266 | Get and return the response 267 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" 268 | params: commits_api_url - string 269 | returns: response - string 270 | """ 271 | logger.debug("<<<< 'Current Executing Function' >>>>") 272 | full_commit_url = self._commits_api_url % (user_name, repo_name, file_path) 273 | token_var = "GITHUB_TOKEN" 274 | if not os.getenv(token_var): 275 | logger.error( 276 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 277 | ) 278 | sys.exit(1) 279 | 280 | try: 281 | time.sleep(self._throttle_time) 282 | response = requests.get( 283 | full_commit_url, auth=("token", os.getenv(token_var)), timeout=25 284 | ) 285 | return response 286 | except Exception as e: 287 | logger.error(f"Github API commit content get Error: {e}") 288 | return {} 289 | 290 | def get_github_enterprise_commits(self, user_name, repo_name, file_path, header): 291 | """ 292 | For the given GitHub details, call the api and get commit details 293 | Get and return the response 294 | ### Need GitHub Enterprise Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" 295 | params: commits_api_url - string 296 | params: header - dict 297 | returns: response - string 298 | """ 299 | logger.debug("<<<< 'Current Executing Function' >>>>") 300 | 301 | token_var = "GITHUB_ENTERPRISE_TOKEN" 302 | if not os.getenv(token_var): 303 | logger.error( 304 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 305 | ) 306 | sys.exit(1) 307 | elif "<< Enterprise Name >>" in self._commits_api_url: 308 | logger.error( 309 | f"GitHub API Commits URL not set for Enterprise in xgg_configs.yaml file in config folder. 
API Search will fail/return no results. Please Setup and retry"
310 | )
311 | sys.exit(1)
312 | 
313 | try:
314 | time.sleep(self._throttle_time)
315 | full_commit_url = self._commits_api_url.format(
316 | user_name=user_name, repo_name=repo_name, file_path=file_path
317 | )
318 | response = requests.get(
319 | full_commit_url,
320 | auth=("token", os.getenv(token_var)),
321 | headers=header,
322 | timeout=25,
323 | )
324 | return response
325 | except Exception as e:
326 | logger.error(f"Github API commit content get Error: {e}")
327 | return {}
328 | -------------------------------------------------------------------------------- /xgitguard/common/logger.py: --------------------------------------------------------------------------------
1 | """
2 | Copyright 2021 Comcast Cable Communications Management, LLC
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
16 | SPDX-License-Identifier: Apache-2.0
17 | """
18 | 
19 | import logging
20 | import os
21 | from datetime import datetime
22 | 
23 | 
24 | def create_logger(log_level=20, console_logging=True, log_dir=None, log_file_name=None):
25 | """
26 | Create the logger and return it
27 | params: log_level - int - Default - 20
28 | params: console_logging - Boolean - Default - True
29 | params: log_dir - string - optional
30 | params: log_file_name - string - optional
31 | returns: logger - logging.Logger
32 | """
33 | logger_name = "xgg_logger"
34 | # Gets or creates a logger
35 | logger = logging.getLogger(logger_name)
36 | 
37 | # set log level
38 | logger.setLevel(log_level)
39 | 
40 | formatter = logging.Formatter(
41 | "[%(asctime)s] [ %(levelname)8s ] [%(filename)40s:%(funcName)30s] : %(message)s"
42 | )
43 | 
44 | # add file handler to logger
45 | logger.addHandler(set_file_handler(logger_name, formatter, log_dir, log_file_name))
46 | 
47 | if console_logging:
48 | logger.addHandler(set_console_handler(formatter))
49 | 
50 | return logger
51 | 
52 | 
53 | def set_file_handler(logger_name, formatter, log_dir, log_file_name):
54 | """Set up the file logging handler"""
55 | # define file handler and set formatter
56 | if not (log_dir and os.path.exists(log_dir)):
57 | # Fall back to the repository's logs directory when no valid
58 | # log_dir is given
59 | module_dir = os.path.dirname(os.path.realpath(__file__))
60 | log_dir = os.path.abspath(
61 | os.path.join(os.path.dirname(module_dir), ".", "logs")
62 | )
63 | if not log_file_name:
64 | log_file_name = f"{logger_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
65 | log_file = os.path.join(log_dir, log_file_name)
66 | file_handler = logging.FileHandler(log_file)
67 | file_handler.setFormatter(formatter)
68 | print(f"Current run logs file: {log_file}")
69 | return file_handler
70 | 
71 | 
72 | def set_console_handler(formatter):
73 | """Set up the console logging handler"""
74 | # define console handler and set formatter
75 | console_handler = logging.StreamHandler()
76 | console_handler.setFormatter(formatter)
77 | return console_handler
78 | 
79 | 
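# Consumer sketch: because create_logger() registers its handlers on the fixed
# logger name "xgg_logger", any module can attach to the same file and console
# sinks after a single create_logger() call at process start. The module name
# below is hypothetical; the pattern matches github_calls.py and ml_process.py.
#
#     # some_scanner.py
#     import logging
#     logger = logging.getLogger("xgg_logger")
#     logger.info("goes to the handlers configured at startup")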
80 | if __name__ == "__main__":
81 | from configs_read import ConfigsData
82 | 
83 | configs = ConfigsData()
84 | module_dir = os.path.dirname(os.path.realpath(__file__))
85 | log_dir = os.path.abspath(os.path.join(os.path.dirname(module_dir), ".", "logs"))
86 | 
87 | logger = create_logger(
88 | log_level=10,
89 | console_logging=False,
90 | log_dir=log_dir,
91 | log_file_name=f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
92 | )
93 | 
94 | logger.debug("A debug message")
95 | logger.info("An info message")
96 | logger.warning("Something is not right.")
97 | logger.error("A Major error has happened.")
98 | logger.critical("Fatal error. Cannot continue")
99 | -------------------------------------------------------------------------------- /xgitguard/common/ml_process.py: --------------------------------------------------------------------------------
1 | """
2 | Copyright 2021 Comcast Cable Communications Management, LLC
3 | 
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 | 
8 | http://www.apache.org/licenses/LICENSE-2.0
9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
16 | SPDX-License-Identifier: Apache-2.0
17 | """
18 | 
19 | import logging
20 | import os
21 | import sys
22 | import numpy as np
23 | import pandas as pd
24 | from scipy.stats import entropy
25 | 
26 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
27 | parent_dir = os.path.dirname(MODULE_DIR)
28 | sys.path.append(parent_dir)
29 | 
30 | from common.configs_read import ConfigsData
31 | from utilities.common_utilities import is_num_present, is_uppercase_present
32 | from utilities.file_utilities import read_pickle_file
33 | 
34 | logger = logging.getLogger("xgg_logger")
35 | 
36 | 
37 | def ml_prediction_process(model_name, training_data, detection_data, git_env=""):
38 | """
39 | For the given training data and detection data,
40 | format the detection and training data as the model needs,
41 | predict the detections using the pre-trained model and
42 | return the Dataframe of actual detections
43 | params: model_name - string; git_env - string - "public", "enterprise" or ""
44 | params: training_data - dataframe
45 | params: detection_data - dataframe - Detection Data
46 | returns: post_prediction_data - Dataframe - Actual detections
47 | """
48 | logger.debug("<<<< 'Current Executing Function' >>>>")
49 | pre_prediction_data = detection_data.copy()
50 | if git_env:
51 | if git_env == "public":
52 | detection_data = detection_data.drop(
53 | [
54 | "Source",
55 | "Primary_Key",
56 | "Commit_Details",
57 | "URL",
58 | "Owner",
59 | "Repo_Name",
60 | "Detected_Timestamp",
61 | "Year",
62 | "Month",
63 | "Day",
64 | ],
65 | axis=1,
66 | )
67 | else:
68 | detection_data = detection_data.drop(
69 | [
70 | "Source",
71 | "Commit_Details",
72 | "URL",
73 | "Owner",
74 | "Repo_Name",
75 | "Detected_Timestamp",
76 | "Year",
77 | "Month",
78 | "Day",
79 | ],
80 | axis=1,
81 | )
82 | else:
83 | detection_data = detection_data.drop(
84 | [
85 | "Source",
86 | "URL",
87 | "Detected_Timestamp",
88 | "Year",
89 | "Month",
90 | "Day",
91 | ],
92 | axis=1,
93 | )
94 | try:
95 | detection_data["Len_Key"] = detection_data.apply(
96 | lambda x: len(x["Secret"]), axis=1
97 | )
98 | detection_data["Len_Code"] = detection_data.apply(
99 | lambda x: len(x["Code"]), axis=1
100 | )
101 | detection_data["Has_Digit"] = detection_data.apply(
102 | lambda x: is_num_present(x["Secret"]), axis=1
103 | )
104 | detection_data["Has_Cap"] = detection_data.apply(
105 | lambda x: is_uppercase_present(x["Secret"]), axis=1
106 | )
107 | 
108 | detection_data = detection_data.drop(["Secret", "Code"], axis=1)
109 | train_dummies = pd.get_dummies(training_data)
110 | detection_dummies = pd.get_dummies(detection_data)
111 | train_dummies, detection_dummies = train_dummies.align(
112 | detection_dummies, join="left", axis=1
113 | )
114 | detection_dummies = detection_dummies.fillna(0)
115 | 
116 | output_dir = os.path.abspath(
117 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output")
118 | )
119 | model_file = os.path.join(output_dir, model_name)
120 | # Read the pre-trained model object
121 | rf = read_pickle_file(model_file)
122 | # Predict the current detection
123 | predictions = rf.predict(detection_dummies)
124 | indexes = [i for i, e in enumerate(predictions) if e != 0]
125 | post_prediction_data = pre_prediction_data.iloc[indexes, :]
126 | return post_prediction_data
127 | except Exception as e:
128 | logger.error(f"Error in predicting through model: {e}")
129 | post_prediction_data = pd.DataFrame()
130 | return post_prediction_data
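# Feature sketch (illustrative values, not project data): the four columns
# engineered above, computed for a single candidate detection. The helper
# functions are assumed to return truthy/falsy indicators.
#
#     secret, code = "Zx9abc123", 'aws_secret = "Zx9abc123"'
#     {"Len_Key": len(secret),                    # 9
#      "Len_Code": len(code),                     # 24
#      "Has_Digit": is_num_present(secret),       # truthy: contains 9, 1, 2, 3
#      "Has_Cap": is_uppercase_present(secret)}   # truthy: contains Z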
131 | 
132 | def entropy_calc(labels, base=None):
133 | """
134 | Calculates Shannon Entropy for the given labels
135 | params: labels - list
136 | params: base - Optional - logarithm base (natural log when None)
137 | returns: entropy value - float
138 | """
139 | # logger.debug("<<<< 'Current Executing Function' >>>>")
140 | _, counts = np.unique(labels, return_counts=True)
141 | return entropy(counts, base=base)
142 | -------------------------------------------------------------------------------- /xgitguard/config/confidence_values.csv: --------------------------------------------------------------------------------
1 | key,value
2 | --password,5
3 | --token,5
4 | ?access_token,5
5 | ?accesskeyid,5
6 | access_key,5
7 | access_key_id,5
8 | access_key_secret,5
9 | access_secret,5
10 | access_token,5
11 | account_sid,2
12 | algolia_api_key,3
13 | amazon_secret_access_key,5
14 | api_key,5
15 | api_key_secret,4
16 | api_key_sid,2
17 | app_token,1
18 | artifacts_bucket,1
19 | artifacts_secret,1
20 | ASPX,1
21 | atoken,1
22 | auth,4
23 | yml,5
24 | auth_token,5
25 | aws_access_key,5
26 | aws_access_key_id,2
27 | aws_secret_access_key,5
28 | aws_secret_key,5
29 | bintray_key,1
30 | codecov_token,2
31 | get_token,1
32 | mapbox_access_token,3
33 | agfa,1
34 | twig,2
35 | c,5
36 | csv,2
37 | aspx,3
38 | p12,4
39 | cf_password,2
40 | client_secret,5
41 | cloudflare_api_key,2
42 | conf,5
43 | config,5
44 | consumer_secret,5
45 | coveralls_repo_token,2
46 | coverity_scan_token,2
47 | cpp,5
48 | cred,1
49 | cs,1
50 | cshtml,1
51 | CSV,1
52 | customer_secret,2
53 | dat,1
54 | database_password,3
55 | datadog_api_key,1
56 | db_password,3
57 | db_pw,3
58 | deploy_password,3
59 | deploy_token,3
60 | docker_hub_password,3
61 | docker_key,2
62 | docker_pass,2
63 | docker_passwd,2
64 | docker_password,2
65 | dockerhubpassword,2
66 | ejs,2
67 | encryption_password,2
68 | erb,1
69 | fg,1
70 | file_password,1
71 | firebase_token,1
72 | ftp_password,5
73 | ftp_pw,5
74 | gh_token,2
75 | github_access_token,5
76 | github_api_key,5
77 | github_auth,5
78 | github_key,5
79 | github_oauth_token,5
80 | github_password,5
81 | github_pwd,5
82 | github_token,5
83 | gitignore,5
84 | go,5
85 | gpg_passphrase,2
86 | h,5
87 | heroku_api_key,2
88 | html,1
89 | ini,1
90 | ipynb,3
91 | java,5
92 | js,5
93 | json,3
94 | jsp,5
95 | 
jsx,5 96 | key,1 97 | keystore_pass,2 98 | log,1 99 | mysql_password,5 100 | npm_auth_token,4 101 | npm_token,2 102 | oauth_token,5 103 | os_password,3 104 | others,1 105 | ovpn,3 106 | pass,1 107 | passphrase,3 108 | password,4 109 | pem,5 110 | php,4 111 | phtml,1 112 | pkey,5 113 | plist,1 114 | ppk,4 115 | priv,2 116 | properties,5 117 | publish_key,3 118 | py,5 119 | pypi_password,5 120 | rb,1 121 | release_token,2 122 | repotoken,1 123 | rsa,4 124 | s3_access_key,5 125 | s3_access_key_id,5 126 | s3_key,5 127 | s3_secret_key,5 128 | sauce_access_key,2 129 | secret,4 130 | secret_key_base,1 131 | sh,5 132 | signing_key,1 133 | sonar_token,2 134 | sonatype_password,1 135 | sshpass,3 136 | sshpassword,4 137 | swift,1 138 | token,5 139 | ts,1 140 | txt,1 141 | user_secret,5 142 | private_key,5 143 | vue,1 144 | xhtml,1 145 | xml,2 146 | yaml,4 -------------------------------------------------------------------------------- /xgitguard/config/enterprise_keywords.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | -------------------------------------------------------------------------------- /xgitguard/config/extensions.csv: -------------------------------------------------------------------------------- 1 | type 2 | json 3 | py 4 | js 5 | java 6 | php 7 | xml 8 | others 9 | cpp 10 | cs 11 | cshtml 12 | ejs 13 | erb 14 | go 15 | h 16 | rb 17 | sh 18 | swift 19 | properties 20 | ovpn 21 | conf 22 | config 23 | ini 24 | plist 25 | yaml 26 | yml 27 | fg 28 | gitignore 29 | key 30 | p12 31 | pem 32 | pkey 33 | ppk 34 | priv 35 | rsa 36 | aspx 37 | c 38 | ts 39 | html 40 | ipynb 41 | jsp 42 | jsx 43 | phtml 44 | twig 45 | vue 46 | xhtml 47 | csv 48 | dat 49 | log 50 | txt -------------------------------------------------------------------------------- /xgitguard/config/primary_keywords.csv: -------------------------------------------------------------------------------- 1 | primary_keys 2 | 3 | -------------------------------------------------------------------------------- /xgitguard/config/public_keywords.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | -------------------------------------------------------------------------------- /xgitguard/config/secondary_creds.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | password 3 | --token 4 | ?access_token 5 | ?accesskeyid 6 | access_key_id 7 | access_key_secret 8 | access_key 9 | access_secret 10 | access_token 11 | account_sid 12 | api_key_secret 13 | api_key_sid 14 | api_key 15 | app_token 16 | artifacts_bucket 17 | artifacts_secret 18 | atoken 19 | auth_token 20 | auth 21 | cf_password 22 | ci_deploy_password 23 | cloudflare_api_key 24 | codecov_token 25 | coveralls_repo_token 26 | coverity_scan_token 27 | cred 28 | database_password 29 | datadog_api_key 30 | db_password 31 | db_pw 32 | deploy_password 33 | deploy_token 34 | docker_hub_password 35 | docker_key 36 | docker_pass 37 | docker_passwd 38 | docker_password 39 | dockerhubpassword 40 | encryption_password 41 | file_password 42 | firebase_token 43 | ftp_password 44 | ftp_pw 45 | github_password 46 | github_pwd 47 | gpg_passphrase 48 | key 49 | passphrase 50 | keystore_pass 51 | mapbox_access_token 52 | mysql_password 53 | npm_auth_token 54 | npm_token 55 | oauth_token 56 | os_password 57 | pass 58 | password 59 | publish_key 60 | pypi_password 61 | release_token 62 | repotoken 63 | sauce_access_key 64 | secret_key_base 65 | secret 
66 | signing_key 67 | sonar_token 68 | sonatype_password 69 | sshpass 70 | token 71 | twine_password 72 | customer_secret 73 | consumer_secret 74 | --password 75 | sshpassword 76 | private_key -------------------------------------------------------------------------------- /xgitguard/config/secondary_keys.csv: -------------------------------------------------------------------------------- 1 | keyword 2 | token 3 | --token 4 | ?access_token 5 | ?accesskeyid 6 | access_key_id 7 | access_key_secret 8 | access_key 9 | access_secret 10 | access_token 11 | user_secret 12 | customer_secret 13 | consumer_secret 14 | client_secret 15 | account_sid 16 | agfa 17 | algolia_api_key 18 | amazon_secret_access_key 19 | api_key_secret 20 | api_key_sid 21 | api_key 22 | app_token 23 | artifacts_bucket 24 | artifacts_secret 25 | atoken 26 | auth_token 27 | auth 28 | aws_access_key_id 29 | aws_access_key 30 | aws_secret_access_key 31 | aws_secret_key 32 | bintray_key 33 | cloudflare_api_key 34 | codecov_token 35 | coveralls_repo_token 36 | coverity_scan_token 37 | cred 38 | datadog_api_key 39 | deploy_token 40 | docker_key 41 | firebase_token 42 | gh_token 43 | github_access_token 44 | get_token 45 | github_api_key 46 | github_auth 47 | github_key 48 | github_oauth_token 49 | github_token 50 | heroku_api_key 51 | key 52 | mapbox_access_token 53 | npm_auth_token 54 | npm_token 55 | oauth_token 56 | publish_key 57 | release_token 58 | repotoken 59 | s3_access_key_id 60 | s3_access_key 61 | s3_key 62 | s3_secret_key 63 | sauce_access_key 64 | secret_key_base 65 | secret 66 | signing_key 67 | sonar_token -------------------------------------------------------------------------------- /xgitguard/config/stop_words.csv: -------------------------------------------------------------------------------- 1 | Stop Words 2 | static 3 | static1 4 | static2 5 | static5 6 | static6 7 | static4 8 | static3 9 | server2 10 | images2 11 | secure1 12 | zipCode 13 | streetAddress 14 | forever 15 | secure 16 | tracking 17 | malloc 18 | calloc 19 | realloc 20 | memcpy 21 | int 22 | float 23 | char 24 | sizeof 25 | http 26 | https 27 | def 28 | class 29 | list 30 | str 31 | tuple 32 | dict 33 | collections 34 | this 35 | else 36 | if 37 | elif 38 | dynamic 39 | free 40 | pointer 41 | func 42 | are 43 | counters 44 | struct 45 | type 46 | memptr 47 | memcmp 48 | heap 49 | stack 50 | ptr 51 | apache 52 | nginx 53 | ram 54 | rom 55 | tcp 56 | ip 57 | ping 58 | icmp 59 | bgp 60 | header 61 | protocol -------------------------------------------------------------------------------- /xgitguard/config/xgg_configs.yaml: -------------------------------------------------------------------------------- 1 | # xGitGuard Input Configurations 2 | default: 3 | log_dir: None 4 | 5 | github: 6 | throttle_time: 10 7 | # GitHub Public 8 | public_api_url: "https://api.github.com/search/code" 9 | public_commits_url: "https://api.github.com/repos/%s/%s/commits?path=%s" 10 | 11 | # GitHub Enterprise - For Open Source 12 | enterprise_api_url: "https://github.<< Enterprise Name >>.com/api/v3/search/code" 13 | enterprise_pre_url: "https://github.<< Enterprise Name >>.com/api/v3/repos/" 14 | url_validator: "https://github.<< Enterprise Name >>.com/api/v3/search/code" 15 | enterprise_commits_url: "https://github.<< Enterprise Name >>.com/api/v3/repos/{user_name}/{repo_name}/commits?path={file_path}" 16 | enterprise_header: { "Accept": "application/vnd.github.v3.raw" } 17 | 18 | model: 19 | # Model Configurations 20 | 21 | # GitHub Public 22 | public: 23 | 
training_data_key: "public_key_train.csv" 24 | training_data_cred: "public_cred_train.csv" 25 | model_key_file: "public_xgg_key_rf_model_object.pickle" 26 | model_cred_file: "public_xgg_cred_rf_model_object.pickle" 27 | 28 | # GitHub Enterprise 29 | enterprise: 30 | training_data_key: "key_train.csv" 31 | training_data_cred: "cred_train.csv" 32 | model_key_file: "xgg_key_rf_model_object.pickle" 33 | model_cred_file: "xgg_cred_rf_model_object.pickle" 34 | 35 | secrets: 36 | public_data_columns: 37 | [ 38 | "Source", 39 | "Primary_Key", 40 | "Second_Key", 41 | "Extension", 42 | "URL", 43 | "Owner", 44 | "Repo_Name", 45 | "Commit_Details", 46 | "Secret", 47 | "Code", 48 | "Detected_Timestamp", 49 | "Key_Weight", 50 | "SKey_Count", 51 | "Entropy", 52 | "Dictionary_Similarity", 53 | "Score", 54 | "Year", 55 | "Month", 56 | "Day", 57 | "Hour", 58 | ] 59 | enterprise_data_columns: 60 | [ 61 | "Source", 62 | "Second_Key", 63 | "Extension", 64 | "URL", 65 | "Owner", 66 | "Repo_Name", 67 | "Commit_Details", 68 | "Secret", 69 | "Code", 70 | "Detected_Timestamp", 71 | "Key_Weight", 72 | "SKey_Count", 73 | "Entropy", 74 | "Dictionary_Similarity", 75 | "Score", 76 | "Year", 77 | "Month", 78 | "Day", 79 | "Hour", 80 | ] 81 | enterprise_data_collector_columns: 82 | [ 83 | "Source", 84 | "Second_Key", 85 | "Extension", 86 | "URL", 87 | "Owner", 88 | "Repo_Name", 89 | "Secret", 90 | "Code", 91 | "Detected_Timestamp", 92 | "Key_Weight", 93 | "SKey_Count", 94 | "Entropy", 95 | "Dictionary_Similarity", 96 | "Score", 97 | "Year", 98 | "Month", 99 | "Day", 100 | "Hour", 101 | ] 102 | public_data_collector_columns: 103 | [ 104 | "Source", 105 | "Primary_Key", 106 | "Second_Key", 107 | "Extension", 108 | "URL", 109 | "Owner", 110 | "Repo_Name", 111 | "Secret", 112 | "Code", 113 | "Detected_Timestamp", 114 | "Key_Weight", 115 | "SKey_Count", 116 | "Entropy", 117 | "Dictionary_Similarity", 118 | "Score", 119 | "Year", 120 | "Month", 121 | "Day", 122 | "Hour", 123 | ] 124 | 125 | file_scanner: 126 | local_file_scan_detection_columns: 127 | [ 128 | "Source", 129 | "Second_Key", 130 | "Extension", 131 | "URL", 132 | "Secret", 133 | "Code", 134 | "Detected_Timestamp", 135 | "Key_Weight", 136 | "SKey_Count", 137 | "Entropy", 138 | "Dictionary_Similarity", 139 | "Score", 140 | "Year", 141 | "Month", 142 | "Day", 143 | "Hour", 144 | ] 145 | unique_columns: ["Source", "Second_Key", "Extension", "URL", "Code"] 146 | 147 | keywords: 148 | public_data_columns: 149 | [ 150 | "Source", 151 | "Second_Key", 152 | "URL", 153 | "Owner", 154 | "Repo_Name", 155 | "Commit_Details", 156 | "Detected_Timestamp", 157 | "Year", 158 | "Month", 159 | "Day", 160 | "Hour", 161 | ] 162 | enterprise_data_columns: 163 | [ 164 | "Source", 165 | "Second_Key", 166 | "URL", 167 | "Owner", 168 | "Repo_Name", 169 | "Commit_Details", 170 | "Detected_Timestamp", 171 | "Year", 172 | "Month", 173 | "Day", 174 | "Hour", 175 | ] 176 | -------------------------------------------------------------------------------- /xgitguard/config/xgg_search_paths.csv: -------------------------------------------------------------------------------- 1 | scan_paths -------------------------------------------------------------------------------- /xgitguard/custom keyword search/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/custom keyword search/__init__.py 
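As a quick orientation before the search drivers that follow: the URL templates in xgg_configs.yaml expand with plain "%"-interpolation on the public side and str.format on the enterprise side, matching how GithubCalls builds commit URLs. A sketch (paths and names are illustrative; assumes PyYAML is installed):

    import yaml

    with open("xgitguard/config/xgg_configs.yaml") as f:
        cfg = yaml.safe_load(f)

    public = cfg["github"]["public_commits_url"] % ("octo-user", "octo-repo", "src/app.py")
    enterprise = cfg["github"]["enterprise_commits_url"].format(
        user_name="octo-user", repo_name="octo-repo", file_path="src/app.py"
    )
    # public     -> https://api.github.com/repos/octo-user/octo-repo/commits?path=src/app.py
    # enterprise -> https://github.<< Enterprise Name >>.com/api/v3/repos/octo-user/octo-repo/commits?path=src/app.py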
-------------------------------------------------------------------------------- /xgitguard/custom keyword search/enterprise_keyword_search.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import hashlib
3 | import os
4 | import sys
5 | import pandas as pd
6 | import time
7 | from datetime import datetime
8 | 
9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
10 | 
11 | parent_dir = os.path.dirname(MODULE_DIR)
12 | sys.path.insert(0, parent_dir)
13 | 
14 | from common.configs_read import ConfigsData
15 | from common.data_format import (
16 | format_commit_details,
17 | )
18 | from common.github_calls import GithubCalls
19 | from common.logger import create_logger
20 | from utilities.common_utilities import check_github_token_env
21 | from utilities.file_utilities import write_to_csv_file
22 | 
23 | file_prefix = "xgg_"
24 | 
25 | 
26 | def format_detection(skeyword, org_url, url):
27 | """
28 | Format the data from the given content and other data
29 | params: skeyword - string - Secondary Keyword
30 | params: org_url - string - github url
31 | params: url - string - github url
32 | returns: secrets_data_list - list - List of formatted detections
33 | """
34 | logger.debug("<<<< 'Current Executing Function' >>>>")
35 | secrets_data_list = []
36 | secret_data = []
37 | 
38 | user_name = org_url.split("/")[3]
39 | repo_name = org_url.split("/")[4]
40 | 
41 | try:
42 | file_path = url.split("/contents/")[1]
43 | header = configs.xgg_configs["github"]["enterprise_header"]
44 | api_response_commit_data = githubCalls.get_github_enterprise_commits(
45 | user_name, repo_name, file_path, header
46 | )
47 | commit_details = format_commit_details(api_response_commit_data)
48 | except Exception as e:
49 | logger.warning(f"Github commit content formation error: {e}")
50 | commit_details = {}
51 | 
52 | secret_data.insert(0, commit_details)
53 | secret_data.insert(0, repo_name)
54 | secret_data.insert(0, user_name)
55 | secret_data.insert(0, org_url)
56 | secret_data.insert(0, skeyword)
57 | secret_data.insert(0, "xGG_Enterprise")
58 | valid_secret_row = [value for value in secret_data]
59 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
60 | now = datetime.now()
61 | valid_secret_row.append(now.year)
62 | valid_secret_row.append(now.month)
63 | valid_secret_row.append(now.day)
64 | valid_secret_row.append(now.hour)
65 | secrets_data_list.append(valid_secret_row)
66 | return secrets_data_list
67 | 
68 | 
69 | def process_search_urls(org_urls_list, url_list, search_query):
70 | """
71 | Process the search result urls and format the keyword detections
72 | params: org_urls_list - list - list of html urls to get code content
73 | params: url_list - list - list of html urls to get code content
74 | params: search_query - string
75 | returns: secrets_data_list - list - Detected secrets data
76 | """
77 | logger.debug("<<<< 'Current Executing Function' >>>>")
78 | # Processes search findings
79 | skeyword = search_query.split('"')[1].strip()
80 | secrets_data_list = []
81 | try:
82 | for url in url_list:
83 | org_url = org_urls_list[url_list.index(url)]
84 | secret_data_list = format_detection(skeyword, org_url, url)
85 | if secret_data_list:
86 | for secret_data in secret_data_list:
87 | secrets_data_list.append(secret_data)
88 | except Exception as e:
89 | logger.error(f"Total Process Search (Exception Error): {e}")
90 | return secrets_data_list
91 | 
92 | 
93 | def check_existing_detections(org_url_list, url_list, search_query):
94 | """
95 | Check whether the current urls were processed in
previous runs 96 | for each url in url list 97 | create hex hash value for the url 98 | check the url hash in previous detected urls 99 | if not present add them to further process 100 | skip if its already present in detected urls 101 | params:org_url_list - List - List of search org urls 102 | params: url_list - List - List of search result urls 103 | params: search_query - String - Search query string 104 | 105 | returns: new_urls_list - List - New url list 106 | returns: new_hashed_urls - List - New Url Hash detected 107 | """ 108 | logger.debug("<<<< 'Current Executing Function' >>>>") 109 | new_org_url_list, new_urls_list, new_hashed_urls = [], [], [] 110 | global file_prefix 111 | # Get the Already predicted hashed url list if present 112 | try: 113 | # for Reading training Data only one time 114 | if configs.hashed_urls: 115 | pass 116 | except: 117 | configs.read_hashed_url( 118 | file_name=file_prefix + "enterprise_hashed_url_custom_keywords.csv" 119 | ) 120 | 121 | if url_list: 122 | for url in url_list: 123 | url_to_hash = url + search_query 124 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 125 | new_hashed_url = [] 126 | if not hashed_url in configs.hashed_urls: 127 | new_org_url_list.append(org_url_list[url_list.index(url)]) 128 | new_urls_list.append(url) 129 | new_hashed_url.append(hashed_url) 130 | new_hashed_url.append(url) 131 | if new_hashed_url: 132 | new_hashed_urls.append(new_hashed_url) 133 | return new_org_url_list, new_urls_list, new_hashed_urls 134 | 135 | 136 | def process_search_results(search_response_lines, search_query): 137 | """ 138 | params: search_response_lines - list 139 | params: search_query - string 140 | 141 | returns: detection_writes_per_query - int - Total detections written to file 142 | returns: new_results_per_query - int - No of new urls per query 143 | returns: detections_per_query - int - No of detections per search 144 | """ 145 | logger.debug("<<<< 'Current Executing Function' >>>>") 146 | detection_writes_per_query = 0 147 | new_results_per_query = 0 148 | detections_per_query = 0 149 | new_hashed_urls = [] 150 | global file_prefix 151 | 152 | url_list, org_url_list = [], [] 153 | 154 | hashed_urls_file = os.path.join( 155 | configs.output_dir, file_prefix + "enterprise_hashed_url_custom_keywords.csv" 156 | ) 157 | for line in search_response_lines: 158 | html_url = line["html_url"] 159 | org_url_list.append(html_url) 160 | html_url = ( 161 | configs.xgg_configs["github"]["enterprise_pre_url"] 162 | + line["repository"]["full_name"] 163 | + "/contents/" 164 | + line["path"] 165 | ) 166 | url_list.append(html_url) 167 | 168 | if url_list: 169 | # Check if current url is processed in previous runs 170 | new_org_urls_list, new_urls_list, new_hashed_urls = check_existing_detections( 171 | org_url_list, url_list, search_query 172 | ) 173 | new_results_per_query = len(new_urls_list) 174 | if new_hashed_urls: 175 | secrets_detected = process_search_urls( 176 | new_org_urls_list, new_urls_list, search_query 177 | ) 178 | detections_per_query += len(secrets_detected) 179 | if secrets_detected: 180 | try: 181 | logger.debug( 182 | f"Current secrets_detected count: {len(secrets_detected)}" 183 | ) 184 | secrets_detected_df = pd.DataFrame( 185 | secrets_detected, 186 | columns=configs.xgg_configs["keywords"][ 187 | "enterprise_data_columns" 188 | ], 189 | ) 190 | detection_writes_per_query += secrets_detected_df.shape[0] 191 | try: 192 | secrets_detected_file = os.path.join( 193 | configs.output_dir, 194 | 
"xgg_enterprise_custom_keywords_detected.csv", 195 | ) 196 | write_to_csv_file(secrets_detected_df, secrets_detected_file) 197 | except Exception as e: 198 | logger.error(f"Process Error: {e}") 199 | except Exception as e: 200 | logger.error(f"keywords Dataframe creation failed. Error: {e}") 201 | secrets_detected_df = pd.DataFrame( 202 | columns=configs.xgg_configs["keywords"][ 203 | "enterprise_data_columns" 204 | ], 205 | ) 206 | 207 | else: 208 | logger.info("No keywords in current search results") 209 | 210 | try: 211 | new_hashed_urls_df = pd.DataFrame( 212 | new_hashed_urls, columns=["hashed_url", "url"] 213 | ) 214 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 215 | except Exception as e: 216 | logger.error(f"File Write error: {e}") 217 | sys.exit(1) 218 | else: 219 | logger.info( 220 | f"All {len(url_list)} urls in current search is already processed and hashed" 221 | ) 222 | else: 223 | logger.info(f"No valid html urls in the current search results to process.") 224 | return detection_writes_per_query, new_results_per_query, detections_per_query 225 | 226 | 227 | def format_search_query_list(secondary_keywords): 228 | """ 229 | Create the search query list using Secondary Keywords 230 | params: secondary_keywords - list 231 | returns: search_query_list - list 232 | """ 233 | logger.debug("<<<< 'Current Executing Function' >>>>") 234 | search_query_list = [] 235 | # Format GitHub Search Query 236 | for secondary_keyword in secondary_keywords: 237 | search_query_list.append('"' + secondary_keyword + '"') 238 | logger.info(f"Total number of items in search_query_list: {len(search_query_list)}") 239 | return search_query_list 240 | 241 | 242 | def run_detection(enterprise_keywords=[], org=[], repo=[]): 243 | """ 244 | Run GitHub search 245 | If a Enterprise keyword is provided, perform the search using the Enterprise keyword. 
242 | def run_detection(enterprise_keywords=[], org=[], repo=[]):
243 | """
244 | Run GitHub search
245 | If an Enterprise keyword is provided, perform the search using that keyword.
246 | params: enterprise_keywords - list - optional
247 | params: org - list - optional
248 | params: repo - list - optional
249 | returns: True or False
250 | 
251 | """
252 | if enterprise_keywords:
253 | if isinstance(enterprise_keywords, list):
254 | configs.secondary_keywords = enterprise_keywords
255 | else:
256 | logger.error(
257 | "Please pass enterprise_keywords as a list, e.g. ['password']"
258 | )
259 | sys.exit(1)
260 | else:
261 | # Get the enterprise_keywords from enterprise_keywords file
262 | configs.read_secondary_keywords(file_name="enterprise_keywords.csv")
263 | logger.info(f"Total Enterprise keywords : {len(configs.secondary_keywords)}")
264 | 
265 | total_search_pairs = len(configs.secondary_keywords)
266 | logger.info(f"Total Search Pairs: {total_search_pairs}")
267 | 
268 | total_processed_search, total_detection_writes = 0, 0
269 | search_query_list = []
270 | # Format GitHub Search Query List
271 | search_query_list = format_search_query_list(configs.secondary_keywords)
272 | logger.info(f"Total search_query_list count: {len(search_query_list)}")
273 | 
274 | # Loop over each search query
275 | for search_query in search_query_list:
276 | detection_writes_per_query = 0
277 | new_results_per_query = 0
278 | detections_per_query = 0
279 | logger.info(f"******* Processing Search Query: {search_query} *******")
280 | try:
281 | # Search GitHub and return the search response
282 | total_processed_search += 1
283 | # time.sleep(2)
284 | search_response_lines = githubCalls.run_github_search(
285 | search_query,
286 | "",
287 | org,
288 | repo,
289 | )
290 | # If search has detections, process the result urls else continue next search
291 | if search_response_lines:
292 | (
293 | detection_writes_per_query,
294 | new_results_per_query,
295 | detections_per_query,
296 | ) = process_search_results(
297 | search_response_lines,
298 | search_query,
299 | )
300 | logger.info(
301 | f"Detection writes in current search query: {detection_writes_per_query}"
302 | )
303 | total_detection_writes += detection_writes_per_query
304 | else:
305 | logger.info(
306 | f"Search '{search_query}' returns no results. Continuing..."
307 | ) 308 | continue 309 | except Exception as e: 310 | logger.error(f"Process Error: {e}") 311 | logger.info(f"Current Total Processed Search: {total_processed_search}") 312 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 313 | 314 | if new_results_per_query >= 0: 315 | logger.info( 316 | f"Total: {total_search_pairs} " + f"Processed: {total_processed_search} " 317 | ) 318 | 319 | return True 320 | 321 | 322 | def setup_logger(log_level=10, console_logging=True): 323 | """ 324 | Call logger create module and setup the logger for current run 325 | params: log_level - int - optional - Default - 20 - INFO 326 | params: console_logging - Boolean - optional - Enable console logging - default True 327 | """ 328 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 329 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 330 | global logger 331 | # Creates a logger 332 | logger = create_logger( 333 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 334 | ) 335 | 336 | 337 | def arg_parser(): 338 | """ 339 | Parse the command line Arguments and return the values 340 | params: None 341 | returns: enterprise_keywords - list 342 | returns: org - list 343 | returns: repo - list 344 | returns: log_level - int - Default - 20 - INFO 345 | returns: console_logging - Boolean - Default - True 346 | """ 347 | global file_prefix 348 | 349 | argparser = argparse.ArgumentParser() 350 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 351 | log_level_choices = [10, 20, 30, 40, 50] 352 | argparser.add_argument( 353 | "-e", 354 | "--enterprise_keywords", 355 | metavar="Enterprise Keywords", 356 | action="store", 357 | type=str, 358 | default="", 359 | help="Pass the Enterprise Keywords list as comma separated string", 360 | ) 361 | 362 | argparser.add_argument( 363 | "-o", 364 | "--org", 365 | metavar="Owner", 366 | action="store", 367 | type=str, 368 | default="", 369 | help="Pass the Org name list as comma separated string", 370 | ) 371 | 372 | argparser.add_argument( 373 | "-r", 374 | "--repo", 375 | metavar="Repo", 376 | action="store", 377 | type=str, 378 | default="", 379 | help="Pass the repo name list as comma separated string", 380 | ) 381 | 382 | argparser.add_argument( 383 | "-l", 384 | "--log_level", 385 | metavar="Logger Level", 386 | action="store", 387 | type=int, 388 | default=20, 389 | choices=log_level_choices, 390 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 391 | ) 392 | 393 | argparser.add_argument( 394 | "-c", 395 | "--console_logging", 396 | metavar="Console Logging", 397 | action="store", 398 | type=str, 399 | default="Yes", 400 | choices=flag_choices, 401 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 402 | ) 403 | 404 | args = argparser.parse_args() 405 | 406 | if args.enterprise_keywords: 407 | enterprise_keywords = args.enterprise_keywords.split(",") 408 | else: 409 | enterprise_keywords = [] 410 | 411 | if args.org: 412 | org = args.org.split(",") 413 | else: 414 | org = [] 415 | 416 | if args.repo: 417 | if len(org) <= 0: 418 | repo = args.repo.split(",") 419 | else: 420 | repo = [] 421 | else: 422 | repo = [] 423 | 424 | if args.log_level in log_level_choices: 425 | log_level = args.log_level 426 | else: 427 | log_level = 20 428 | if args.console_logging.lower() in flag_choices[:5]: 429 | console_logging = True 430 | else: 431 | console_logging = False 432 | 433 | return ( 434 | enterprise_keywords, 435 | org, 436 | repo, 437 | log_level, 438 | console_logging, 439 | ) 440 | 441 | 442 | if __name__ == "__main__": 443 | # Argument Parsing 444 | ( 445 | enterprise_keywords, 446 | org, 447 | repo, 448 | log_level, 449 | console_logging, 450 | ) = arg_parser() 451 | 452 | # Setting up Logger 453 | setup_logger(log_level, console_logging) 454 | 455 | logger.info("xGitGuard Custom keyword search Process Started") 456 | 457 | # Read and Setup Global Configuration Data to reference in all process 458 | configs = ConfigsData() 459 | githubCalls = GithubCalls( 460 | configs.xgg_configs["github"]["enterprise_api_url"], 461 | "enterprise", 462 | configs.xgg_configs["github"]["enterprise_commits_url"], 463 | ) 464 | 465 | # Check if the GitHub API token environment variable for "enterprise" is set 466 | valid_config, token_var = check_github_token_env("enterprise") 467 | if not valid_config: 468 | logger.error( 469 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 470 | ) 471 | sys.exit(1) 472 | 473 | run_detection(enterprise_keywords, org, repo) 474 | logger.info("xGitGuard Custom keyword search Process Completed") 475 | -------------------------------------------------------------------------------- /xgitguard/custom keyword search/public_keyword_search.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import hashlib 3 | import os 4 | import sys 5 | import pandas as pd 6 | import time 7 | from datetime import datetime 8 | 9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 10 | 11 | parent_dir = os.path.dirname(MODULE_DIR) 12 | sys.path.insert(0, parent_dir) 13 | 14 | from common.configs_read import ConfigsData 15 | from common.data_format import ( 16 | format_commit_details, 17 | ) 18 | from common.github_calls import GithubCalls 19 | from common.logger import create_logger 20 | from utilities.common_utilities import check_github_token_env 21 | from utilities.file_utilities import write_to_csv_file 22 | 23 | file_prefix = "xgg_" 24 | 25 | 26 | def format_search_query_list(secondary_keywords): 27 | """ 28 | Create the search query list using Secondary Keywords 29 | params: secondary_keywords - list 30 | returns: search_query_list - list 31 | """ 32 | logger.debug("<<<< 'Current Executing Function' >>>>") 33 | search_query_list = [] 34 | # Format GitHub Search Query 35 | for secondary_keyword in secondary_keywords: 36 | search_query_list.append('"' + secondary_keyword + '"') 37 | logger.info(f"Total number of items in search_query_list: {len(search_query_list)}") 38 | return search_query_list 39 | 40 | 41 | def format_detection(skeyword, url): 42 | """ 43 | Format the data from the given content and other data 44 | params: skeyword 
- string - Secondary Keyword 45 | params: url - string - github url 46 | returns: secrets_data_list - list - List of formatted detections 47 | """ 48 | logger.debug("<<<< 'Current Executing Function' >>>>") 49 | secrets_data_list = [] 50 | secret_data = [] 51 | user_name = url.split("/")[3] 52 | repo_name = url.split("/")[4] 53 | raw_url = url.replace("raw.githubusercontent.com", "github.com") 54 | raw_url_splits = raw_url.split(repo_name) 55 | raw_url = raw_url_splits[0] + repo_name + "/blob" + raw_url_splits[1] 56 | try: 57 | file_path = "/".join(raw_url_splits[1].split("/")[2:]) 58 | api_response_commit_data = githubCalls.get_github_public_commits( 59 | user_name, repo_name, file_path 60 | ) 61 | commit_details = format_commit_details(api_response_commit_data) 62 | except Exception as e: 63 | logger.warning(f"Github commit content formation error: {e}") 64 | commit_details = {} 65 | secret_data.insert(0, commit_details) 66 | secret_data.insert(0, repo_name) 67 | secret_data.insert(0, user_name) 68 | secret_data.insert(0, raw_url) 69 | secret_data.insert(0, skeyword) 70 | secret_data.insert(0, "xGG_Public") 71 | valid_secret_row = [value for value in secret_data] 72 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 73 | now = datetime.now() 74 | valid_secret_row.append(now.year) 75 | valid_secret_row.append(now.month) 76 | valid_secret_row.append(now.day) 77 | valid_secret_row.append(now.hour) 78 | secrets_data_list.append(valid_secret_row) 79 | logger.debug(f"Current formatted secrets_data_list count: {len(secrets_data_list)}") 80 | 81 | return secrets_data_list 82 | 83 | 84 | def process_search_urls(url_list, search_query): 85 | """ 86 | params: url_list - list - list of html urls to get code content 87 | params: search_query - string 88 | returns: secrets_data_list - list - Detected secrets data 89 | """ 90 | logger.debug("<<<< 'Current Executing Function' >>>>") 91 | # Processes search findings 92 | skeyword = search_query 93 | secrets_data_list = [] 94 | try: 95 | for url in url_list: 96 | secret_data_list = format_detection( 97 | skeyword, 98 | url, 99 | ) 100 | if secret_data_list: 101 | for secret_data in secret_data_list: 102 | secrets_data_list.append(secret_data) 103 | except Exception as e: 104 | logger.error(f"Total Process Search (Exception Error): {e}") 105 | return secrets_data_list 106 | 107 | 108 | def check_existing_detections(url_list, search_query): 109 | """ 110 | Check whether the current urs where processed in previous runs 111 | for each url in url list 112 | create hex hash value for the url 113 | check the url hash in previous detected urls 114 | if not present add them to further process 115 | skip if its already present in detected urls 116 | params: url_list - List - List of search result urls 117 | params: search_query - String - Search query string 118 | returns: new_urls_list - List - New url list 119 | returns: new_hashed_urls - List - New Url Hash detected 120 | """ 121 | logger.debug("<<<< 'Current Executing Function' >>>>") 122 | new_urls_list, new_hashed_urls = [], [] 123 | global file_prefix 124 | # Get the Already predicted hashed url list if present 125 | try: 126 | # for Reading training Data only one time 127 | if configs.hashed_urls: 128 | pass 129 | except: 130 | configs.read_hashed_url( 131 | file_name=file_prefix + "public_hashed_url_custom_keywords.csv" 132 | ) 133 | if url_list: 134 | for url in url_list: 135 | url_to_hash = url + search_query 136 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 137 | 
new_hashed_url = [] 138 | if not hashed_url in configs.hashed_urls: 139 | new_urls_list.append(url) 140 | new_hashed_url.append(hashed_url) 141 | new_hashed_url.append(url) 142 | if new_hashed_url: 143 | new_hashed_urls.append(new_hashed_url) 144 | return new_urls_list, new_hashed_urls 145 | 146 | 147 | def process_search_results(search_response_lines, search_query): 148 | """ 149 | params: search_response_lines - list 150 | params: search_query - string 151 | 152 | returns: detection_writes_per_query - int - Total detections written to file 153 | returns: new_results_per_query - int - No of new urls per query 154 | returns: detections_per_query - int - No of detections per search 155 | """ 156 | logger.debug("<<<< 'Current Executing Function' >>>>") 157 | detection_writes_per_query = 0 158 | new_results_per_query = 0 159 | detections_per_query = 0 160 | new_hashed_urls = [] 161 | global file_prefix 162 | url_list = [] 163 | hashed_urls_file = os.path.join( 164 | configs.output_dir, file_prefix + "public_hashed_url_custom_keywords.csv" 165 | ) 166 | for line in search_response_lines: 167 | html_url = line["html_url"] 168 | html_url = html_url.replace("blob/", "") 169 | html_url = html_url.replace( 170 | "https://github.com", "https://raw.githubusercontent.com" 171 | ) 172 | url_list.append(html_url) 173 | if url_list: 174 | # Check if current url is processed in previous runs 175 | new_urls_list, new_hashed_urls = check_existing_detections( 176 | url_list, search_query 177 | ) 178 | new_results_per_query = len(new_urls_list) 179 | if new_hashed_urls: 180 | secrets_detected = process_search_urls(new_urls_list, search_query) 181 | detections_per_query += len(secrets_detected) 182 | if secrets_detected: 183 | try: 184 | logger.debug( 185 | f"Current secrets_detected count: {len(secrets_detected)}" 186 | ) 187 | secrets_detected_df = pd.DataFrame( 188 | secrets_detected, 189 | columns=configs.xgg_configs["keywords"]["public_data_columns"], 190 | ) 191 | detection_writes_per_query += secrets_detected_df.shape[0] 192 | try: 193 | secrets_detected_file = os.path.join( 194 | configs.output_dir, 195 | "xgg_public_custom_keywords_detected.csv", 196 | ) 197 | write_to_csv_file(secrets_detected_df, secrets_detected_file) 198 | except Exception as e: 199 | logger.error(f"Process Error: {e}") 200 | except Exception as e: 201 | logger.error(f"Keywords Dataframe creation failed. Error: {e}") 202 | secrets_detected_df = pd.DataFrame( 203 | columns=configs.xgg_configs["keywords"]["public_data_columns"], 204 | ) 205 | else: 206 | logger.info("No keywords in current search results") 207 | try: 208 | new_hashed_urls_df = pd.DataFrame( 209 | new_hashed_urls, columns=["hashed_url", "url"] 210 | ) 211 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 212 | except Exception as e: 213 | logger.error(f"File Write error: {e}") 214 | sys.exit(1) 215 | else: 216 | logger.info( 217 | f"All {len(url_list)} urls in current search is already processed and hashed" 218 | ) 219 | else: 220 | logger.info(f"No valid html urls in the current search results to process.") 221 | return detection_writes_per_query, new_results_per_query, detections_per_query 222 | 223 | 224 | def run_detection(public_keywords=[], org=[], repo=[]): 225 | """ 226 | Run GitHub search 227 | If a primary keyword is provided, perform the search using the primary keyword. 
228 | params: public_keywords - list - optional 229 | params: org - list - optional 230 | params: repo - list - optional 231 | returns: True or False 232 | """ 233 | if public_keywords: 234 | if isinstance(public_keywords, list): 235 | configs.secondary_keywords = public_keywords 236 | else: 237 | logger.error(f"Please pass public_keywords in List like '['password',]'") 238 | sys.exit(1) 239 | else: 240 | # Get the secondary_keywords from secondary_keywords file 241 | configs.read_secondary_keywords(file_name="public_keywords.csv") 242 | logger.info(f"Total Public keywords : {len(configs.secondary_keywords)}") 243 | 244 | total_search_pairs = len(configs.secondary_keywords) 245 | logger.info(f"Total Search Pairs: {total_search_pairs}") 246 | 247 | total_processed_search, total_detection_writes = 0, 0 248 | search_query_list = [] 249 | # Format GitHub Search Query List 250 | search_query_list = format_search_query_list(configs.secondary_keywords) 251 | logger.info(f"Total No.of search queries: {len(search_query_list)}") 252 | 253 | # Loop over each search query 254 | for search_query in search_query_list: 255 | detection_writes_per_query = 0 256 | new_results_per_query = 0 257 | logger.info(f"******* Processing Search Query: {search_query} *******") 258 | try: 259 | # Search GitHub and return search response confidence_score 260 | total_processed_search += 1 261 | time.sleep(2) 262 | search_response_lines = githubCalls.run_github_search( 263 | search_query, 264 | "", 265 | org, 266 | repo, 267 | ) 268 | # If search has detections, process the result urls else continue next search 269 | if search_response_lines: 270 | ( 271 | detection_writes_per_query, 272 | new_results_per_query, 273 | detections_per_query, 274 | ) = process_search_results( 275 | search_response_lines, 276 | search_query, 277 | ) 278 | logger.info( 279 | f"Detection writes in current search query: {detection_writes_per_query}" 280 | ) 281 | total_detection_writes += detection_writes_per_query 282 | else: 283 | logger.info( 284 | f"Search '{search_query}' returns no results. Continuing..." 
285 | ) 286 | continue 287 | except Exception as e: 288 | logger.error(f"Process Error: {e}") 289 | logger.info(f"Current Total Processed Search: {total_processed_search}") 290 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 291 | if new_results_per_query >= 0: 292 | logger.info( 293 | f"Total: {total_search_pairs} " + f"Processed: {total_processed_search} " 294 | ) 295 | return True 296 | 297 | 298 | def setup_logger(log_level=10, console_logging=True): 299 | """ 300 | Call logger create module and setup the logger for current run 301 | params: log_level - int - optional - Default - 20 - INFO 302 | params: console_logging - Boolean - optional - Enable console logging - default True 303 | """ 304 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 305 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 306 | global logger 307 | # Creates a logger 308 | logger = create_logger( 309 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 310 | ) 311 | 312 | 313 | def arg_parser(): 314 | """ 315 | Parse the command line Arguments and return the values 316 | params: None 317 | returns: public_keywords - list 318 | returns: org - list 319 | returns: repo - list 320 | returns: log_level - int - Default - 20 - INFO 321 | returns: console_logging - Boolean - Default - True 322 | """ 323 | global file_prefix 324 | global ml_prediction 325 | global unmask_secret 326 | argparser = argparse.ArgumentParser() 327 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 328 | log_level_choices = [10, 20, 30, 40, 50] 329 | argparser.add_argument( 330 | "-p", 331 | "--public_keywords", 332 | metavar="public keywords", 333 | action="store", 334 | type=str, 335 | default="", 336 | help="Pass the Primary Keywords list as comma separated string", 337 | ) 338 | argparser.add_argument( 339 | "-o", 340 | "--org", 341 | metavar="Owner", 342 | action="store", 343 | type=str, 344 | default="", 345 | help="Pass the Org name list as comma separated string", 346 | ) 347 | argparser.add_argument( 348 | "-r", 349 | "--repo", 350 | metavar="Repo", 351 | action="store", 352 | type=str, 353 | default="", 354 | help="Pass the repo name list as comma separated string", 355 | ) 356 | argparser.add_argument( 357 | "-l", 358 | "--log_level", 359 | metavar="Logger Level", 360 | action="store", 361 | type=int, 362 | default=20, 363 | choices=log_level_choices, 364 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 365 | ) 366 | argparser.add_argument( 367 | "-c", 368 | "--console_logging", 369 | metavar="Console Logging", 370 | action="store", 371 | type=str, 372 | default="Yes", 373 | choices=flag_choices, 374 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 375 | ) 376 | args = argparser.parse_args() 377 | if args.public_keywords: 378 | public_keywords = args.public_keywords.split(",") 379 | else: 380 | public_keywords = [] 381 | if args.org: 382 | org = args.org.split(",") 383 | else: 384 | org = [] 385 | if args.repo: 386 | if len(org) <= 0: 387 | repo = args.repo.split(",") 388 | else: 389 | repo = [] 390 | else: 391 | repo = [] 392 | if args.log_level in log_level_choices: 393 | log_level = args.log_level 394 | else: 395 | log_level = 20 396 | if args.console_logging.lower() in flag_choices[:5]: 397 | console_logging = True 398 | else: 399 | console_logging = False 400 | return ( 401 | public_keywords, 402 | org, 403 | repo, 404 | log_level, 405 | console_logging, 406 | ) 407 | 408 | 409 | if __name__ == "__main__": 410 | # Argument Parsing 411 | ( 412 | public_keywords, 413 | org, 414 | repo, 415 | log_level, 416 | console_logging, 417 | ) = arg_parser() 418 | 419 | # Setting up Logger 420 | setup_logger(log_level, console_logging) 421 | logger.info("xGitGuard Custom keyword search Process Started") 422 | 423 | # Read and Setup Global Configuration Data to reference in all process 424 | configs = ConfigsData() 425 | githubCalls = GithubCalls( 426 | configs.xgg_configs["github"]["public_api_url"], 427 | "public", 428 | configs.xgg_configs["github"]["public_commits_url"], 429 | configs.xgg_configs["github"]["throttle_time"], 430 | ) 431 | 432 | # Check if the GitHub API token environment variable for "public" is set 433 | valid_config, token_var = check_github_token_env("public") 434 | if not valid_config: 435 | logger.error( 436 | f"GitHub API Token Environment variable '{token_var}' not set. API Search will fail/return no results. Please Setup and retry" 437 | ) 438 | sys.exit(1) 439 | run_detection(public_keywords, org, repo) 440 | logger.info("xGitGuard custom keyword search Process Completed") 441 | -------------------------------------------------------------------------------- /xgitguard/file-scanner/extension_search.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | from datetime import datetime 4 | import logging 5 | import os 6 | import sys 7 | from pathlib import Path 8 | 9 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 10 | parent_dir = os.path.dirname(MODULE_DIR) 11 | sys.path.insert(0, parent_dir) 12 | 13 | from common.configs_read import ConfigsData 14 | from common.logger import create_logger 15 | 16 | logger = logging.getLogger("xgg_logger") 17 | new_search = 0 18 | 19 | 20 | def write_data(data): 21 | """ 22 | Write the searched data for a given extension. 23 | 24 | Args: 25 | data (str): The file path. 26 | 27 | Returns: 28 | bool: Indicates whether the operation was successful. 29 | """ 30 | global new_search 31 | try: 32 | detected_file = os.path.join( 33 | configs.output_dir, 34 | "xgg_search_files.csv", 35 | ) 36 | if new_search != 0: 37 | with open(detected_file, "a") as f: 38 | writer = csv.writer(f) 39 | writer.writerow([data]) 40 | else: 41 | with open(detected_file, "w") as f: 42 | writer = csv.writer(f) 43 | writer.writerow(["target_file_paths"]) 44 | writer.writerow([data]) 45 | new_search = 1 46 | except Exception as e: 47 | logger.error(f"Content File Write error: {e}") 48 | return False 49 | return True 50 | 51 | 52 | def find_files(extensions=[], search_path=""): 53 | """ 54 | Run search for the given directory using extensions and return file paths where these extensions are present. 
55 | 56 | Args: 57 | extensions (list): The list of file extensions to search for. 58 | search_path (str): The file or directory path. 59 | 60 | Returns: 61 | list: A list of file paths where the specified extensions are present. 62 | """ 63 | if os.path.isfile(search_path): 64 | write_data([search_path]) 65 | return True 66 | 67 | if extensions: 68 | if isinstance(extensions, list): 69 | configs.extensions = extensions 70 | else: 71 | logger.error(f"Please pass extensions in List like '['py',]'") 72 | sys.exit() 73 | else: 74 | # Get the extensions from extensions file 75 | configs.read_extensions(file_name="extensions.csv") 76 | logger.info(f"Total Extensions: {len(configs.extensions)}") 77 | 78 | try: 79 | BASE_DIR = Path(search_path) 80 | for path in BASE_DIR.glob(r"**/*"): 81 | if path.suffix[1:] in configs.extensions: 82 | if os.path.isfile(path): 83 | write_data(path) 84 | except: 85 | logger.error(f"File search exception") 86 | 87 | 88 | def setup_logger(log_level=10, console_logging=True): 89 | """ 90 | Call the logger creation module and set up the logger for the current run. 91 | 92 | Args: 93 | log_level (int, optional): The logging level. Default is 20 (INFO). 94 | console_logging (bool, optional): Enable console logging. Default is True. 95 | """ 96 | global logger 97 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 98 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 99 | # Creates a logger 100 | logger = create_logger( 101 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 102 | ) 103 | 104 | 105 | def arg_parser(): 106 | """ 107 | Parse the command line arguments and return the values. 108 | 109 | Args: 110 | None 111 | 112 | Returns: 113 | extensions (list): The list of file extensions to search for. 114 | search_path (str): The file or directory path. 115 | log_level (int): The logging level. Default is 20 (INFO). 116 | console_logging (bool): Enable console logging. Default is True. 117 | """ 118 | argparser = argparse.ArgumentParser() 119 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 120 | log_level_choices = [10, 20, 30, 40, 50] 121 | 122 | argparser.add_argument( 123 | "-e", 124 | "--extensions", 125 | metavar="Extensions", 126 | action="store", 127 | type=str, 128 | default="", 129 | help="Pass the Extensions list as comma separated string", 130 | ) 131 | 132 | argparser.add_argument( 133 | "-p", 134 | "--search_path", 135 | metavar="Search path", 136 | action="store", 137 | type=str, 138 | default="", 139 | help="Pass the Search Path for scanner", 140 | ) 141 | 142 | argparser.add_argument( 143 | "-l", 144 | "--log_level", 145 | metavar="Logger Level", 146 | action="store", 147 | type=int, 148 | default=20, 149 | choices=log_level_choices, 150 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 151 | ) 152 | 153 | argparser.add_argument( 154 | "-c", 155 | "--console_logging", 156 | metavar="Console Logging", 157 | action="store", 158 | type=str, 159 | default="Yes", 160 | choices=flag_choices, 161 | help="Pass the Console Logging as Yes or No. 
Default is Yes", 162 | ) 163 | 164 | args = argparser.parse_args() 165 | 166 | if args.extensions: 167 | extensions = args.extensions.split(",") 168 | else: 169 | extensions = [] 170 | 171 | if args.search_path: 172 | search_path = args.search_path 173 | else: 174 | search_path = "" 175 | 176 | if args.log_level in log_level_choices: 177 | log_level = args.log_level 178 | else: 179 | log_level = 20 180 | if args.console_logging.lower() in flag_choices[:5]: 181 | console_logging = True 182 | else: 183 | console_logging = False 184 | 185 | return ( 186 | extensions, 187 | search_path, 188 | log_level, 189 | console_logging, 190 | ) 191 | 192 | 193 | if __name__ == "__main__": 194 | # Argument Parsing 195 | ( 196 | extensions, 197 | search_path, 198 | log_level, 199 | console_logging, 200 | ) = arg_parser() 201 | 202 | try: 203 | # Setting up Logger 204 | setup_logger(log_level, console_logging) 205 | 206 | logger.info("xGitGuard File Extension Process Started") 207 | # Read and Setup Global Configuration Data to reference in all process 208 | configs = ConfigsData() 209 | 210 | if search_path: 211 | find_files(extensions, search_path) 212 | else: 213 | configs.read_search_paths(file_name="xgg_search_paths.csv") 214 | search_paths = configs.search_paths 215 | if search_paths: 216 | for search_path in search_paths: 217 | find_files(extensions, search_path) 218 | else: 219 | logger.info(f"No Search paths to process from config file. Ending.") 220 | sys.exit(1) 221 | 222 | logger.info("xGitGuard File Extension Process Completed") 223 | except Exception as e: 224 | logger.error( 225 | f"xGitGuard Secret detection process encountered an exception: {e}" 226 | ) 227 | sys.exit(1) 228 | -------------------------------------------------------------------------------- /xgitguard/github-enterprise/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/github-enterprise/__init__.py -------------------------------------------------------------------------------- /xgitguard/github-public/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/github-public/__init__.py -------------------------------------------------------------------------------- /xgitguard/logs/.log_desc: -------------------------------------------------------------------------------- 1 | #directory for collecting app logs -------------------------------------------------------------------------------- /xgitguard/ml_training/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-enterprise-ml-data_collector/enterprise_key_data_collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | """ 19 | xGitGuard Enterprise GitHub Keys and Token ML Data Collection process 20 | Steps: 21 | Get Secondary Keywords and Extension file data from config path 22 | Prepare the search query list with each Secondary Keyword 23 | Loop over each extension for each search query 24 | Search GitHub and get response data 25 | Process the response urls 26 | If url is already processed in previous runs, skip the same 27 | Get the code content for the html urls 28 | Clean the code content and extract Secrets 29 | Detect the Secrets using RegEx and format Secret records 30 | Write the cleaned and detected url data 31 | calling Examples: 32 | By default the all configuration keys will be taken from config files 33 | 34 | # Run with Secondary Keywords and extensions from config files 35 | python enterprise_key_data_collector.py 36 | """ 37 | 38 | import argparse 39 | import hashlib 40 | import math 41 | import os 42 | import re 43 | import sys 44 | import time 45 | from datetime import datetime 46 | 47 | import pandas as pd 48 | from urlextract import URLExtract 49 | 50 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 51 | parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(MODULE_DIR))) 52 | sys.path.insert(0, parent_dir) 53 | 54 | from common.github_calls import GithubCalls 55 | from common.configs_read import ConfigsData 56 | from common.data_format import keys_extractor, remove_url_from_keys 57 | from common.logger import create_logger 58 | from common.ml_process import entropy_calc 59 | from utilities.common_utilities import check_github_token_env 60 | from utilities.file_utilities import write_to_csv_file 61 | 62 | 63 | def calculate_confidence(secondary_keyword, extension, secret): 64 | """ 65 | Calculates confidence scores for given Keywords 66 | params: secondary_keyword - string 67 | params: extension - string 68 | params: secret - string - Detected secret 69 | returns: confidence score 70 | """ 71 | # logger.debug("<<<< 'Current Executing Function' >>>>") 72 | try: 73 | if not configs.confidence_values.empty: 74 | pass 75 | except: 76 | configs.read_confidence_values(file_name="confidence_values.csv") 77 | 78 | try: 79 | if not configs.dictionary_words.empty: 80 | pass 81 | except: 82 | # Get the dictionary_words from dictionary words file 83 | configs.read_dictionary_words(file_name="dictionary_words.csv") 84 | logger.info( 85 | "Reading 
dictionary_words.csv file completed. Proceeding for search result processing" 86 | ) 87 | 88 | secondary_keyword_value = int( 89 | configs.confidence_values.loc[secondary_keyword]["value"] 90 | ) 91 | 92 | try: 93 | extension_value = int(configs.confidence_values.loc[extension]["value"]) 94 | except: 95 | extension = 0 96 | extension_value = 0 97 | 98 | entro = entropy_calc(list(secret)) 99 | d_match = configs.dict_words_ct * configs.dict_words_vc.transform([secret]).T 100 | 101 | return [sum([secondary_keyword_value, extension_value]), entro, d_match[0]] 102 | 103 | 104 | def format_detection(skeyword, org_url, url, code_content, secrets, skeyword_count): 105 | """ 106 | Format the secret data from the given code content and other data 107 | Format the secrets data in the required format 108 | Calculate the secrets confidence values 109 | Return the final formatted detections 110 | 111 | params: skeyword - string - Secondary Keyword 112 | params: org_url - string - github url 113 | params: url - string - github url 114 | params: code_content - list - User code content 115 | params: secrets - list - Detected secrets list 116 | params: skeyword_count - int - secondary keyword count 117 | returns: secrets_data_list - list - List of formatted detections 118 | """ 119 | logger.debug("<<<< 'Current Executing Function' >>>>") 120 | valid_secret = False 121 | secrets_data_list = [] 122 | secret_data = [] 123 | 124 | extension = org_url.split(".")[-1] 125 | user_name = org_url.split("/")[3] 126 | repo_name = org_url.split("/")[4] 127 | 128 | secret_data.insert(0, repo_name) 129 | secret_data.insert(0, user_name) 130 | secret_data.insert(0, org_url) 131 | secret_data.insert(0, extension) 132 | secret_data.insert(0, skeyword) 133 | secret_data.insert(0, "xGG_Enterprise_Key & Token") 134 | logger.debug("<<<< 'Current Executing Function calculate_confidence loop' >>>>") 135 | for secret in secrets: 136 | # Calculate confidence values for detected secrets 137 | confidence_score = calculate_confidence(skeyword, extension, secret) 138 | 139 | if confidence_score[1] > 1.5: 140 | valid_secret_row = [value for value in secret_data] 141 | secret_lines = re.findall(".*" + secret + ".*$", code_content, re.MULTILINE) 142 | code_line = secret 143 | for secret_line in secret_lines: 144 | if ( 145 | (skeyword in secret_line) 146 | and (secret_line != secret) 147 | and not ( 148 | [ 149 | element 150 | for element in ["http", "www", "uuid"] 151 | if (element in secret_line) 152 | ] 153 | ) 154 | and (secret_line.find(skeyword) < secret_line.find(secret)) 155 | ): 156 | if len(secret_line) < 300: 157 | code_line = secret_line 158 | valid_secret_row.append(secret) 159 | valid_secret = True 160 | break 161 | if valid_secret: 162 | valid_secret_row.append(code_line) 163 | valid_secret_row.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 164 | valid_secret_row.append(confidence_score[0]) 165 | count_score = math.log2(50) / (math.log2(skeyword_count + 1) + 1) 166 | valid_secret_row.append(count_score) 167 | valid_secret_row.append(confidence_score[1]) 168 | d_match = math.log2(100) / (math.log2(confidence_score[2] + 1) + 1) 169 | valid_secret_row.append(d_match) 170 | valid_secret_row.append( 171 | confidence_score[0] + confidence_score[1] + count_score + d_match 172 | ) 173 | now = datetime.now() 174 | valid_secret_row.append(now.year) 175 | valid_secret_row.append(now.month) 176 | valid_secret_row.append(now.day) 177 | valid_secret_row.append(now.hour) 178 | secrets_data_list.append(valid_secret_row) 179 | 
valid_secret = False 180 | logger.debug(f"Current formatted secrets_data_list count: {len(secrets_data_list)}") 181 | # logger.debug(f"secrets_data_list: {secrets_data_list}") 182 | return secrets_data_list 183 | 184 | 185 | def process_search_urls(org_urls_list, url_list, search_query): 186 | """ 187 | Process the search html urls as below 188 | Get code content from GitHub for the html url 189 | Remove Url data from code content 190 | Extract secret values using regex 191 | Format the secrets detected 192 | Return the secrets detected 193 | 194 | params: org_urls_list - list - list of original html urls 195 | params: url_list - list - list of content api urls to get code content 196 | params: search_query - string 197 | returns: secrets_data_list - list - Detected secrets data 198 | """ 199 | logger.debug("<<<< 'Current Executing Function' >>>>") 200 | # Processes search findings 201 | skeyword = search_query.split('"')[1].strip() 202 | secrets_data_list = [] 203 | extractor = URLExtract() 204 | try: 205 | for url in url_list: 206 | header = configs.xgg_configs["github"]["enterprise_header"] 207 | code_content_response = githubCalls.enterprise_url_content_get(url, header) 208 | if code_content_response: 209 | code_content = code_content_response.text 210 | else: 211 | logger.debug("No response for url content get call") 212 | continue 213 | 214 | try: 215 | url_file_extension = url.split(".")[-1] 216 | url_counts = extractor.find_urls(code_content) 217 | if len(url_counts) > 30 or url_file_extension == "md": 218 | logger.debug( 219 | f"Skipping URL extraction from code content as url count is beyond 30 or file is markdown: {len(url_counts)}" 220 | ) 221 | continue 222 | except Exception as e: 223 | logger.debug( 224 | "Skipping URL extraction from code content due to extraction error/limits" 225 | ) 226 | continue 227 | 228 | lines = code_content.split("\n") 229 | if len(lines) <= 2: 230 | logger.debug( 231 | f"Skipping processing as code content has fewer than 3 lines: {len(lines)}" 232 | ) 233 | continue 234 | 235 | code_contents = remove_url_from_keys(code_content) 236 | secrets_data = keys_extractor(code_contents) 237 | 238 | skeyword_count = code_content.lower().count(skeyword.lower()) 239 | 240 | if len(secrets_data) >= 1 and len(secrets_data) <= 20: 241 | org_url = org_urls_list[url_list.index(url)] 242 | secret_data_list = format_detection( 243 | skeyword, org_url, url, code_content, secrets_data, skeyword_count 244 | ) 245 | if secret_data_list: 246 | for secret_data in secret_data_list: 247 | secrets_data_list.append(secret_data) 248 | else: 249 | logger.debug( 250 | f"Skipping secrets_data as length is not between 1 and 20. 
Length: {len(secrets_data)}" 251 | ) 252 | except Exception as e: 253 | logger.error(f"Total Process Search (Exception Error): {e}") 254 | return secrets_data_list 255 | 256 | 257 | def check_existing_detections(org_url_list, url_list, search_query): 258 | """ 259 | Check whether the current urls were processed in previous runs 260 | for each url in url list 261 | create hex hash value for the url 262 | check the url hash in previous detected urls 263 | if not present add them to further process 264 | skip if it's already present in detected urls 265 | 266 | params: org_url_list - List - List of original result urls 267 | params: url_list - List - List of search result urls 268 | params: search_query - String - Search query string 269 | 270 | returns: new_org_url_list - List - New original url list 271 | returns: new_urls_list - List - New url list 272 | returns: new_hashed_urls - List - New Url Hash detected 273 | """ 274 | logger.debug("<<<< 'Current Executing Function' >>>>") 275 | 276 | new_org_url_list, new_urls_list, new_hashed_urls = [], [], [] 277 | 278 | # Get the Already predicted hashed url list if present 279 | try: 280 | # Read training data only one time 281 | if configs.hashed_urls: 282 | pass 283 | except: 284 | configs.read_hashed_url(file_name="train_enterprise_hashed_url_keys.csv") 285 | 286 | if url_list: 287 | for url in url_list: 288 | url_to_hash = url + search_query 289 | hashed_url = hashlib.md5(url_to_hash.encode()).hexdigest() 290 | new_hashed_url = [] 291 | if hashed_url not in configs.hashed_urls: 292 | new_org_url_list.append(org_url_list[url_list.index(url)]) 293 | new_urls_list.append(url) 294 | new_hashed_url.append(hashed_url) 295 | new_hashed_url.append(url) 296 | if new_hashed_url: 297 | new_hashed_urls.append(new_hashed_url) 298 | return new_org_url_list, new_urls_list, new_hashed_urls 299 | 300 | 301 | def process_search_results(search_response_lines, search_query): 302 | """ 303 | For each search response item, process as below 304 | Get the html urls from the search response 305 | Check if the current url is already processed 306 | if not processed, continue processing, 
else skip the url and proceed 307 | Get the user code content for the html url 308 | Format and clean the code content 309 | Find the secrets 310 | Format and write data 311 | Write the hashed urls to file 312 | 313 | params: search_response_lines - list 314 | params: search_query - string 315 | 316 | returns: detection_writes_per_query - int - Total detections written to file 317 | returns: new_results_per_query - int - No of new urls per query 318 | returns: detections_per_query - int - No of detections per search 319 | """ 320 | logger.debug("<<<< 'Current Executing Function' >>>>") 321 | detection_writes_per_query = 0 322 | new_results_per_query = 0 323 | detections_per_query = 0 324 | new_hashed_urls = [] 325 | 326 | url_list, org_url_list = [], [] 327 | 328 | hashed_urls_file = os.path.join( 329 | configs.output_dir, "train_enterprise_hashed_url_keys.csv" 330 | ) 331 | for line in search_response_lines: 332 | html_url = line["html_url"] 333 | org_url_list.append(html_url) 334 | html_url = ( 335 | configs.xgg_configs["github"]["enterprise_pre_url"] 336 | + line["repository"]["full_name"] 337 | + "/contents/" 338 | + line["path"] 339 | ) 340 | url_list.append(html_url) 341 | 342 | if url_list: 343 | # Check if current url is processed in previous runs 344 | new_org_urls_list, new_urls_list, new_hashed_urls = check_existing_detections( 345 | org_url_list, url_list, search_query 346 | ) 347 | new_results_per_query = len(new_urls_list) 348 | if new_hashed_urls: 349 | secrets_detected = process_search_urls( 350 | new_org_urls_list, new_urls_list, search_query 351 | ) 352 | detections_per_query += len(secrets_detected) 353 | if secrets_detected: 354 | try: 355 | logger.debug( 356 | f"Current secrets_detected count: {len(secrets_detected)}" 357 | ) 358 | # logger.debug(f"secrets_detected: {secrets_detected}") 359 | secrets_detected_df = pd.DataFrame( 360 | secrets_detected, 361 | columns=configs.xgg_configs["secrets"][ 362 | "enterprise_data_collector_columns" 363 | ], 364 | ) 365 | except Exception as e: 366 | logger.error( 367 | f"secrets_detected Dataframe creation failed. Error: {e}" 368 | ) 369 | secrets_detected_df = pd.DataFrame( 370 | columns=configs.xgg_configs["secrets"][ 371 | "enterprise_data_collector_columns" 372 | ], 373 | ) 374 | if not secrets_detected_df.empty: 375 | secrets_detected_df = secrets_detected_df[ 376 | [ 377 | "Secret", 378 | "Second_Key", 379 | "Extension", 380 | "Code", 381 | "Key_Weight", 382 | "SKey_Count", 383 | "Entropy", 384 | "Dictionary_Similarity", 385 | "Score", 386 | ] 387 | ] 388 | secrets_detected_df["Label"] = 1 389 | if not secrets_detected_df.empty: 390 | detection_writes_per_query += secrets_detected_df.shape[0] 391 | logger.debug( 392 | f"Current secrets_detected_df count: {secrets_detected_df.shape[0]}" 393 | ) 394 | try: 395 | secrets_detected_file = os.path.join( 396 | configs.output_dir, "key_train_source.csv" 397 | ) 398 | write_to_csv_file( 399 | secrets_detected_df, secrets_detected_file 400 | ) 401 | except Exception as e: 402 | logger.error(f"Process Error: {e}") 403 | 404 | else: 405 | logger.debug( 406 | "secrets_detected_df is empty. So skipping collection." 
407 | ) 408 | else: 409 | logger.info("No Secrets in current search results") 410 | 411 | try: 412 | new_hashed_urls_df = pd.DataFrame( 413 | new_hashed_urls, columns=["hashed_url", "url"] 414 | ) 415 | write_to_csv_file(new_hashed_urls_df, hashed_urls_file) 416 | except Exception as e: 417 | logger.error(f"File Write error: {e}") 418 | sys.exit(1) 419 | else: 420 | logger.info( 421 | f"All {len(url_list)} urls in current search are already processed and hashed" 422 | ) 423 | else: 424 | logger.info("No valid html urls in the current search results to process.") 425 | return detection_writes_per_query, new_results_per_query, detections_per_query 426 | 427 | 428 | def format_search_query_list(secondary_keywords): 429 | """ 430 | Create the search query list using Secondary Keywords 431 | params: secondary_keywords - list 432 | returns: search_query_list - list 433 | """ 434 | logger.debug("<<<< 'Current Executing Function' >>>>") 435 | search_query_list = [] 436 | # Format GitHub Search Query 437 | for secondary_keyword in secondary_keywords: 438 | search_query_list.append('"' + secondary_keyword + '"') 439 | logger.info(f"Total search_query_list count: {len(search_query_list)}") 440 | return search_query_list 441 | 442 | 443 | def run_data_collector(secondary_keywords=[], extensions=[]): 444 | """ 445 | Run GitHub detections 446 | Run search with Secondary Keywords and extension combination 447 | Steps: 448 | Get Secondary Keywords and Extension file data from config path 449 | Prepare the search query list with each Secondary Keyword 450 | Loop over each extension for each search query 451 | Search GitHub and get response data 452 | Process the response urls 453 | If url is already processed in previous runs, skip the same 454 | Get the code content for the html urls 455 | Clean the code content and extract secrets 456 | Detect the secrets using RegEx and format secret records 457 | Write the cleaned and detected secret data 458 | 459 | params: secondary_keywords - list - optional 460 | params: extensions - list - optional 461 | returns: True or False 462 | 463 | Examples: 464 | Run for Data collection for preparing model Features 465 | run_data_collector() 466 | 467 | Run for given Secondary Keyword and extension 468 | run_data_collector(secondary_keywords=["auth"], extensions=["py"]) 469 | 470 | Run with Secondary Keywords from the config file and a given list of extensions 471 | run_data_collector(extensions=["py", "txt"]) 472 | """ 473 | logger.debug("<<<< 'Current Executing Function' >>>>") 474 | 475 | if secondary_keywords: 476 | if isinstance(secondary_keywords, list): 477 | configs.secondary_keywords = secondary_keywords 478 | else: 479 | logger.error("Please pass secondary_keywords as a list like ['token']") 480 | sys.exit(1) 481 | else: 482 | # Get the secondary_keywords from secondary_keywords file 483 | configs.read_secondary_keywords(file_name="secondary_keys.csv") 484 | logger.info(f"Total Secondary Keywords: {len(configs.secondary_keywords)}") 485 | 486 | if extensions: 487 | if isinstance(extensions, list): 488 | configs.extensions = extensions 489 | else: 490 | logger.error("Please pass extensions as a list like ['py']") 491 | sys.exit(1) 492 | else: 493 | # Get the extensions from extensions file 494 | configs.read_extensions(file_name="extensions.csv") 495 | logger.info(f"Total Extensions: {len(configs.extensions)}") 496 | 497 | total_search_pairs = len(configs.secondary_keywords) * len(configs.extensions) 498 | logger.info(f"Total 
Search Pairs: {total_search_pairs}") 499 | 500 | total_processed_search, total_detection_writes = 0, 0 501 | search_query_list = [] 502 | # Format GitHub Search Query List 503 | search_query_list = format_search_query_list(configs.secondary_keywords) 504 | if not search_query_list: 505 | logger.info(f"No Search query to process. Ending.") 506 | sys.exit(1) 507 | 508 | # Loop over each extension for each search query 509 | for extension in configs.extensions: 510 | for search_query in search_query_list: 511 | detection_writes_per_query = 0 512 | new_results_per_query = 0 513 | detections_per_query = 0 514 | logger.info( 515 | f"******* Processing Search Query: '{search_query} extension:{extension}' *******" 516 | ) 517 | try: 518 | # Search GitHub and return search response confidence_score 519 | total_processed_search += 1 520 | # time.sleep(2) 521 | search_response_lines = githubCalls.run_github_search( 522 | search_query, 523 | extension, 524 | ) 525 | # If search has detections, process the result urls else continue next search 526 | if search_response_lines: 527 | ( 528 | detection_writes_per_query, 529 | new_results_per_query, 530 | detections_per_query, 531 | ) = process_search_results(search_response_lines, search_query) 532 | logger.info( 533 | f"Detection writes in current search query: {detection_writes_per_query}" 534 | ) 535 | total_detection_writes += detection_writes_per_query 536 | else: 537 | # time.sleep(2) 538 | logger.info( 539 | f"Search '{search_query}' returns no results. Continuing..." 540 | ) 541 | continue 542 | except Exception as e: 543 | logger.error(f"Process Error: {e}") 544 | logger.info(f"Current Total Processed Search: {total_processed_search}") 545 | logger.info(f"Current Total Detections Write: {total_detection_writes}") 546 | 547 | if new_results_per_query >= 0: 548 | logger.info( 549 | f"Total: {total_search_pairs} " 550 | + f"Processed: {total_processed_search} " 551 | + f"Detected: {detections_per_query} " 552 | + f"Total Writes: {detection_writes_per_query} " 553 | + f"Count URL: {new_results_per_query}" 554 | ) 555 | 556 | logger.info(f"Total Processed Search: {total_processed_search}") 557 | logger.info(f"Total Detections Write: {total_detection_writes}") 558 | return True 559 | 560 | 561 | def setup_logger(log_level=10, console_logging=True): 562 | """ 563 | Call logger create module and setup the logger for current run 564 | params: log_level - int - optional - Default - 20 - INFO 565 | params: console_logging - Boolean - optional - Enable console logging - default True 566 | """ 567 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 568 | log_file_name = f"{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 569 | global logger 570 | # Creates a logger 571 | logger = create_logger( 572 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 573 | ) 574 | 575 | 576 | def arg_parser(): 577 | """ 578 | Parse the command line Arguments and return the values 579 | params: None 580 | returns: secondary_keywords - list 581 | returns: extensions - list 582 | returns: log_level - int - Default - 20 - INFO 583 | returns: console_logging - Boolean - Default - True 584 | """ 585 | 586 | argparser = argparse.ArgumentParser() 587 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 588 | log_level_choices = [10, 20, 30, 40, 50] 589 | argparser.add_argument( 590 | "-s", 591 | "--secondary_keywords", 592 | metavar="Secondary Keywords", 593 | 
action="store", 594 | type=str, 595 | default="", 596 | help="Pass the Secondary Keywords list as comma separated string", 597 | ) 598 | argparser.add_argument( 599 | "-e", 600 | "--extensions", 601 | metavar="Extensions", 602 | action="store", 603 | type=str, 604 | default="", 605 | help="Pass the Extensions list as comma separated string", 606 | ) 607 | 608 | argparser.add_argument( 609 | "-l", 610 | "--log_level", 611 | metavar="Logger Level", 612 | action="store", 613 | type=int, 614 | default=20, 615 | choices=log_level_choices, 616 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. Default is 20", 617 | ) 618 | 619 | argparser.add_argument( 620 | "-c", 621 | "--console_logging", 622 | metavar="Console Logging", 623 | action="store", 624 | type=str, 625 | default="Yes", 626 | choices=flag_choices, 627 | help="Pass the Console Logging as Yes or No. Default is Yes", 628 | ) 629 | 630 | args = argparser.parse_args() 631 | 632 | if args.secondary_keywords: 633 | secondary_keywords = args.secondary_keywords.split(",") 634 | else: 635 | secondary_keywords = [] 636 | if args.extensions: 637 | extensions = args.extensions.split(",") 638 | else: 639 | extensions = [] 640 | 641 | if args.log_level in log_level_choices: 642 | log_level = args.log_level 643 | else: 644 | log_level = 20 645 | if args.console_logging.lower() in flag_choices[:5]: 646 | console_logging = True 647 | else: 648 | console_logging = False 649 | 650 | return secondary_keywords, extensions, log_level, console_logging 651 | 652 | 653 | if __name__ == "__main__": 654 | # Argument Parsing 655 | ( 656 | secondary_keywords, 657 | extensions, 658 | log_level, 659 | console_logging, 660 | ) = arg_parser() 661 | 662 | # Setting up Logger 663 | setup_logger(log_level, console_logging) 664 | 665 | logger.info("xGitGuard Enterprise Keys and Token Data Collection Process Started") 666 | 667 | configs = ConfigsData() 668 | githubCalls = GithubCalls( 669 | configs.xgg_configs["github"]["enterprise_api_url"], 670 | "enterprise", 671 | configs.xgg_configs["github"]["enterprise_commits_url"], 672 | ) 673 | 674 | valid_config, token_var = check_github_token_env("enterprise") 675 | if not valid_config: 676 | logger.error( 677 | f"GitHub API Token Environment variable '{token_var}' is not set. API Search will fail/return no results. Please Setup and retry" 678 | ) 679 | sys.exit(1) 680 | 681 | run_data_collector(secondary_keywords, extensions) 682 | 683 | logger.info("xGitGuard Enterprise Keys and Token Data Collection Process Completed") 684 | -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_data-collector/github-public-ml-data_collector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/ml_training/ml_data-collector/github-public-ml-data_collector/__init__.py -------------------------------------------------------------------------------- /xgitguard/ml_training/ml_feature_engineering.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | 19 | import argparse 20 | import logging 21 | import os 22 | import sys 23 | from datetime import datetime 24 | 25 | import pandas as pd 26 | 27 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 28 | parent_dir = os.path.dirname(MODULE_DIR) 29 | sys.path.append(parent_dir) 30 | 31 | from common.logger import create_logger 32 | from utilities.common_utilities import is_num_present, is_uppercase_present 33 | from utilities.file_utilities import read_csv_file, write_to_csv_file 34 | 35 | logger = logging.getLogger("xgg_logger") 36 | 37 | 38 | def get_training_data(file_name): 39 | """ 40 | Read the given training data file or default training data file and return the training data 41 | params: file_name - string - Training data file name 42 | returns: training_data - Dataframe 43 | """ 44 | logger.debug("<<<< 'Current Executing Function' >>>>") 45 | if file_name: 46 | output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | training_data_file = os.path.join(output_dir, file_name) 50 | if os.path.exists(training_data_file): 51 | logger.debug(f"Reading Training data from file: {training_data_file}") 52 | training_data = read_csv_file(training_data_file, output="dataframe") 53 | else: 54 | logger.error( 55 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 56 | ) 57 | raise Exception( 58 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 59 | ) 60 | else: 61 | logger.error( 62 | "Training data file is not given. Please pass the input training Data file" 63 | ) 64 | raise Exception( 65 | "Training data file is not given. Please pass the input training Data file" 66 | ) 67 | 68 | return training_data 69 | 70 | 71 | def xgg_engineer_model(training_source_data_file, training_data_file=""): 72 | """ 73 | Get clean data and engineer the model features. 
74 | params: training_source_data_file - string - file path 75 | params: training_data_file - string - file path - optional 76 | returns: None 77 | """ 78 | logger.debug("<<<< 'Current Executing Function' >>>>") 79 | logger.info("xGitGuard Feature Engineering started") 80 | train_data = get_training_data(training_source_data_file) 81 | train_data["Len_Key"] = train_data.apply(lambda x: len(x["Secret"]), axis=1) 82 | train_data["Len_Code"] = train_data.apply(lambda x: len(x["Code"]), axis=1) 83 | train_data["Has_Digit"] = train_data.apply( 84 | lambda x: is_num_present(x["Secret"]), axis=1 85 | ) 86 | train_data["Has_Cap"] = train_data.apply( 87 | lambda x: is_uppercase_present(x["Secret"]), axis=1 88 | ) 89 | train_data = train_data.drop(["Secret", "Code"], axis=1) 90 | 91 | train_data = pd.get_dummies(train_data) 92 | if not train_data.empty: 93 | try: 94 | output_dir = os.path.abspath( 95 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 96 | ) 97 | training_src_file = os.path.join(output_dir, training_data_file) 98 | write_to_csv_file(train_data, training_src_file, write_mode="overwrite") 99 | except Exception as e: 100 | logger.error(f"Process Error: {e}") 101 | else: 102 | logger.error(f"Empty Training source data") 103 | logger.info("xGitGuard Feature Engineering Ended") 104 | 105 | 106 | def setup_logger(run_mode="training", log_level=10, console_logging=True): 107 | """ 108 | Call logger create module and setup the logger for current run 109 | params: run_mode - str - optional - Default - training 110 | params: log_level - int - optional - Default - 20 - INFO 111 | params: console_logging - Boolean - optional - Enable console logging - default True 112 | """ 113 | 114 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 115 | log_file_name = f"{run_mode}_{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 116 | 117 | global logger 118 | # Creates a logger 119 | logger = create_logger( 120 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 121 | ) 122 | 123 | 124 | def arg_parser(): 125 | """ 126 | Parse the command line Arguments and return the values 127 | params: None 128 | returns: data_type - string 129 | returns: source_data - string - Default - enterprise 130 | returns: log_level - int - Default - 20 - INFO 131 | returns: console_logging - Boolean - Default - True 132 | """ 133 | 134 | argparser = argparse.ArgumentParser() 135 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 136 | log_level_choices = [10, 20, 30, 40, 50] 137 | argparser.add_argument( 138 | "data_type", 139 | metavar="Data_Type", 140 | action="store", 141 | type=str, 142 | choices=["key", "cred"], 143 | help="Pass the Data_Type as cred or key", 144 | ) 145 | argparser.add_argument( 146 | "-s", 147 | "--source_data", 148 | metavar="Source Data", 149 | action="store", 150 | type=str, 151 | default="enterprise", 152 | choices=["enterprise", "public"], 153 | help="Pass the source of data as public or enterprise. Default is enterprise", 154 | ) 155 | 156 | argparser.add_argument( 157 | "-l", 158 | "--log_level", 159 | metavar="Logger Level", 160 | action="store", 161 | type=int, 162 | default=20, 163 | choices=log_level_choices, 164 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. 
Default is 20", 165 | ) 166 | argparser.add_argument( 167 | "-c", 168 | "--console_logging", 169 | metavar="Console Logging", 170 | action="store", 171 | type=str, 172 | default="Yes", 173 | choices=flag_choices, 174 | help="Pass the Console Logging as Yes or No. Default is Yes", 175 | ) 176 | 177 | args = argparser.parse_args() 178 | 179 | if args.data_type: 180 | data_type = args.data_type.lower() 181 | else: 182 | logger.error(f"No Data Type is passed in comand line.") 183 | sys.exit(1) 184 | 185 | if args.source_data: 186 | source_data = args.source_data.lower() 187 | else: 188 | logger.error(f"No Source Data is passed in command line.") 189 | sys.exit(1) 190 | 191 | if args.log_level in log_level_choices: 192 | log_level = args.log_level 193 | else: 194 | log_level = 20 195 | if args.console_logging.lower() in flag_choices[:5]: 196 | console_logging = True 197 | else: 198 | console_logging = False 199 | 200 | return data_type, source_data, log_level, console_logging 201 | 202 | 203 | if __name__ == "__main__": 204 | 205 | ( 206 | data_type, 207 | source_data, 208 | log_level, 209 | console_logging, 210 | ) = arg_parser() 211 | 212 | run_mode = source_data + "_" + data_type 213 | setup_logger(run_mode, log_level, console_logging) 214 | 215 | logger.info(f"{run_mode.upper()} Feature Engineering process Started") 216 | output_dir = os.path.abspath( 217 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 218 | ) 219 | training_source_data_file, training_data_file = "", "" 220 | if source_data == "public": 221 | if data_type == "cred": 222 | 223 | public_cred_src_file = os.path.join( 224 | output_dir, "public_cred_train_source.csv" 225 | ) 226 | if os.path.exists(public_cred_src_file): 227 | logger.info( 228 | "Using public cred source data to engineer for public model" 229 | ) 230 | training_source_data_file = "public_cred_train_source.csv" 231 | training_data_file = "public_cred_train.csv" 232 | 233 | else: 234 | logger.error( 235 | f"Cred Training source data file for engineering not found" 236 | ) 237 | elif data_type == "key": 238 | public_key_src_file = os.path.join( 239 | output_dir, "public_key_train_source.csv" 240 | ) 241 | if os.path.exists(public_key_src_file): 242 | logger.info("Using public key source data to engineer for public model") 243 | 244 | training_source_data_file = "public_key_train_source.csv" 245 | training_data_file = "public_key_train.csv" 246 | else: 247 | logger.error(f"Key Training source data file for engineering not found") 248 | else: 249 | if data_type == "cred": 250 | logger.info( 251 | "Using enterprise cred source data to engineer for enterprise model" 252 | ) 253 | training_source_data_file = "cred_train_source.csv" 254 | training_data_file = "cred_train.csv" 255 | 256 | elif data_type == "key": 257 | logger.info( 258 | "Using enterprise key source data to engineer for enterprise model" 259 | ) 260 | training_source_data_file = "key_train_source.csv" 261 | training_data_file = "key_train.csv" 262 | 263 | if training_source_data_file and training_data_file: 264 | xgg_engineer_model( 265 | training_source_data_file=training_source_data_file, 266 | training_data_file=training_data_file, 267 | ) 268 | -------------------------------------------------------------------------------- /xgitguard/ml_training/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may 
not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | import argparse 19 | import logging 20 | import os 21 | import sys 22 | from datetime import datetime 23 | 24 | from sklearn import metrics 25 | from sklearn.ensemble import RandomForestClassifier 26 | from sklearn.model_selection import train_test_split 27 | 28 | MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) 29 | parent_dir = os.path.dirname(MODULE_DIR) 30 | sys.path.append(parent_dir) 31 | 32 | from common.logger import create_logger 33 | from utilities.file_utilities import read_csv_file, write_pickle_file 34 | 35 | logger = logging.getLogger("xgg_logger") 36 | 37 | 38 | def get_training_data(file_name): 39 | """ 40 | Read the given training data file or default training data file and return the training data 41 | params: file_name - string - Training data file name 42 | returns: training_data - Dataframe 43 | """ 44 | logger.debug("<<<< 'Current Executing Function' >>>>") 45 | if file_name: 46 | output_dir = os.path.abspath( 47 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 48 | ) 49 | training_data_file = os.path.join(output_dir, file_name) 50 | if os.path.exists(training_data_file): 51 | logger.debug(f"Reading Training data from file: {training_data_file}") 52 | training_data = read_csv_file(training_data_file, output="dataframe") 53 | else: 54 | logger.error( 55 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 56 | ) 57 | raise Exception( 58 | f"Training_data_file given is not present. Please check the file path: {training_data_file}" 59 | ) 60 | else: 61 | logger.error( 62 | "Training data file is not given. Please pass the input training Data file" 63 | ) 64 | raise Exception( 65 | "Training data file is not given. Please pass the input training Data file" 66 | ) 67 | 68 | return training_data 69 | 70 | 71 | def train_and_test_model(training_data): 72 | """ 73 | Train the model with training data and test the model. 74 | params: training_data - dataframe - Training Data 75 | returns: rf - object - Trained model 76 | """ 77 | logger.debug("<<<< 'Current Executing Function' >>>>") 78 | # Get Training Data 79 | x = training_data.drop(columns="Label", axis=1) 80 | # target variable 81 | y = training_data["Label"] 82 | 83 | if training_data.shape[0] < 2: 84 | logger.error( 85 | "Collect and add more training data for model creation. Minimum 2 rows required" 86 | ) 87 | raise Exception( 88 | "Collect and add more training data for model creation. 
Minimum 2 rows required" 89 | ) 90 | 91 | x_train, x_test, y_train, y_test = train_test_split( 92 | x, y, test_size=0.3, random_state=123 93 | ) 94 | 95 | rf = RandomForestClassifier(n_estimators=500, max_depth=3) 96 | rf.fit(x_train, y_train) 97 | 98 | y_pred = rf.predict(x_test) 99 | 100 | logger.debug("Detection Validation model is trained.") 101 | logger.debug(f"Random Forest Accuracy:{metrics.accuracy_score(y_test, y_pred)}") 102 | logger.debug(f"Precision: {metrics.precision_score(y_test, y_pred)}") 103 | logger.debug(f"Recall: {metrics.recall_score(y_test, y_pred)}") 104 | logger.debug(f"F1 Score: {metrics.f1_score(y_test, y_pred)}") 105 | 106 | return rf 107 | 108 | 109 | def xgg_train_model(training_data_file, model_name=""): 110 | """ 111 | Get trainind data and Train the Model. Test and persist the model 112 | params: training_data_file - string - file path 113 | returns: None 114 | """ 115 | logger.debug("<<<< 'Current Executing Function' >>>>") 116 | logger.info("xGitGuard Model Training started") 117 | training_data = get_training_data(training_data_file) 118 | ml_model = train_and_test_model(training_data) 119 | output_dir = os.path.abspath( 120 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 121 | ) 122 | model_file = os.path.join(output_dir, model_name + "model_object.pickle") 123 | write_pickle_file(object=ml_model, object_file=model_file) 124 | logger.info("xGitGuard Model Training Ended") 125 | 126 | 127 | def setup_logger(run_mode="training", log_level=10, console_logging=True): 128 | """ 129 | Call logger create module and setup the logger for current run 130 | params: run_mode - str - optional - Default - training 131 | params: log_level - int - optional - Default - 20 - INFO 132 | params: console_logging - Boolean - optional - Enable console logging - default True 133 | """ 134 | 135 | log_dir = os.path.abspath(os.path.join(os.path.dirname(MODULE_DIR), ".", "logs")) 136 | log_file_name = f"{run_mode}_{os.path.basename(__file__).split('.')[0]}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" 137 | 138 | global logger 139 | # Creates a logger 140 | logger = create_logger( 141 | log_level, console_logging, log_dir=log_dir, log_file_name=log_file_name 142 | ) 143 | 144 | 145 | def arg_parser(): 146 | """ 147 | Parse the command line Arguments and return the values 148 | params: None 149 | returns: data_type - string 150 | returns: source_data - string - Default - enterprise 151 | returns: log_level - int - Default - 20 - INFO 152 | returns: console_logging - Boolean - Default - True 153 | """ 154 | 155 | argparser = argparse.ArgumentParser() 156 | flag_choices = ["Y", "y", "Yes", "YES", "yes", "N", "n", "No", "NO", "no"] 157 | log_level_choices = [10, 20, 30, 40, 50] 158 | argparser.add_argument( 159 | "data_type", 160 | metavar="Data_Type", 161 | action="store", 162 | type=str, 163 | choices=["key", "cred"], 164 | help="Pass the Data_Type as cred or key", 165 | ) 166 | argparser.add_argument( 167 | "-s", 168 | "--source_data", 169 | metavar="Source Data", 170 | action="store", 171 | type=str, 172 | default="enterprise", 173 | choices=["enterprise", "public"], 174 | help="Pass the source of data as public or enterprise. Default is enterprise", 175 | ) 176 | 177 | argparser.add_argument( 178 | "-l", 179 | "--log_level", 180 | metavar="Logger Level", 181 | action="store", 182 | type=int, 183 | default=20, 184 | choices=log_level_choices, 185 | help="Pass the Logging level as for CRITICAL - 50, ERROR - 40 WARNING - 30 INFO - 20 DEBUG - 10. 
Default is 20", 186 | ) 187 | argparser.add_argument( 188 | "-c", 189 | "--console_logging", 190 | metavar="Console Logging", 191 | action="store", 192 | type=str, 193 | default="Yes", 194 | choices=flag_choices, 195 | help="Pass the Console Logging as Yes or No. Default is Yes", 196 | ) 197 | 198 | args = argparser.parse_args() 199 | 200 | if args.data_type: 201 | data_type = args.data_type.lower() 202 | else: 203 | logger.error(f"No Data Type is passed in command line.") 204 | sys.exit(1) 205 | 206 | if args.source_data: 207 | source_data = args.source_data.lower() 208 | else: 209 | logger.error(f"No Source Data is passed in command line.") 210 | sys.exit(1) 211 | 212 | if args.log_level in log_level_choices: 213 | log_level = args.log_level 214 | else: 215 | log_level = 20 216 | if args.console_logging.lower() in flag_choices[:5]: 217 | console_logging = True 218 | else: 219 | console_logging = False 220 | 221 | return data_type, source_data, log_level, console_logging 222 | 223 | 224 | if __name__ == "__main__": 225 | 226 | data_type, source_data, log_level, console_logging = arg_parser() 227 | 228 | run_mode = source_data + "_" + data_type 229 | setup_logger(run_mode, log_level, console_logging) 230 | 231 | logger.info(f"{run_mode.upper()} Training Model process Started") 232 | output_dir = os.path.abspath( 233 | os.path.join(os.path.dirname(MODULE_DIR), ".", "output") 234 | ) 235 | training_data_file, model_name = "", "" 236 | if source_data == "public": 237 | if data_type == "cred": 238 | public_cred_training_data_file = os.path.join( 239 | output_dir, "public_cred_train.csv" 240 | ) 241 | enterprise_cred_training_data_file = os.path.join( 242 | output_dir, "cred_train.csv" 243 | ) 244 | if os.path.exists(public_cred_training_data_file): 245 | logger.info("Using public cred training data to train the public model") 246 | 247 | training_data_file = "public_cred_train.csv" 248 | model_name = "public_xgg_cred_rf_" 249 | 250 | elif os.path.exists(enterprise_cred_training_data_file): 251 | logger.info( 252 | "Using enterprise cred training data to train the public model" 253 | ) 254 | training_data_file = "cred_train.csv" 255 | model_name = "public_xgg_cred_rf_" 256 | else: 257 | logger.error( 258 | f"Cred Training data file not found for cred ml model creation" 259 | ) 260 | elif data_type == "key": 261 | public_key_training_data_file = os.path.join( 262 | output_dir, "public_key_train.csv" 263 | ) 264 | enterprise_key_training_data_file = os.path.join( 265 | output_dir, "key_train.csv" 266 | ) 267 | if os.path.exists(public_key_training_data_file): 268 | logger.info("Using public key training data to train the public model") 269 | training_data_file = "public_key_train.csv" 270 | model_name = "public_xgg_key_rf_" 271 | elif os.path.exists(enterprise_key_training_data_file): 272 | logger.info( 273 | "Using enterprise key training data to train the public model" 274 | ) 275 | training_data_file = "key_train.csv" 276 | model_name = "public_xgg_key_rf_" 277 | 278 | else: 279 | logger.error( 280 | f"Key Training data file not found for key ml model creation" 281 | ) 282 | else: 283 | if data_type == "cred": 284 | logger.info( 285 | "Using enterprise cred training data to train the enterprise model" 286 | ) 287 | training_data_file = "cred_train.csv" 288 | model_name = "xgg_cred_rf_" 289 | 290 | elif data_type == "key": 291 | logger.info( 292 | "Using enterprise key training data to train the enterprise model" 293 | ) 294 | training_data_file = "key_train.csv" 295 | model_name = 
"xgg_key_rf_" 296 | 297 | if training_data_file and model_name: 298 | xgg_train_model( 299 | training_data_file=training_data_file, 300 | model_name=model_name, 301 | ) 302 | logger.info("Training, Testing and Persisting the xgg Model Completed") 303 | -------------------------------------------------------------------------------- /xgitguard/output/.output: -------------------------------------------------------------------------------- 1 | #output directory -------------------------------------------------------------------------------- /xgitguard/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Comcast/xGitGuard/b784882673a2a0db3185e46ba17072b8d8370015/xgitguard/utilities/__init__.py -------------------------------------------------------------------------------- /xgitguard/utilities/common_utilities.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2021 Comcast Cable Communications Management, LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | 16 | SPDX-License-Identifier: Apache-2.0 17 | """ 18 | import re 19 | import os 20 | 21 | 22 | def check_github_token_env(token_env): 23 | """ 24 | For the Given GITHUB Type, check whether ENV and URL set properly or not 25 | ### Need GitHub Auth Token as Env variable named "GITHUB_TOKEN" for Public 26 | ### Need GitHub Auth Token as Env variable named "GITHUB_ENTERPRISE_TOKEN" for Enterprise 27 | params: token_env - string 28 | 29 | returns: (1,0), token_var 30 | """ 31 | 32 | if token_env == "public": 33 | token_var = "GITHUB_TOKEN" 34 | else: 35 | token_var = "GITHUB_ENTERPRISE_TOKEN" 36 | 37 | if os.getenv(token_var): 38 | return 1, token_var 39 | 40 | return 0, token_var 41 | 42 | 43 | def is_num_present(word): 44 | """ 45 | Check if any number present in Given String 46 | params: word - string 47 | returns: 0 or 1 48 | """ 49 | check = any(letter.isdigit() for letter in word) 50 | return 1 if check else 0 51 | 52 | 53 | def is_uppercase_present(word): 54 | """ 55 | Check if any Upper Case Letter present in Given String 56 | params: word - string 57 | returns: 0 or 1 58 | """ 59 | check = any(letter.isupper() for letter in word) 60 | return 1 if check else 0 61 | 62 | 63 | def is_special_chars_present(word): 64 | """ 65 | Check if any special characters present in Given String 66 | params: word - string 67 | returns: 0 or 1 68 | """ 69 | regex = re.compile("[@_!#$%^&*()<>?/\|}{~:]") 70 | check = regex.search(word) 71 | return 1 if check else 0 72 | 73 | 74 | def mask_data(code, secret): 75 | """ 76 | Mask the letters except first 4 chars 77 | params: code - string - full key line 78 | params: secret - string - Secret 79 | returns: masked_code - string 80 | """ 81 | try: 82 | match_group = re.search("(?<=:|=).*$", code) 83 | if match_group: 84 | match = match_group.group(0).strip() 85 | masked_code = re.sub(r"(?<=:|=).*$", "", code) 86 | if match[len(match) - 1] == '"': 87 | 
74 | def mask_data(code, secret):
75 |     """
76 |     Mask the secret in the given code line, keeping only the first 4 characters visible
77 |     params: code - string - full key line
78 |     params: secret - string - Secret
79 |     returns: masked_code - string
80 |     """
81 |     try:
82 |         match_group = re.search("(?<=:|=).*$", code)
83 |         if match_group:
84 |             match = match_group.group(0).strip()
85 |             masked_code = re.sub(r"(?<=:|=).*$", "", code)
86 |             if match[-1] == '"':
87 |                 masked_code = masked_code + match[0:4] + "#" * 10 + '"'
88 |             else:
89 |                 masked_code = masked_code + match[0:4] + "#" * 10
90 |         else:
91 |             masked_code = re.sub(secret, "##########", code)
92 |     except Exception:
93 |         masked_code = re.sub(secret, "##########", code)
94 |     return masked_code
95 | 
--------------------------------------------------------------------------------
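To make the masking behavior concrete, here is an illustrative call (not part of the module): only the first characters after the ":" or "=" separator survive, and the rest become "#".

from utilities.common_utilities import mask_data

line = 'api_key = "abcd1234efgh5678"'
print(mask_data(line, "abcd1234efgh5678"))
# Prints something like: api_key ="abc##########"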
/xgitguard/utilities/file_utilities.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright 2021 Comcast Cable Communications Management, LLC
  3 | 
  4 | Licensed under the Apache License, Version 2.0 (the "License");
  5 | you may not use this file except in compliance with the License.
  6 | You may obtain a copy of the License at
  7 | 
  8 |     http://www.apache.org/licenses/LICENSE-2.0
  9 | 
 10 | Unless required by applicable law or agreed to in writing, software
 11 | distributed under the License is distributed on an "AS IS" BASIS,
 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | See the License for the specific language governing permissions and
 14 | limitations under the License.
 15 | 
 16 | SPDX-License-Identifier: Apache-2.0
 17 | """
 18 | 
 19 | import logging
 20 | import os
 21 | import pickle
 22 | 
 23 | import pandas as pd
 24 | import yaml
 25 | 
 26 | logger = logging.getLogger("xgg_logger")
 27 | 
 28 | 
 29 | def read_text_file(file_path):
 30 |     """
 31 |     Read text file utility.
 32 | 
 33 |     This function performs the following steps:
 34 |     - Read the text file from the given path.
 35 |     - If the file is not present, return an empty list.
 36 | 
 37 |     Args:
 38 |         file_path (str): The path to the text file.
 39 | 
 40 |     Returns:
 41 |         list: The content of the file as a list of lines.
 42 |     """
 43 |     logger.debug("<<<< 'Current Executing Function' >>>>")
 44 |     if os.path.exists(file_path):
 45 |         logger.info(f"Reading text data from file path: {file_path}")
 46 |         try:
 47 |             with open(file_path, "r") as infile:
 48 |                 file_data = infile.readlines()
 49 |                 return file_data or []
 50 |         except Exception as e:
 51 |             logger.error(f"File Read Error: {e}")
 52 |             return []
 53 |     else:
 54 |         logger.warning(f"File not present at: {file_path}")
 55 |         return []
 56 | 
 57 | 
 58 | def read_yaml_file(file_path):
 59 |     """
 60 |     Read YAML file utility.
 61 | 
 62 |     This function performs the following steps:
 63 |     - Read the YAML file from the given path.
 64 |     - If the file is not present, return empty data.
 65 | 
 66 |     Args:
 67 |         file_path (str): The path to the YAML file.
 68 | 
 69 |     Returns:
 70 |         dict: The content of the YAML file as a dictionary, or an empty dictionary if the file is not present.
 71 |     """
 72 |     logger.debug("<<<< 'Current Executing Function' >>>>")
 73 |     if os.path.exists(file_path):
 74 |         logger.info(f"Reading yaml data from file path: {file_path}")
 75 |         try:
 76 |             with open(file_path, "r") as infile:
 77 |                 file_data = yaml.safe_load(infile)
 78 |                 return file_data or {}
 79 |         except Exception as e:
 80 |             logger.error(f"File Read Error: {e}")
 81 |             return {}
 82 |     else:
 83 |         logger.warning(f"File not present at: {file_path}")
 84 |         return {}
 85 | 
 86 | 
 87 | def read_csv_file(file_path, output="list", header=0):
 88 |     """
 89 |     Read CSV file utility.
 90 | 
 91 |     This function performs the following steps:
 92 |     - Read the CSV file from the given path.
 93 |     - If the file is not present, return empty data.
 94 | 
 95 |     Args:
 96 |         file_path (str): The path to the CSV file.
 97 |         output (str): The format of the output, either "dataframe" or "list". Default is "list".
 98 |         header (int, optional): The row number to use as the column names. Default is 0.
 99 |     Returns:
100 |         file_data: The content of the CSV file as a DataFrame or a list, or an empty DataFrame or list if the file is not present.
101 |     """
102 |     logger.debug("<<<< 'Current Executing Function' >>>>")
103 |     if os.path.exists(file_path):
104 |         logger.info(f"Reading CSV data from file path: {file_path}")
105 |         try:
106 |             file_dataframe = pd.read_csv(file_path, header=header)
107 |             if output == "list":
108 |                 file_data = file_dataframe.values.tolist()
109 |                 # file_data = [item for sublist in file_data for item in sublist]
110 |                 return file_data
111 |             else:
112 |                 return file_dataframe
113 |         except Exception as e:
114 |             logger.error(f"Reading CSV file Error: {e}")
115 |             file_dataframe = pd.DataFrame()
116 |             return [] if output == "list" else file_dataframe
117 |     else:
118 |         logger.warning(f"File not present at: {file_path}")
119 |         file_dataframe = pd.DataFrame()
120 |         return [] if output == "list" else file_dataframe
121 | 
122 | 
123 | def write_to_csv_file(dataframe, csv_file_path, sep=",", write_mode="append"):
124 |     """
125 |     Write to CSV file utility.
126 | 
127 |     This function performs the following steps:
128 |     - Write the DataFrame to the given path if the file is not present.
129 |     - Raise an exception if the column order and counts do not match.
130 |     - Append to the existing file if the file is already present.
131 | 
132 |     Args:
133 |         dataframe (pd.DataFrame): The Pandas DataFrame to write.
134 |         csv_file_path (str): The path to the CSV file.
135 |         sep (str, optional): The separator to use. Default is ",".
136 |         write_mode (str, optional): Either "append" or "overwrite". Default is "append".
137 |     Returns:
138 |         bool: True if the operation was successful, False otherwise.
139 |     """
140 |     logger.debug("<<<< 'Current Executing Function' >>>>")
141 |     logger.info(f"Write Called on: {csv_file_path}")
142 |     if not os.path.isfile(csv_file_path):
143 |         dataframe.to_csv(csv_file_path, mode="a", index=False, sep=sep)
144 |         return True
145 |     try:
146 |         if write_mode == "overwrite":
147 |             dataframe.to_csv(csv_file_path, mode="w", index=False, sep=sep)
148 |             return True
149 |         elif len(dataframe.columns) != len(
150 |             pd.read_csv(csv_file_path, nrows=1, sep=sep).columns
151 |         ):
152 |             logger.error(
153 |                 f"Columns do not match!! \
154 |                 Dataframe has {len(dataframe.columns)} columns. \
155 |                 CSV file has {len(pd.read_csv(csv_file_path, nrows=1, sep=sep).columns)} columns."
156 |             )
157 |             raise Exception(
158 |                 f"Columns do not match!! \
159 |                 Dataframe has {len(dataframe.columns)} columns. \
160 |                 CSV file has {len(pd.read_csv(csv_file_path, nrows=1, sep=sep).columns)} columns."
161 |             )
162 |         elif not (
163 |             dataframe.columns == pd.read_csv(csv_file_path, nrows=1, sep=sep).columns
164 |         ).all():
165 |             logger.error(
166 |                 "Columns and column order of dataframe and csv file do not match!!"
167 |             )
168 |             raise Exception(
169 |                 "Columns and column order of dataframe and csv file do not match!!"
170 |             )
171 |         else:
172 |             dataframe.to_csv(
173 |                 csv_file_path, mode="a", index=False, sep=sep, header=False
174 |             )
175 |             logger.debug("CSV file Write Successful")
176 |             return True
177 |     except pd.errors.EmptyDataError as e:
178 |         logger.error(f"CSV file is Empty. So writing like a new file. Error: {e}")
179 |         dataframe.to_csv(csv_file_path, mode="a", index=False, sep=sep, header=False)
180 |         logger.debug("CSV file Write Successful")
181 |         return True
182 | 
183 | 
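A short usage sketch for the two CSV helpers above; the path and columns are illustrative, not part of the module:

import pandas as pd

from utilities.file_utilities import read_csv_file, write_to_csv_file

detections = pd.DataFrame(
    {"url": ["https://example.com/repo/file.py"], "secret": ["abcd##########"]}
)

# The first call creates the file; later calls with matching columns append,
# and write_mode="overwrite" replaces the file instead.
write_to_csv_file(detections, "example_detections.csv")

# Read back as a list of rows; pass output="dataframe" for a DataFrame.
rows = read_csv_file("example_detections.csv", output="list")
print(rows)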
184 | def write_pickle_file(object, object_file):
185 |     """
186 |     Write the given object as a pickle file.
187 | 
188 |     Args:
189 |         object (object): The object to write.
190 |         object_file (str): The path to the pickle file.
191 | 
192 |     Returns:
193 |         bool: True if the operation was successful.
194 |     """
195 |     logger.debug("<<<< 'Current Executing Function' >>>>")
196 |     logger.info(f"Writing object as pickle file: {object_file}")
197 |     try:
198 |         with open(object_file, "wb") as out_file:
199 |             pickle.dump(object, out_file)
200 |         logger.debug(f"Given object written to file as: {object_file}")
201 |     except Exception as e:
202 |         logger.error(f"Given object Write Failed. Error: {e}")
203 |         raise Exception(f"Given object Write Failed. Error: {e}")
204 |     return True
205 | 
206 | 
207 | def read_pickle_file(object_file=""):
208 |     """
209 |     Read the pickle object file and return the object.
210 | 
211 |     Args:
212 |         object_file (str): The path to the pickle file.
213 | 
214 |     Returns:
215 |         object: The deserialized object from the pickle file.
216 |     """
217 |     logger.debug("<<<< 'Current Executing Function' >>>>")
218 |     if object_file:
219 |         logger.info(f"Reading pickle file object: {object_file}")
220 |         try:
221 |             with open(object_file, "rb") as in_file:
222 |                 object = pickle.load(in_file)
223 |         except Exception as e:
224 |             logger.error(f"Error in reading Model object: {e}")
225 |             raise Exception(f"Error in reading Model object: {e}")
226 |     else:
227 |         logger.error("No pickle object file path provided.")
228 |         raise Exception("No pickle object file path provided.")
229 |     return object
230 | 
231 | 
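These two helpers are what model.py uses to persist and reload the trained classifier. A minimal round trip, with an illustrative path:

from utilities.file_utilities import read_pickle_file, write_pickle_file

model_state = {"n_estimators": 500, "max_depth": 3}

# Persist any picklable Python object, then load it back.
write_pickle_file(object=model_state, object_file="example_object.pickle")
restored = read_pickle_file(object_file="example_object.pickle")
assert restored == model_state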
246 | """ 247 | logger.debug("<<<< 'Current Executing Function' >>>>") 248 | if os.path.exists(file_path): 249 | logger.debug(f"Reading data from file path: {file_path}") 250 | try: 251 | with open(file_path, "r") as file: 252 | if output == "list": 253 | file_data = file.readlines() 254 | else: 255 | file_data = file.read() 256 | return file_data or ([] if output == "list" else "") 257 | except Exception as e: 258 | logger.error(f"File Read Error: {e} for file:{file_path}") 259 | return [] if output == "list" else "" 260 | else: 261 | logger.warning(f"File not present in : {file_path}") 262 | return [] if output == "list" else "" 263 | -------------------------------------------------------------------------------- /xgitguard/utilities/query_length_validator.py: -------------------------------------------------------------------------------- 1 | from common.logger import create_logger 2 | 3 | 4 | def query_length_validator( 5 | search_qualifier, query, limit=170, max_search_qualifier_per_query=10 6 | ): 7 | qualifier_query = "" 8 | qualifier_query_length = 0 9 | qualifiers_in_query = 0 10 | qualifier_string = "" 11 | for qualifier in search_qualifier: 12 | if qualifier_query_length + len(qualifier) + 1 <= limit and ( 13 | max_search_qualifier_per_query is None 14 | or qualifiers_in_query < max_search_qualifier_per_query 15 | ): 16 | qualifier_query += f""" {query}:{str(qualifier)}""" 17 | qualifier_query_length += len(qualifier) + 1 18 | qualifiers_in_query += 1 19 | else: 20 | return -1 21 | 22 | if qualifier_query: 23 | qualifier_string = qualifier_query.strip() 24 | 25 | return qualifier_string 26 | --------------------------------------------------------------------------------