├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benchmarks ├── gitleaks.sh ├── gitleaks.toml └── pyrepscan_bench.py ├── cortex.yaml ├── images └── logo.png ├── pyproject.toml ├── pyrepscan ├── __init__.py ├── py.typed └── pyrepscan.pyi ├── src ├── git_repository_scanner.rs ├── lib.rs └── rules_manager.rs └── tests ├── __init__.py ├── test_git_repository_scanner.py └── test_rules_manager.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2.1.3 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - '3.7' 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Publish Package 41 | uses: messense/maturin-action@v1 42 | if: matrix.os != 'macos-latest' 43 | with: 44 | command: publish 45 | args: --username=__token__ --no-sdist --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 46 | env: 47 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 48 | - name: Publish macOS (x86_64) Package 49 | if: matrix.os == 'macos-latest' 50 | uses: PyO3/maturin-action@v1 51 | with: 52 | command: publish 53 | 
args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 54 | env: 55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 56 | - name: Publish macOS (arm64) Package 57 | if: matrix.os == 'macos-latest' 58 | uses: PyO3/maturin-action@v1 59 | with: 60 | command: publish 61 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 62 | env: 63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,rust,python 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,rust,python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | .vscode/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | 147 | # pytype static type analyzer 148 | .pytype/ 149 | 150 | # Cython debug symbols 151 | cython_debug/ 152 | 153 | # PyCharm 154 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 155 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 156 | # and can be added to the global gitignore or merged into this file. For a more nuclear 157 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 158 | #.idea/ 159 | 160 | ### Rust ### 161 | # Generated by Cargo 162 | # will have compiled files and executables 163 | debug/ 164 | 165 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 166 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 167 | Cargo.lock 168 | 169 | # These are backup files generated by rustfmt 170 | **/*.rs.bk 171 | 172 | # MSVC Windows builds of rustc generate these, which store debugging information 173 | *.pdb 174 | 175 | ### VisualStudioCode ### 176 | .vscode/* 177 | !.vscode/settings.json 178 | !.vscode/tasks.json 179 | !.vscode/launch.json 180 | !.vscode/extensions.json 181 | !.vscode/*.code-snippets 182 | 183 | # Local History for Visual Studio Code 184 | .history/ 185 | 186 | # Built Visual Studio Code Extensions 187 | *.vsix 188 | 189 | ### VisualStudioCode Patch ### 190 | # Ignore all local history of files 191 | .history 192 | .ionide 193 | 194 | # Support for Project snippet scope 195 | 196 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,rust,python 197 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pyrepscan" 3 | version = "0.12.0" 4 | authors = ["Gal Ben David "] 5 | edition = "2021" 6 | description = "A Git Repository Secrets Scanner written in Rust" 7 | readme = "README.md" 8 | repository = "https://github.com/intsights/pyrepscan" 9 | homepage = "https://github.com/intsights/pyrepscan" 10 | license = "MIT" 11 | keywords = [ 12 | "git", 13 | "secrets", 14 | "scanner", 15 | "rust", 16 | "pyo3", 17 | ] 18 | 19 | [lib] 20 | name = "pyrepscan" 21 | crate-type = ["cdylib"] 22 | 23 | [dependencies] 24 | aho-corasick = "0.7.18" 25 | chrono = "0.4.19" 26 | crossbeam = "0.8.1" 27 | crossbeam-utils = "0.8.10" 28 | parking_lot = "0.12.1" 29 | regex = "1.6.0" 30 | 31 | [dependencies.libgit2-sys] 32 | version = "0.13.4" 33 | features = ["https"] 34 | 35 | [dependencies.git2] 36 | version = "0.14.4" 37 | features = ["vendored-openssl"] 38 | 39 | [dependencies.pyo3] 40 | version = "0.16.5" 41 | features = ["extension-module"] 42 | 43 | [profile.release] 44 | lto = true 45 | panic = "abort" 46 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Gal Ben David 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | [logo image: images/logo.png] 4 | 5 |

6 | A Git Repository Secrets Scanner written in Rust 7 |

8 |

9 | 10 | ![license](https://img.shields.io/badge/MIT-License-blue) 11 | ![Python](https://img.shields.io/badge/Python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9-blue) 12 | ![Build](https://github.com/intsights/PyRepScan/workflows/Build/badge.svg) 13 | [![PyPi](https://img.shields.io/pypi/v/PyRepScan.svg)](https://pypi.org/project/PyRepScan/) 14 | 15 | ## Table of Contents 16 | 17 | - [Table of Contents](#table-of-contents) 18 | - [About The Project](#about-the-project) 19 | - [Built With](#built-with) 20 | - [Performance](#performance) 21 | - [CPU](#cpu) 22 | - [Installation](#installation) 23 | - [Documentation](#documentation) 24 | - [Usage](#usage) 25 | - [License](#license) 26 | - [Contact](#contact) 27 | 28 | 29 | ## About The Project 30 | 31 | PyRepScan is a python library written in Rust. The library uses [git2-rs](https://github.com/rust-lang/git2-rs) for repository parsing and traversing, [regex](https://github.com/rust-lang/regex) for regex pattern matching and [crossbeam](https://github.com/crossbeam-rs/crossbeam) for concurrency. The library was written to achieve high performance and python bindings. 32 | 33 | 34 | ### Built With 35 | 36 | * [git2-rs](https://github.com/rust-lang/git2-rs) 37 | * [regex](https://github.com/rust-lang/regex) 38 | * [crossbeam](https://github.com/crossbeam-rs/crossbeam) 39 | * [parking-lot](https://github.com/Amanieu/parking_lot) 40 | 41 | 42 | ### Performance 43 | 44 | #### CPU 45 | | Library | Time | Peak Memory | 46 | | ------------- | ------------- | ------------- | 47 | | [PyRepScan](https://github.com/intsights/PyRepScan) | 8.74s | 1,149,152 kb | 48 | | [gitleaks](https://github.com/zricethezav/gitleaks) | 1118s | 1,146,300 kb | 49 | 50 | 51 | ### Installation 52 | 53 | ```sh 54 | pip3 install PyRepScan 55 | ``` 56 | 57 | 58 | ## Documentation 59 | 60 | ```python 61 | class GitRepositoryScanner: 62 | def __init__( 63 | self, 64 | ) -> None 65 | ``` 66 | This class holds all the added rules for fast reuse. 67 | 68 | 69 | ```python 70 | def add_content_rule( 71 | self, 72 | name: str, 73 | pattern: str, 74 | whitelist_patterns: typing.List[str], 75 | blacklist_patterns: typing.List[str], 76 | ) -> None 77 | ``` 78 | The `add_content_rule` function adds a new rule to an internal list of rules that could be reused multiple times against different repositories. The same name can be used multiple times and would lead to results which can hold the same name. Content rule means that the regex pattern would be tested against the content of the files. 79 | - `name` - The name of the rule so it can be identified. 80 | - `pattern` - The regex pattern (Rust Regex syntax) to match against the content of the commited files. 81 | - `whitelist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter in results. Only one of the patterns should be matched to pass through the result. There is an OR relation between the patterns. 82 | - `blacklist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter out results. Only one of the patterns should be matched to omit the result. There is an OR relation between the patterns. 83 | 84 | 85 | ```python 86 | def add_file_path_rule( 87 | self, 88 | name: str, 89 | pattern: str, 90 | ) -> None 91 | ``` 92 | The `add_file_path_rule` function adds a new rule to an internal list of rules that could be reused multiple times against different repositories. 
The same name can be used multiple times and would lead to results which can hold the same name. File name rule means that the regex pattern would be tested against the file paths. 93 | - `name` - The name of the rule so it can be identified. 94 | - `pattern` - The regex pattern (Rust Regex syntax) to match against the file paths of the commited files. 95 | 96 | 97 | ```python 98 | def add_file_extension_to_skip( 99 | self, 100 | file_extension: str, 101 | ) -> None 102 | ``` 103 | The `add_file_extension_to_skip` function adds a new file extension to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan. 104 | - `file_extension` - A file extension, without a leading dot, to filter out from the scan. 105 | 106 | 107 | ```python 108 | def add_file_path_to_skip( 109 | self, 110 | file_path: str, 111 | ) -> None 112 | ``` 113 | The `add_file_path_to_skip` function adds a new file path pattern to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan. Every file path that would include the `file_path` substring would be left out of the scanned files. 114 | - `file_path` - If the inspected file path would include this substring, it won't be scanned. This parameter is a free text. 115 | 116 | 117 | ```python 118 | def scan( 119 | self, 120 | repository_path: str, 121 | branch_glob_pattern: typing.Optional[str], 122 | from_timestamp: typing.Optional[int], 123 | ) -> typing.List[typing.Dict[str, str]] 124 | ``` 125 | The `scan` function is the main function in the library. Calling this function would trigger a new scan that would return a list of matches. The scan function is a multithreaded operation, that would utilize all the available core in the system. The results would not include the file content but only the regex matching group. To retrieve the full file content one should take the `results['oid']` and to call `get_file_content` function. 126 | - `repository_path` - The git repository folder path. 127 | - `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`. 128 | - `from_timestamp` - A UTC timestamp (Int) that only commits that were created after this timestamp would be included in the scan. If None is sent, defaults to `0`. 129 | 130 | A sample result would look like this: 131 | ```python 132 | { 133 | 'rule_name': 'First Rule', 134 | 'author_email': 'author@email.email', 135 | 'author_name': 'Author Name', 136 | 'commit_id': '1111111111111111111111111111111111111111', 137 | 'commit_message': 'The commit message', 138 | 'commit_time': '2020-01-01T00:00:00e', 139 | 'file_path': 'full/file/path', 140 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a', 141 | 'match': 'The matched group', 142 | } 143 | ``` 144 | 145 | 146 | ```python 147 | def scan_from_url( 148 | self, 149 | url: str, 150 | repository_path: str, 151 | branch_glob_pattern: typing.Optional[str], 152 | from_timestamp: typing.Optional[int], 153 | ) -> typing.List[typing.Dict[str, str]] 154 | ``` 155 | The same as `scan` function but also clones a repository from a given URL into the provided repository path. 156 | - `url` - URL of a git repository. 157 | - `repository_path` - The path to clone the repository to 158 | - `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`. 159 | - `from_timestamp` - A UTC timestamp (Int) that only commits that were created after this timestamp would be included in the scan. 
If None is sent, defaults to `0`. 160 | 161 | 162 | ```python 163 | def get_file_content( 164 | self, 165 | repository_path: str, 166 | file_oid: str, 167 | ) -> bytes 168 | ``` 169 | The `get_file_content` function exists to retrieve the content of a file that was previously matched. The full file content is omitted from the results to reduce the results list size and to deliver better performance. 170 | - `repository_path` - The git repository folder path. 171 | - `file_oid` - A string representing the file oid. This parameter exists in the results dictionary returned by the `scan` function. 172 | 173 | 174 | ## Usage 175 | 176 | ```python 177 | import pyrepscan 178 | 179 | grs = pyrepscan.GitRepositoryScanner() 180 | 181 | # Adds a specific rule, can be called multiple times or none 182 | grs.add_content_rule( 183 | name='First Rule', 184 | pattern=r'(-----BEGIN PRIVATE KEY-----)', 185 | whitelist_patterns=[], 186 | blacklist_patterns=[], 187 | ) 188 | grs.add_file_path_rule( 189 | name='Second Rule', 190 | pattern=r'.+\.pem', 191 | ) 192 | grs.add_file_path_rule( 193 | name='Third Rule', 194 | pattern=r'(prod|dev|stage).+key', 195 | ) 196 | 197 | # Add file extensions to ignore during the search 198 | grs.add_file_extension_to_skip( 199 | file_extension='bin', 200 | ) 201 | grs.add_file_extension_to_skip( 202 | file_extension='jpg', 203 | ) 204 | 205 | # Add file paths to ignore during the search. Free text is allowed 206 | grs.add_file_path_to_skip( 207 | file_path='site-packages', 208 | ) 209 | grs.add_file_path_to_skip( 210 | file_path='node_modules', 211 | ) 212 | 213 | # Scans a repository 214 | results = grs.scan( 215 | repository_path='/repository/path', 216 | branch_glob_pattern='*', 217 | ) 218 | 219 | # Results is a list of dicts. Each dict is in the following format: 220 | { 221 | 'rule_name': 'First Rule', 222 | 'author_email': 'author@email.email', 223 | 'author_name': 'Author Name', 224 | 'commit_id': '1111111111111111111111111111111111111111', 225 | 'commit_message': 'The commit message', 226 | 'commit_time': '2020-01-01T00:00:00e', 227 | 'file_path': 'full/file/path', 228 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a', 229 | 'match': 'The matched group', 230 | } 231 | 232 | # Fetch the file_oid full content 233 | file_content = grs.get_file_content( 234 | repository_path='/repository/path', 235 | file_oid='47d2739ba2c34690248c8f91b84bb54e8936899a', 236 | ) 237 | 238 | # file_content 239 | b'binary data' 240 | 241 | # Creating a RulesManager directly 242 | rules_manager = pyrepscan.RulesManager() 243 | 244 | # For testing purposes, check your regexes pattern using check_pattern function 245 | rules_manager.check_pattern( 246 | content='some content1 to check, another content2 in the same line\nanother content3 in another line\n', 247 | pattern=r'(content\d)', 248 | ) 249 | 250 | # Results are the list of captured matches 251 | [ 252 | 'content1', 253 | 'content2', 254 | 'content3', 255 | ] 256 | ``` 257 | 258 | 259 | ## License 260 | 261 | Distributed under the MIT License. See `LICENSE` for more information. 
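
The Usage section above does not call `scan_from_url`; a minimal sketch of it follows, under the assumption that the URL and clone path shown are placeholders and that `from_timestamp` is supplied as a UTC epoch integer:

```python
import datetime

import pyrepscan

grs = pyrepscan.GitRepositoryScanner()

# Same rule API as in the Usage section above
grs.add_content_rule(
    name='First Rule',
    pattern=r'(-----BEGIN PRIVATE KEY-----)',
    whitelist_patterns=[],
    blacklist_patterns=[],
)

# Clone the repository from the (placeholder) URL into the given path,
# then scan only commits created after 2020-01-01 UTC
results = grs.scan_from_url(
    url='https://github.com/example/repository',
    repository_path='/path/to/clone/into',
    branch_glob_pattern='*',
    from_timestamp=int(
        datetime.datetime(
            year=2020,
            month=1,
            day=1,
            tzinfo=datetime.timezone.utc,
        ).timestamp()
    ),
)
```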
262 | 263 | 264 | ## Contact 265 | 266 | Gal Ben David - gal@intsights.com 267 | 268 | Project Link: [https://github.com/intsights/PyRepScan](https://github.com/intsights/PyRepScan) 269 | -------------------------------------------------------------------------------- /benchmarks/gitleaks.sh: -------------------------------------------------------------------------------- 1 | docker pull zricethezav/gitleaks:latest 2 | docker run -v ${FOLDER_TO_SCAN}:/path -v ${PWD}/benchmarks/gitleaks.toml:/gitleaks.toml zricethezav/gitleaks:latest detect --source="/path" --config=/gitleaks.toml 3 | -------------------------------------------------------------------------------- /benchmarks/gitleaks.toml: -------------------------------------------------------------------------------- 1 | [[rules]] 2 | description = "AWS Manager ID" 3 | regex = '''(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}''' 4 | tags = ["key", "AWS"] 5 | -------------------------------------------------------------------------------- /benchmarks/pyrepscan_bench.py: -------------------------------------------------------------------------------- 1 | import pyrepscan 2 | 3 | 4 | grs = pyrepscan.GitRepositoryScanner() 5 | grs.add_content_rule( 6 | name='AWS Manager ID', 7 | pattern=r'(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}', 8 | whitelist_patterns=[], 9 | blacklist_patterns=[], 10 | ) 11 | results = grs.scan( 12 | repository_path='/path/to/repository', 13 | branch_glob_pattern='*', 14 | ) 15 | print(len(results)) 16 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pyrepscan 4 | description: A Git Repository Secrets Scanner written in Rust 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyRepScan 9 | x-cortex-tag: pyrepscan 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-brand-security 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/images/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pyrepscan" 3 | version = "0.12.0" 4 | description = "A Git Repository Secrets Scanner written in Rust" 5 | authors = [ 6 | {email = "gal@intsights.com"}, 7 | {name = "Gal Ben David"} 8 | ] 9 | requires-python = ">=3.7" 10 | license = {file = "LICENSE"} 11 | classifiers = [ 12 | "License :: OSI Approved :: MIT License", 13 | "Operating System :: MacOS", 14 | "Operating System :: Microsoft", 15 | "Operating System :: POSIX :: Linux", 16 | "Programming Language :: Python :: 3.7", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Rust", 22 | ] 23 | 24 | [build-system] 25 | requires = ["maturin>=0.12,<0.13"] 26 | build-backend = "maturin" 27 | 28 | [tool.maturin] 29 | sdist-include = [ 30 | "Cargo.toml", 31 | 
"pyproject.toml", 32 | "pyrepscan/*.py", 33 | "pyrepscan/*.pyi", 34 | "src/*", 35 | ] 36 | 37 | [tool.poetry] 38 | name = "pyrepscan" 39 | version = "0.11.0" 40 | authors = ["Gal Ben David "] 41 | description = "A Git Repository Secrets Scanner written in Rust" 42 | readme = "README.md" 43 | repository = "https://github.com/intsights/pyrepscan" 44 | homepage = "https://github.com/intsights/pyrepscan" 45 | license = "MIT" 46 | keywords = [ 47 | "git", 48 | "secrets", 49 | "scanner", 50 | "rust", 51 | "pyo3" 52 | ] 53 | classifiers = [ 54 | "License :: OSI Approved :: MIT License", 55 | "Operating System :: MacOS", 56 | "Operating System :: Microsoft", 57 | "Operating System :: POSIX :: Linux", 58 | "Programming Language :: Python :: 3.7", 59 | "Programming Language :: Python :: 3.8", 60 | "Programming Language :: Python :: 3.9", 61 | "Programming Language :: Python :: 3.10", 62 | "Programming Language :: Python :: 3.11", 63 | "Programming Language :: Rust", 64 | ] 65 | 66 | [tool.poetry.dependencies] 67 | python = "^3.7" 68 | 69 | [tool.poetry.dev-dependencies] 70 | pytest = "*" 71 | gitpython = { git = "https://github.com/gitpython-developers/GitPython" } 72 | wheel = "*" 73 | pytest-runner = "*" 74 | maturin = "*" 75 | 76 | [tool.pytest.ini_options] 77 | minversion = "6.0" 78 | addopts = [ 79 | "--tb=native", 80 | "--pythonwarnings=all", 81 | ] 82 | testpaths = [ 83 | "tests", 84 | ] 85 | -------------------------------------------------------------------------------- /pyrepscan/__init__.py: -------------------------------------------------------------------------------- 1 | from . import pyrepscan 2 | 3 | 4 | GitRepositoryScanner = pyrepscan.GitRepositoryScanner 5 | RulesManager = pyrepscan.RulesManager 6 | -------------------------------------------------------------------------------- /pyrepscan/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/pyrepscan/py.typed -------------------------------------------------------------------------------- /pyrepscan/pyrepscan.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | class GitRepositoryScanner: 5 | def __init__( 6 | self, 7 | ) -> None: ... 8 | 9 | def add_content_rule( 10 | self, 11 | name: str, 12 | pattern: str, 13 | whitelist_patterns: typing.List[str], 14 | blacklist_patterns: typing.List[str], 15 | ) -> None: ... 16 | 17 | def add_file_path_rule( 18 | self, 19 | name: str, 20 | pattern: str, 21 | ) -> None: ... 22 | 23 | def add_file_extension_to_skip( 24 | self, 25 | file_extension: str, 26 | ) -> None: ... 27 | 28 | def add_file_path_to_skip( 29 | self, 30 | file_path: str, 31 | ) -> None: ... 32 | 33 | def scan( 34 | self, 35 | repository_path: str, 36 | branch_glob_pattern: typing.Optional[str], 37 | from_timestamp: typing.Optional[int], 38 | ) -> typing.List[typing.Dict[str, str]]: ... 39 | 40 | def scan_from_url( 41 | self, 42 | url: str, 43 | repository_path: str, 44 | branch_glob_pattern: typing.Optional[str], 45 | from_timestamp: typing.Optional[int], 46 | ) -> typing.List[typing.Dict[str, str]]: ... 47 | 48 | def get_file_content( 49 | self, 50 | repository_path: str, 51 | file_oid: str, 52 | ) -> bytes: ... 53 | 54 | 55 | class RulesManager: 56 | def __init__( 57 | self, 58 | ) -> None: ... 
59 | 60 | def add_content_rule( 61 | self, 62 | name: str, 63 | pattern: str, 64 | whitelist_patterns: typing.List[str], 65 | blacklist_patterns: typing.List[str], 66 | ) -> None: ... 67 | 68 | def add_file_path_rule( 69 | self, 70 | name: str, 71 | pattern: str, 72 | ) -> None: ... 73 | 74 | def add_file_extension_to_skip( 75 | self, 76 | file_extension: str, 77 | ) -> None: ... 78 | 79 | def add_file_path_to_skip( 80 | self, 81 | file_path: str, 82 | ) -> None: ... 83 | 84 | def should_scan_file_path( 85 | self, 86 | file_path: str, 87 | ) -> bool: ... 88 | 89 | def scan_file( 90 | self, 91 | file_path: str, 92 | content: typing.Optional[str], 93 | ) -> typing.Optional[typing.List[typing.Dict[str, str]]]: ... 94 | 95 | def check_pattern( 96 | self, 97 | content: str, 98 | pattern: str, 99 | ) -> typing.List[str]: ... 100 | -------------------------------------------------------------------------------- /src/git_repository_scanner.rs: -------------------------------------------------------------------------------- 1 | use crate::rules_manager; 2 | 3 | use chrono::prelude::*; 4 | use crossbeam_utils::atomic::AtomicCell; 5 | use crossbeam_utils::thread as crossbeam_thread; 6 | use crossbeam::queue::ArrayQueue; 7 | use git2::{Oid, Repository, Delta}; 8 | use parking_lot::Mutex; 9 | use pyo3::exceptions::PyRuntimeError; 10 | use pyo3::prelude::*; 11 | use std::collections::HashMap; 12 | use std::path::Path; 13 | use std::sync::Arc; 14 | use std::thread; 15 | use std::time; 16 | 17 | fn scan_commit_oid( 18 | should_stop: &AtomicCell, 19 | git_repo: &Repository, 20 | oid: &Oid, 21 | rules_manager: &rules_manager::RulesManager, 22 | output_matches: Arc>>>, 23 | ) -> Result<(), git2::Error> { 24 | let commit = git_repo.find_commit(*oid)?; 25 | 26 | let commit_parent_count = commit.parent_count(); 27 | if commit_parent_count > 1 { 28 | return Ok(()); 29 | } 30 | 31 | let commit_tree = commit.tree()?; 32 | 33 | let commit_diff = if commit_parent_count == 0 { 34 | git_repo.diff_tree_to_tree(None, Some(&commit_tree), None)? 35 | } else { 36 | let parent_commit = commit.parent(0)?; 37 | let parent_commit_tree = parent_commit.tree()?; 38 | 39 | git_repo.diff_tree_to_tree(Some(&parent_commit_tree), Some(&commit_tree), None)? 
40 | }; 41 | 42 | for delta in commit_diff.deltas() { 43 | if should_stop.load() { 44 | break; 45 | } 46 | 47 | match delta.status() { 48 | Delta::Added | Delta::Modified => {}, 49 | _ => continue, 50 | } 51 | 52 | let new_file = delta.new_file(); 53 | 54 | let delta_new_file_path = match new_file.path() { 55 | Some(path) => path.to_string_lossy().to_string(), 56 | None => continue, 57 | }; 58 | if !rules_manager.should_scan_file_path(&delta_new_file_path.to_ascii_lowercase()) { 59 | continue; 60 | } 61 | 62 | let delta_new_file_blob = match git_repo.find_blob(new_file.id()) { 63 | Ok(blob) => blob, 64 | Err(_) => continue, 65 | }; 66 | 67 | if delta_new_file_blob.size() < 2 { 68 | continue; 69 | } 70 | 71 | let delta_new_file_content = if delta_new_file_blob.is_binary() || delta_new_file_blob.size() > 5000000 { 72 | None 73 | } else { 74 | match std::str::from_utf8(delta_new_file_blob.content()) { 75 | Ok(content) => Some(content), 76 | Err(_) => None, 77 | } 78 | }; 79 | 80 | let scan_matches = rules_manager.scan_file(&delta_new_file_path, delta_new_file_content); 81 | if let Some(scan_matches) = scan_matches { 82 | for scan_match in scan_matches.iter() { 83 | let mut match_hashmap = HashMap::with_capacity(9); 84 | match_hashmap.insert( 85 | "commit_id", 86 | commit.id().to_string(), 87 | ); 88 | match_hashmap.insert( 89 | "commit_message", 90 | commit.message().unwrap_or("").to_string(), 91 | ); 92 | match_hashmap.insert( 93 | "commit_time", 94 | Utc.timestamp(commit.time().seconds(), 0).format("%Y-%m-%dT%H:%M:%S").to_string(), 95 | ); 96 | match_hashmap.insert( 97 | "author_name", 98 | commit.author().name().unwrap_or("").to_string(), 99 | ); 100 | match_hashmap.insert( 101 | "author_email", 102 | commit.author().email().unwrap_or("").to_string(), 103 | ); 104 | match_hashmap.insert( 105 | "file_path", 106 | new_file.path().unwrap_or_else(|| Path::new("")).to_string_lossy().to_string(), 107 | ); 108 | match_hashmap.insert( 109 | "file_oid", 110 | new_file.id().to_string(), 111 | ); 112 | match_hashmap.insert( 113 | "rule_name", 114 | scan_match.get("rule_name").unwrap_or(&String::from("")).to_string(), 115 | ); 116 | match_hashmap.insert( 117 | "match_text", 118 | scan_match.get("match_text").unwrap_or(&String::from("")).to_string(), 119 | ); 120 | output_matches.lock().push(match_hashmap); 121 | } 122 | } 123 | } 124 | 125 | Ok(()) 126 | } 127 | 128 | fn get_commit_oids( 129 | repository_path: &str, 130 | branch_glob_pattern: &str, 131 | from_timestamp: i64, 132 | ) -> Result, git2::Error>{ 133 | let git_repo = Repository::open(repository_path)?; 134 | 135 | let mut revwalk = git_repo.revwalk()?; 136 | revwalk.push_head()?; 137 | revwalk.set_sorting(git2::Sort::TIME)?; 138 | revwalk.push_glob(branch_glob_pattern)?; 139 | 140 | let mut oids = Vec::new(); 141 | for oid in revwalk.flatten() { 142 | if let Ok(commit) = git_repo.find_commit(oid) { 143 | if commit.time().seconds() >= from_timestamp { 144 | oids.push(oid); 145 | } 146 | } 147 | } 148 | 149 | Ok(oids) 150 | } 151 | 152 | pub fn scan_repository( 153 | py: &Python, 154 | repository_path: &str, 155 | branch_glob_pattern: &str, 156 | from_timestamp: i64, 157 | rules_manager: &rules_manager::RulesManager, 158 | output_matches: Arc>>>, 159 | ) -> PyResult<()> { 160 | let commit_oids_queue; 161 | 162 | match get_commit_oids( 163 | repository_path, 164 | branch_glob_pattern, 165 | from_timestamp 166 | ) { 167 | Ok(commit_oids) => { 168 | if commit_oids.is_empty() { 169 | return Ok(()); 170 | } 171 | 172 | commit_oids_queue = 
ArrayQueue::new(commit_oids.len()); 173 | for commit_oid in commit_oids { 174 | commit_oids_queue.push(commit_oid).unwrap(); 175 | } 176 | }, 177 | Err(error) => { 178 | return Err(PyRuntimeError::new_err(error.to_string())) 179 | }, 180 | } 181 | 182 | let mut py_signal_error: PyResult<()> = Ok(()); 183 | 184 | let should_stop = AtomicCell::new(false); 185 | let number_of_cores = std::thread::available_parallelism().unwrap().get(); 186 | 187 | crossbeam_thread::scope( 188 | |scope| { 189 | for _ in 0..number_of_cores { 190 | scope.spawn( 191 | |_| { 192 | if let Ok(git_repo) = Repository::open(repository_path) { 193 | while !should_stop.load() { 194 | if let Some(commit_oid) = commit_oids_queue.pop() { 195 | scan_commit_oid( 196 | &should_stop, 197 | &git_repo, 198 | &commit_oid, 199 | rules_manager, 200 | output_matches.clone(), 201 | ).unwrap_or(()); 202 | } else { 203 | break; 204 | } 205 | } 206 | }; 207 | } 208 | ); 209 | } 210 | 211 | while !commit_oids_queue.is_empty() { 212 | py_signal_error = py.check_signals(); 213 | if py_signal_error.is_err() { 214 | should_stop.store(true); 215 | 216 | break; 217 | } 218 | 219 | thread::sleep(time::Duration::from_millis(100)); 220 | } 221 | } 222 | ).unwrap_or_default(); 223 | 224 | py_signal_error?; 225 | 226 | Ok(()) 227 | } 228 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod git_repository_scanner; 2 | mod rules_manager; 3 | 4 | use git2::{Oid, Repository}; 5 | use parking_lot::Mutex; 6 | use pyo3::exceptions; 7 | use pyo3::prelude::*; 8 | use pyo3::types::PyBytes; 9 | use std::collections::HashMap; 10 | use std::path::Path; 11 | use std::sync::Arc; 12 | 13 | /// GitRepositoryScanner class 14 | /// A git repository scanner object 15 | /// 16 | /// input: 17 | /// None 18 | /// 19 | /// example: 20 | /// grs = pyrepscan.GitRepositoryScanner() 21 | #[pyclass] 22 | #[derive(Default)] 23 | struct GitRepositoryScanner { 24 | rules_manager: rules_manager::RulesManager, 25 | } 26 | 27 | #[pymethods] 28 | impl GitRepositoryScanner { 29 | #[new] 30 | fn new() -> Self { 31 | Self::default() 32 | } 33 | 34 | /// Adding a new content rule. A content rule is a rule that will be applied to the content of 35 | /// the commit changes. For every commit, each file will be scanned and its content will be scanned 36 | /// with the content rules. 37 | /// 38 | /// input: 39 | /// name: str -> The name of the rules. This will help to identify which rule has been matched. 40 | /// pattern: str -> The regex pattern. The pattern should be in Rust regex syntax. 41 | /// whitelist_patterns: list[str] -> A list of regex patterns. If this list is empty nothing happens. 42 | /// If the list contains one or more regex patterns, each regex pattern will be applied to to the 43 | /// matched content. There should be at least one regex pattern that matched to approve the secret. 44 | /// blacklist_patterns: list[str] -> A list of regex patterns. If this list is empty nothing happens. 45 | /// If the list contains one or more regex patterns, each regex pattern will be applied to to the 46 | /// matched content. There should be at least one regex pattern that matched to reject the secret. 
47 | /// 48 | /// returns: 49 | /// None 50 | /// 51 | /// example: 52 | /// grs.add_content_rule( 53 | /// name="Rule #1", 54 | /// pattern=r"password=([\d\w]+)", 55 | /// whitelist_patterns=[], 56 | /// blacklist_patterns=[ 57 | /// "(?:test|example|xxx|empty)", 58 | /// ], 59 | /// ) 60 | fn add_content_rule( 61 | &mut self, 62 | name: String, 63 | pattern: String, 64 | whitelist_patterns: Vec, 65 | blacklist_patterns: Vec, 66 | ) -> PyResult<()> { 67 | self.rules_manager.add_content_rule( 68 | name, 69 | pattern, 70 | whitelist_patterns, 71 | blacklist_patterns, 72 | ) 73 | } 74 | 75 | /// Adding a new file path rule. A file path rule is a rule that will be applied to the file path of 76 | /// the commit changes. For every commit, each file will be scanned. 77 | /// 78 | /// input: 79 | /// name: str -> The name of the rules. This will help to identify which rule has been matched. 80 | /// pattern: str -> The regex pattern. The pattern should be in Rust regex syntax. 81 | /// 82 | /// returns: 83 | /// None 84 | /// 85 | /// example: 86 | /// grs.add_file_path_rule( 87 | /// name="Rule #2", 88 | /// pattern=r".*\.(?:pem|cer)", 89 | /// ) 90 | fn add_file_path_rule( 91 | &mut self, 92 | name: String, 93 | pattern: String 94 | ) -> PyResult<()> { 95 | self.rules_manager.add_file_path_rule( 96 | name, 97 | pattern, 98 | ) 99 | } 100 | 101 | /// Adding a file extension to ignore during the scan. 102 | /// Every file with this extension would not be scanned. 103 | /// 104 | /// input: 105 | /// file_extension: str -> A file extension string. During a scan, the file path will be matched 106 | /// using an ends_with function meaning that it can be partial extension, with a dot, or without 107 | /// 108 | /// returns: 109 | /// None 110 | /// 111 | /// example: 112 | /// grs.add_file_extension_to_skip( 113 | /// file_extension="rar", 114 | /// ) 115 | /// grs.add_file_extension_to_skip( 116 | /// file_extension="tar.gz", 117 | /// ) 118 | fn add_file_extension_to_skip( 119 | &mut self, 120 | file_extension: String, 121 | ) -> PyResult<()> { 122 | self.rules_manager.add_file_extension_to_skip(file_extension) 123 | } 124 | 125 | /// Adding a file path pattern to skip during the scan. The pattern should be in a free text format. 126 | /// 127 | /// input: 128 | /// file_path: str -> A free text pattern to skip during the scan. If the scanned file path would contain 129 | /// this pattern, the scan will skip the file. 130 | /// 131 | /// returns: 132 | /// None 133 | /// 134 | /// example: 135 | /// grs.add_file_path_to_skip( 136 | /// file_extension="test", 137 | /// ) 138 | /// grs.add_file_path_to_skip( 139 | /// file_extension="example", 140 | /// ) 141 | fn add_file_path_to_skip( 142 | &mut self, 143 | file_path: String, 144 | ) -> PyResult<()> { 145 | self.rules_manager.add_file_path_to_skip(file_path) 146 | } 147 | 148 | /// Retrieves a file content using its ObjectID. 149 | /// 150 | /// input: 151 | /// repository_path: str -> Absolute path of the git repository directory. 
152 | /// file_oid: str -> The file OID in a string representation 153 | /// 154 | /// returns: 155 | /// bytes -> The file content in a binary representation 156 | /// 157 | /// example: 158 | /// grs.get_file_content( 159 | /// repository_path="/path/to/repository", 160 | /// file_oid="6b584e8ece562ebffc15d38808cd6b98fc3d97ea", 161 | /// ) 162 | fn get_file_content<'py>( 163 | &mut self, 164 | py: Python<'py>, 165 | repository_path: String, 166 | file_oid: String, 167 | ) -> PyResult<&'py PyBytes> { 168 | let git_repo = Repository::open(repository_path).map_err( 169 | |error| exceptions::PyRuntimeError::new_err(error.to_string()) 170 | )?; 171 | 172 | let oid = Oid::from_str(&file_oid).map_err( 173 | |error| exceptions::PyRuntimeError::new_err(error.to_string()) 174 | )?; 175 | 176 | let blob = git_repo.find_blob(oid).map_err( 177 | |error| exceptions::PyRuntimeError::new_err(error.to_string()) 178 | )?; 179 | 180 | let content = PyBytes::new(py, blob.content()); 181 | 182 | Ok(content) 183 | } 184 | 185 | /// Scan a git repository for secrets. Rules shuld be loaded before calling this function. 186 | /// 187 | /// input: 188 | /// repository_path: str -> Absolute path of the git repository directory. 189 | /// branch_glob_pattern: str -> A blob pattern to match against the git branches names. 190 | /// Only matched branches will be scanned. 191 | /// from_timestamp: int = 0 -> Unix epoch timestamp to start the scan from. 192 | /// 193 | /// returns: 194 | /// list[dict] -> List of matches 195 | /// 196 | /// example: 197 | /// grs.scan( 198 | /// repository_path="/path/to/repository", 199 | /// branch_glob_pattern="*", 200 | /// ) 201 | fn scan( 202 | &self, 203 | py: Python, 204 | repository_path: &str, 205 | branch_glob_pattern: Option<&str>, 206 | from_timestamp: Option, 207 | ) -> PyResult { 208 | let matches = Arc::new(Mutex::new(Vec::>::with_capacity(10000))); 209 | match git_repository_scanner::scan_repository( 210 | &py, 211 | repository_path, 212 | branch_glob_pattern.unwrap_or("*"), 213 | from_timestamp.unwrap_or(0), 214 | &self.rules_manager, 215 | matches.clone(), 216 | ) { 217 | Ok(_) => Ok(matches.lock().to_object(py)), 218 | Err(error) => Err(error), 219 | } 220 | } 221 | 222 | /// Scan a git repository for secrets. Rules shuld be loaded before calling this function. 223 | /// 224 | /// input: 225 | /// url: str -> URL of a git repository 226 | /// repository_path: str -> The path to clone the repository to 227 | /// branch_glob_pattern: str -> A blob pattern to match against the git branches names. 228 | /// Only matched branches will be scanned. 229 | /// from_timestamp: int = 0 -> Unix epoch timestamp to start the scan from. 
230 | /// 231 | /// returns: 232 | /// list[dict] -> List of matches 233 | /// 234 | /// example: 235 | /// grs.scan_from_url( 236 | /// url="https://github.com/rust-lang/git2-rs", 237 | /// repository_path="/path/to/repository", 238 | /// branch_glob_pattern="*", 239 | /// ) 240 | fn scan_from_url( 241 | &self, 242 | py: Python, 243 | url: &str, 244 | repository_path: &str, 245 | branch_glob_pattern: Option<&str>, 246 | from_timestamp: Option, 247 | ) -> PyResult { 248 | let mut builder = git2::build::RepoBuilder::new(); 249 | builder.bare(true); 250 | 251 | if let Err(error) = builder.clone(url, Path::new(repository_path).as_ref()) { 252 | return Err(exceptions::PyRuntimeError::new_err(error.to_string())); 253 | }; 254 | 255 | self.scan(py, repository_path, branch_glob_pattern, from_timestamp) 256 | } 257 | } 258 | 259 | /// PyRepScan is a Python library written in Rust. The library prodives an API to scan git repositories 260 | /// for leaked secrects via usage of rules. There are multiple types of rules that can be used to find 261 | /// leaked files and content. 262 | #[pymodule] 263 | fn pyrepscan( 264 | _py: Python, 265 | m: &PyModule, 266 | ) -> PyResult<()> { 267 | m.add_class::()?; 268 | m.add_class::()?; 269 | 270 | Ok(()) 271 | } 272 | -------------------------------------------------------------------------------- /src/rules_manager.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::collections::{HashMap, HashSet}; 3 | use regex::Regex; 4 | use pyo3::prelude::*; 5 | use pyo3::exceptions::PyRuntimeError; 6 | use aho_corasick::AhoCorasick; 7 | 8 | struct ContentRule { 9 | name: String, 10 | regex: Regex, 11 | whitelist_regexes: Vec, 12 | blacklist_regexes: Vec, 13 | } 14 | 15 | struct FilePathRule { 16 | name: String, 17 | regex: Regex, 18 | } 19 | 20 | #[pyclass] 21 | pub struct RulesManager { 22 | file_extensions_to_skip: HashSet, 23 | file_paths_to_skip: Vec, 24 | file_paths_to_skip_ac: Option, 25 | content_rules: Vec, 26 | file_path_rules: Vec, 27 | } 28 | 29 | impl Default for RulesManager { 30 | fn default() -> Self { 31 | Self::new() 32 | } 33 | } 34 | 35 | #[pymethods] 36 | impl RulesManager { 37 | #[new] 38 | pub fn new() -> Self { 39 | RulesManager { 40 | file_extensions_to_skip: HashSet::default(), 41 | file_paths_to_skip: Vec::default(), 42 | file_paths_to_skip_ac: None, 43 | content_rules: Vec::default(), 44 | file_path_rules: Vec::default(), 45 | } 46 | } 47 | 48 | pub fn add_content_rule( 49 | &mut self, 50 | name: String, 51 | pattern: String, 52 | whitelist_patterns: Vec, 53 | blacklist_patterns: Vec, 54 | ) -> PyResult<()> { 55 | if name.is_empty() || pattern.is_empty() { 56 | return Err( 57 | PyRuntimeError::new_err("Rule name and pattern can not be empty") 58 | ) 59 | } 60 | 61 | let regex = match Regex::new(&pattern) { 62 | Ok(regex) => regex, 63 | Err(error) => { 64 | return Err( 65 | PyRuntimeError::new_err( 66 | format!("Invalid regex pattern: {error}") 67 | ) 68 | ) 69 | }, 70 | }; 71 | if regex.captures_len() != 2 { 72 | return Err( 73 | PyRuntimeError::new_err( 74 | format!("Matching regex pattern must have exactly one capturing group: {pattern}") 75 | ) 76 | ); 77 | } 78 | 79 | let mut whitelist_regexes = Vec::new(); 80 | for whitelist_pattern in whitelist_patterns.iter() { 81 | let whitelist_regex = match Regex::new(whitelist_pattern) { 82 | Ok(whitelist_regex) => whitelist_regex, 83 | Err(error) => { 84 | return Err( 85 | PyRuntimeError::new_err( 86 | format!("Invalid whitelist 
regex pattern: {error}") 87 | ) 88 | ) 89 | }, 90 | }; 91 | if whitelist_regex.captures_len() != 1 { 92 | return Err( 93 | PyRuntimeError::new_err( 94 | format!("Whitelist regex pattern must not have a capturing group: {whitelist_pattern}") 95 | ) 96 | ); 97 | } 98 | whitelist_regexes.push(whitelist_regex); 99 | } 100 | 101 | let mut blacklist_regexes = Vec::new(); 102 | for blacklist_pattern in blacklist_patterns.iter() { 103 | let blacklist_regex = match Regex::new(blacklist_pattern) { 104 | Ok(blacklist_regex) => blacklist_regex, 105 | Err(error) => { 106 | return Err( 107 | PyRuntimeError::new_err( 108 | format!("Invalid blacklist regex pattern: {error}") 109 | ) 110 | ) 111 | }, 112 | }; 113 | if blacklist_regex.captures_len() != 1 { 114 | return Err( 115 | PyRuntimeError::new_err( 116 | format!("Blacklist regex pattern must not have a capturing group: {blacklist_pattern}") 117 | ) 118 | ); 119 | } 120 | blacklist_regexes.push(blacklist_regex); 121 | } 122 | 123 | let content_rule = ContentRule { 124 | name, 125 | regex, 126 | whitelist_regexes, 127 | blacklist_regexes, 128 | }; 129 | self.content_rules.push(content_rule); 130 | 131 | Ok(()) 132 | } 133 | 134 | pub fn add_file_path_rule( 135 | &mut self, 136 | name: String, 137 | pattern: String, 138 | ) -> PyResult<()> { 139 | if name.is_empty() || pattern.is_empty() { 140 | return Err( 141 | PyRuntimeError::new_err("Rule name and pattern can not be empty") 142 | ) 143 | } 144 | 145 | let regex = match Regex::new(&pattern) { 146 | Ok(regex) => regex, 147 | Err(error) => { 148 | return Err( 149 | PyRuntimeError::new_err( 150 | format!("Invalid regex pattern: {error}") 151 | ) 152 | ) 153 | } 154 | }; 155 | 156 | let file_path_rule = FilePathRule { name, regex }; 157 | self.file_path_rules.push(file_path_rule); 158 | 159 | Ok(()) 160 | } 161 | 162 | pub fn add_file_extension_to_skip( 163 | &mut self, 164 | file_extension: String, 165 | ) -> PyResult<()> { 166 | if file_extension.is_empty() { 167 | return Err( 168 | PyRuntimeError::new_err("File extension can not be empty") 169 | ) 170 | } 171 | self.file_extensions_to_skip.insert(file_extension.to_ascii_lowercase()); 172 | 173 | Ok(()) 174 | } 175 | 176 | pub fn add_file_path_to_skip( 177 | &mut self, 178 | file_path: String, 179 | ) -> PyResult<()> { 180 | if file_path.is_empty() { 181 | return Err( 182 | PyRuntimeError::new_err("File path can not be empty") 183 | ) 184 | } 185 | self.file_paths_to_skip.push(file_path.to_ascii_lowercase()); 186 | self.file_paths_to_skip_ac = Some( 187 | AhoCorasick::new_auto_configured( 188 | self.file_paths_to_skip.as_slice() 189 | ) 190 | ); 191 | 192 | Ok(()) 193 | } 194 | 195 | pub fn should_scan_file_path( 196 | &self, 197 | file_path: &str, 198 | ) -> bool { 199 | if let Some(file_extension) = Path::new(file_path).extension() { 200 | if self.file_extensions_to_skip.contains(file_extension.to_string_lossy().as_ref()) { 201 | return false; 202 | } 203 | } 204 | 205 | if let Some(file_paths_to_skip_patterns) = &self.file_paths_to_skip_ac { 206 | if file_paths_to_skip_patterns.is_match(file_path) { 207 | return false; 208 | } 209 | } 210 | 211 | true 212 | } 213 | 214 | pub fn scan_file( 215 | &self, 216 | file_path: &str, 217 | content: Option<&str>, 218 | ) -> Option>> { 219 | let mut scan_matches = Vec::new(); 220 | 221 | for file_path_rule in self.file_path_rules.iter() { 222 | if file_path_rule.regex.is_match(file_path) { 223 | let mut scan_match = HashMap::<&str, String>::new(); 224 | scan_match.insert("rule_name", 
file_path_rule.name.clone()); 225 | scan_match.insert("match_text", file_path.to_string()); 226 | scan_matches.push(scan_match); 227 | } 228 | } 229 | 230 | if let Some(content) = content { 231 | for content_rule in self.content_rules.iter() { 232 | for match_text in content_rule.regex.find_iter(content) { 233 | if content_rule.blacklist_regexes.iter().any( 234 | |blacklist_regex| blacklist_regex.is_match(match_text.as_str()) 235 | ) { 236 | continue; 237 | } 238 | if !content_rule.whitelist_regexes.is_empty() && !content_rule.whitelist_regexes.iter().any( 239 | |whitelist_regex| whitelist_regex.is_match(match_text.as_str()) 240 | ) { 241 | continue; 242 | } 243 | 244 | let mut scan_match = HashMap::<&str, String>::new(); 245 | scan_match.insert("rule_name", content_rule.name.clone()); 246 | scan_match.insert("match_text", match_text.as_str().to_string()); 247 | scan_matches.push(scan_match); 248 | } 249 | } 250 | } 251 | 252 | if scan_matches.is_empty() { 253 | None 254 | } else { 255 | Some(scan_matches) 256 | } 257 | } 258 | 259 | pub fn check_pattern( 260 | &mut self, 261 | content: String, 262 | pattern: String 263 | ) -> PyResult> { 264 | let regex = match Regex::new(&pattern) { 265 | Ok(regex) => regex, 266 | Err(error) => { 267 | return Err( 268 | PyRuntimeError::new_err( 269 | format!("Invalid regex pattern: {error}") 270 | ) 271 | ) 272 | }, 273 | }; 274 | if regex.captures_len() != 2 { 275 | return Err( 276 | PyRuntimeError::new_err( 277 | format!("Matching regex pattern must have exactly one capturing group: {pattern}") 278 | ) 279 | ); 280 | } 281 | 282 | let mut matches = Vec::new(); 283 | for matched in regex.find_iter(&content) { 284 | matches.push(matched.as_str().to_string()); 285 | } 286 | 287 | Ok(matches) 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_git_repository_scanner.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tempfile 3 | import git 4 | import datetime 5 | 6 | import pyrepscan 7 | 8 | 9 | class GitRepositoryScannerTestCase( 10 | unittest.TestCase, 11 | ): 12 | def setUp( 13 | self, 14 | ): 15 | self.tmpdir = tempfile.TemporaryDirectory() 16 | self.addCleanup(self.tmpdir.cleanup) 17 | 18 | bare_repo = git.Repo.init( 19 | path=self.tmpdir.name, 20 | ) 21 | test_author = git.Actor( 22 | name='Author Name', 23 | email='test@author.email', 24 | ) 25 | 26 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile: 27 | tmpfile.write('content') 28 | with open(f'{self.tmpdir.name}/file.py', 'w') as tmpfile: 29 | tmpfile.write('content') 30 | with open(f'{self.tmpdir.name}/prod_env.key', 'w') as tmpfile: 31 | tmpfile.write('') 32 | with open(f'{self.tmpdir.name}/prod_env_with_content.key', 'w') as tmpfile: 33 | tmpfile.write('some_key') 34 | with open(f'{self.tmpdir.name}/file.other', 'w') as tmpfile: 35 | tmpfile.write('nothing special') 36 | with open(f'{self.tmpdir.name}/test_file.cpp', 'w') as tmpfile: 37 | tmpfile.write('content') 38 | bare_repo.index.add( 39 | items=[ 40 | f'{self.tmpdir.name}/file.txt', 41 | f'{self.tmpdir.name}/file.py', 42 | f'{self.tmpdir.name}/prod_env.key', 43 | f'{self.tmpdir.name}/prod_env_with_content.key', 44 
| f'{self.tmpdir.name}/file.other', 45 | f'{self.tmpdir.name}/test_file.cpp', 46 | ], 47 | ) 48 | 49 | bare_repo.index.commit( 50 | message='initial commit', 51 | author=test_author, 52 | commit_date='2000-01-01T00:00:00', 53 | author_date='2000-01-01T00:00:00', 54 | ) 55 | 56 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile: 57 | tmpfile.write('new content') 58 | bare_repo.index.add( 59 | items=[ 60 | f'{self.tmpdir.name}/file.txt', 61 | ], 62 | ) 63 | bare_repo.index.commit( 64 | message='edited file', 65 | author=test_author, 66 | commit_date='2001-01-01T00:00:00', 67 | author_date='2001-01-01T00:00:00', 68 | ) 69 | 70 | new_branch = bare_repo.create_head('new_branch') 71 | bare_repo.head.reference = bare_repo.heads[1] 72 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile: 73 | tmpfile.write('new content from new branch') 74 | bare_repo.index.add( 75 | items=[ 76 | f'{self.tmpdir.name}/file.txt', 77 | ], 78 | ) 79 | bare_repo.index.commit( 80 | message='edited file in new branch', 81 | author=test_author, 82 | commit_date='2002-01-01T00:00:00', 83 | author_date='2002-01-01T00:00:00', 84 | ) 85 | bare_repo.head.reference = bare_repo.heads.master 86 | bare_repo.head.reset( 87 | index=True, 88 | working_tree=True, 89 | ) 90 | 91 | merge_base = bare_repo.merge_base( 92 | new_branch, 93 | bare_repo.heads.master, 94 | ) 95 | bare_repo.index.merge_tree( 96 | rhs=bare_repo.heads.master, 97 | base=merge_base, 98 | ) 99 | bare_repo.index.commit( 100 | message='merge from new branch', 101 | author=test_author, 102 | commit_date='2003-01-01T00:00:00', 103 | author_date='2003-01-01T00:00:00', 104 | parent_commits=( 105 | new_branch.commit, 106 | bare_repo.heads.master.commit, 107 | ), 108 | ) 109 | 110 | new_branch = bare_repo.create_head('non_merged_branch') 111 | bare_repo.head.reference = bare_repo.heads[2] 112 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile: 113 | tmpfile.write('new content from non_merged_branch') 114 | bare_repo.index.add( 115 | items=[ 116 | f'{self.tmpdir.name}/file.txt', 117 | ], 118 | ) 119 | bare_repo.index.commit( 120 | message='edited file in non_merged_branch', 121 | author=test_author, 122 | commit_date='2004-01-01T00:00:00', 123 | author_date='2004-01-01T00:00:00', 124 | ) 125 | 126 | bare_repo.head.reference = bare_repo.heads.master 127 | bare_repo.close() 128 | 129 | def test_scan_regular( 130 | self, 131 | ): 132 | grs = pyrepscan.GitRepositoryScanner() 133 | grs.add_content_rule( 134 | name='First Rule', 135 | pattern=r'''(content)''', 136 | whitelist_patterns=[], 137 | blacklist_patterns=[], 138 | ) 139 | 140 | grs.add_file_extension_to_skip('py') 141 | grs.add_file_path_to_skip('test_') 142 | 143 | results = grs.scan( 144 | repository_path=self.tmpdir.name, 145 | branch_glob_pattern='*master', 146 | from_timestamp=0, 147 | ) 148 | for result in results: 149 | result.pop('commit_id') 150 | self.assertCountEqual( 151 | first=results, 152 | second=[ 153 | { 154 | 'author_email': 'test@author.email', 155 | 'author_name': 'Author Name', 156 | 'commit_message': 'edited file', 157 | 'commit_time': '2001-01-01T00:00:00', 158 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a', 159 | 'file_path': 'file.txt', 160 | 'match_text': 'content', 161 | 'rule_name': 'First Rule' 162 | }, 163 | { 164 | 'author_email': 'test@author.email', 165 | 'author_name': 'Author Name', 166 | 'commit_message': 'edited file in new branch', 167 | 'commit_time': '2002-01-01T00:00:00', 168 | 'file_oid': '0407a18f7c6802c7e7ddc5c9e8af4a34584383ff', 169 | 
'file_path': 'file.txt', 170 | 'match_text': 'content', 171 | 'rule_name': 'First Rule' 172 | }, 173 | { 174 | 'author_email': 'test@author.email', 175 | 'author_name': 'Author Name', 176 | 'commit_message': 'initial commit', 177 | 'commit_time': '2000-01-01T00:00:00', 178 | 'file_oid': '6b584e8ece562ebffc15d38808cd6b98fc3d97ea', 179 | 'file_path': 'file.txt', 180 | 'match_text': 'content', 181 | 'rule_name': 'First Rule' 182 | }, 183 | ], 184 | ) 185 | 186 | results = grs.scan( 187 | repository_path=self.tmpdir.name, 188 | branch_glob_pattern='*', 189 | from_timestamp=0, 190 | ) 191 | for result in results: 192 | result.pop('commit_id') 193 | self.assertCountEqual( 194 | first=results, 195 | second=[ 196 | { 197 | 'author_email': 'test@author.email', 198 | 'author_name': 'Author Name', 199 | 'commit_message': 'edited file', 200 | 'commit_time': '2001-01-01T00:00:00', 201 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a', 202 | 'file_path': 'file.txt', 203 | 'match_text': 'content', 204 | 'rule_name': 'First Rule' 205 | }, 206 | { 207 | 'author_email': 'test@author.email', 208 | 'author_name': 'Author Name', 209 | 'commit_message': 'edited file in new branch', 210 | 'commit_time': '2002-01-01T00:00:00', 211 | 'file_oid': '0407a18f7c6802c7e7ddc5c9e8af4a34584383ff', 212 | 'file_path': 'file.txt', 213 | 'match_text': 'content', 214 | 'rule_name': 'First Rule' 215 | }, 216 | { 217 | 'author_email': 'test@author.email', 218 | 'author_name': 'Author Name', 219 | 'commit_message': 'initial commit', 220 | 'commit_time': '2000-01-01T00:00:00', 221 | 'file_oid': '6b584e8ece562ebffc15d38808cd6b98fc3d97ea', 222 | 'file_path': 'file.txt', 223 | 'match_text': 'content', 224 | 'rule_name': 'First Rule' 225 | }, 226 | { 227 | 'author_email': 'test@author.email', 228 | 'author_name': 'Author Name', 229 | 'commit_message': 'edited file in non_merged_branch', 230 | 'commit_time': '2004-01-01T00:00:00', 231 | 'file_oid': '057032a2108721ad1de6a9240fd1a8f45bc3f2ef', 232 | 'file_path': 'file.txt', 233 | 'match_text': 'content', 234 | 'rule_name': 'First Rule' 235 | }, 236 | ], 237 | ) 238 | 239 | self.assertEqual( 240 | first=b'new content', 241 | second=grs.get_file_content( 242 | repository_path=self.tmpdir.name, 243 | file_oid='47d2739ba2c34690248c8f91b84bb54e8936899a', 244 | ), 245 | ) 246 | self.assertEqual( 247 | first=b'new content from new branch', 248 | second=grs.get_file_content( 249 | repository_path=self.tmpdir.name, 250 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383ff', 251 | ), 252 | ) 253 | self.assertEqual( 254 | first=b'content', 255 | second=grs.get_file_content( 256 | repository_path=self.tmpdir.name, 257 | file_oid='6b584e8ece562ebffc15d38808cd6b98fc3d97ea', 258 | ), 259 | ) 260 | 261 | def test_scan_from_timestamp( 262 | self, 263 | ): 264 | grs = pyrepscan.GitRepositoryScanner() 265 | grs.add_content_rule( 266 | name='First Rule', 267 | pattern=r'''(content)''', 268 | whitelist_patterns=[], 269 | blacklist_patterns=[], 270 | ) 271 | 272 | grs.add_file_extension_to_skip('py') 273 | grs.add_file_path_to_skip('test_') 274 | 275 | results = grs.scan( 276 | repository_path=self.tmpdir.name, 277 | branch_glob_pattern='*', 278 | from_timestamp=int( 279 | datetime.datetime( 280 | year=2004, 281 | month=1, 282 | day=1, 283 | hour=0, 284 | minute=0, 285 | second=0, 286 | tzinfo=datetime.timezone.utc, 287 | ).timestamp() 288 | ), 289 | ) 290 | for result in results: 291 | result.pop('commit_id') 292 | self.assertCountEqual( 293 | first=results, 294 | second=[ 295 | { 296 | 'author_email': 
'test@author.email', 297 | 'author_name': 'Author Name', 298 | 'commit_message': 'edited file in non_merged_branch', 299 | 'commit_time': '2004-01-01T00:00:00', 300 | 'file_oid': '057032a2108721ad1de6a9240fd1a8f45bc3f2ef', 301 | 'file_path': 'file.txt', 302 | 'match_text': 'content', 303 | 'rule_name': 'First Rule' 304 | }, 305 | ], 306 | ) 307 | 308 | results = grs.scan( 309 | repository_path=self.tmpdir.name, 310 | branch_glob_pattern='*', 311 | from_timestamp=int( 312 | datetime.datetime( 313 | year=2004, 314 | month=1, 315 | day=1, 316 | hour=0, 317 | minute=0, 318 | second=1, 319 | tzinfo=datetime.timezone.utc, 320 | ).timestamp() 321 | ), 322 | ) 323 | for result in results: 324 | result.pop('commit_id') 325 | self.assertListEqual( 326 | list1=results, 327 | list2=[], 328 | ) 329 | 330 | def test_scan_file_name( 331 | self, 332 | ): 333 | grs = pyrepscan.GitRepositoryScanner() 334 | grs.add_file_path_rule( 335 | name='First Rule', 336 | pattern=r'(prod|dev|stage).+key', 337 | ) 338 | 339 | results = grs.scan( 340 | repository_path=self.tmpdir.name, 341 | branch_glob_pattern='*', 342 | ) 343 | for result in results: 344 | result.pop('commit_id') 345 | self.assertCountEqual( 346 | first=results, 347 | second=[ 348 | { 349 | 'author_email': 'test@author.email', 350 | 'author_name': 'Author Name', 351 | 'commit_message': 'initial commit', 352 | 'commit_time': '2000-01-01T00:00:00', 353 | 'file_oid': 'ec3741ea9c00bc5cd88564e49fd81d2340a5582f', 354 | 'file_path': 'prod_env_with_content.key', 355 | 'match_text': 'prod_env_with_content.key', 356 | 'rule_name': 'First Rule' 357 | }, 358 | ], 359 | ) 360 | 361 | def test_get_file_content( 362 | self, 363 | ): 364 | grs = pyrepscan.GitRepositoryScanner() 365 | 366 | self.assertEqual( 367 | first=grs.get_file_content( 368 | repository_path=self.tmpdir.name, 369 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383ff', 370 | ), 371 | second=b'new content from new branch', 372 | ) 373 | 374 | def test_scan_exceptions( 375 | self, 376 | ): 377 | grs = pyrepscan.GitRepositoryScanner() 378 | 379 | with self.assertRaises( 380 | expected_exception=RuntimeError, 381 | ): 382 | grs.scan( 383 | repository_path='/non/existent/path', 384 | ) 385 | 386 | def test_add_content_rule_exceptions( 387 | self, 388 | ): 389 | grs = pyrepscan.GitRepositoryScanner() 390 | 391 | with self.assertRaises( 392 | expected_exception=RuntimeError, 393 | ): 394 | grs.add_content_rule( 395 | name='', 396 | pattern=r'regex', 397 | whitelist_patterns=[], 398 | blacklist_patterns=[], 399 | ) 400 | 401 | def test_add_file_path_rule_exceptions( 402 | self, 403 | ): 404 | grs = pyrepscan.GitRepositoryScanner() 405 | 406 | with self.assertRaises( 407 | expected_exception=RuntimeError, 408 | ): 409 | grs.add_file_path_rule( 410 | name='', 411 | pattern=r'regex', 412 | ) 413 | 414 | def test_add_file_extension_to_skip_exceptions( 415 | self, 416 | ): 417 | grs = pyrepscan.GitRepositoryScanner() 418 | 419 | with self.assertRaises( 420 | expected_exception=RuntimeError, 421 | ): 422 | grs.add_file_extension_to_skip( 423 | file_extension='', 424 | ) 425 | 426 | def test_add_file_path_to_skip_exceptions( 427 | self, 428 | ): 429 | grs = pyrepscan.GitRepositoryScanner() 430 | 431 | with self.assertRaises( 432 | expected_exception=RuntimeError, 433 | ): 434 | grs.add_file_path_to_skip( 435 | file_path='', 436 | ) 437 | 438 | def test_get_file_content_exceptions( 439 | self, 440 | ): 441 | grs = pyrepscan.GitRepositoryScanner() 442 | 443 | with self.assertRaises( 444 | 
expected_exception=RuntimeError, 445 | ): 446 | grs.get_file_content( 447 | repository_path=self.tmpdir.name, 448 | file_oid='', 449 | ) 450 | 451 | with self.assertRaises( 452 | expected_exception=RuntimeError, 453 | ): 454 | grs.get_file_content( 455 | repository_path=self.tmpdir.name, 456 | file_oid='aaaaaaaaa', 457 | ) 458 | 459 | with self.assertRaises( 460 | expected_exception=RuntimeError, 461 | ): 462 | grs.get_file_content( 463 | repository_path=self.tmpdir.name, 464 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383fa', 465 | ) 466 | -------------------------------------------------------------------------------- /tests/test_rules_manager.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import pyrepscan 4 | 5 | 6 | class RulesManagerTestCase( 7 | unittest.TestCase, 8 | ): 9 | def test_should_scan_file_ignored_extensions( 10 | self, 11 | ): 12 | rules_manager = pyrepscan.RulesManager() 13 | 14 | self.assertTrue( 15 | expr=rules_manager.should_scan_file_path('file.txt'), 16 | ) 17 | rules_manager.add_file_extension_to_skip('txt') 18 | self.assertFalse( 19 | expr=rules_manager.should_scan_file_path('file.txt'), 20 | ) 21 | 22 | rules_manager.add_file_extension_to_skip('pdf') 23 | self.assertFalse( 24 | expr=rules_manager.should_scan_file_path('file.txt'), 25 | ) 26 | self.assertFalse( 27 | expr=rules_manager.should_scan_file_path('file.pdf'), 28 | ) 29 | self.assertFalse( 30 | expr=rules_manager.should_scan_file_path('file.other.pdf'), 31 | ) 32 | self.assertTrue( 33 | expr=rules_manager.should_scan_file_path('file.pdf.other'), 34 | ) 35 | self.assertTrue( 36 | expr=rules_manager.should_scan_file_path('file.doc'), 37 | ) 38 | 39 | def test_should_scan_file_ignored_file_paths( 40 | self, 41 | ): 42 | rules_manager = pyrepscan.RulesManager() 43 | 44 | self.assertTrue( 45 | expr=rules_manager.should_scan_file_path('/site-packages/file.txt'), 46 | ) 47 | 48 | rules_manager.add_file_path_to_skip('site-packages') 49 | self.assertFalse( 50 | expr=rules_manager.should_scan_file_path('/site-packages/file.txt'), 51 | ) 52 | self.assertTrue( 53 | expr=rules_manager.should_scan_file_path('/folder_one/subfolder/file.txt'), 54 | ) 55 | 56 | rules_manager.add_file_path_to_skip('folder_one/subfolder') 57 | self.assertFalse( 58 | expr=rules_manager.should_scan_file_path('/folder_one/subfolder/file.txt'), 59 | ) 60 | self.assertTrue( 61 | expr=rules_manager.should_scan_file_path('/folder_one/sub/file.txt'), 62 | ) 63 | 64 | rules_manager.add_file_path_to_skip('part/name') 65 | self.assertFalse( 66 | expr=rules_manager.should_scan_file_path('some_part/name_some'), 67 | ) 68 | 69 | def test_add_content_rule_one( 70 | self, 71 | ): 72 | rules_manager = pyrepscan.RulesManager() 73 | rules_manager.add_content_rule( 74 | name='rule_one', 75 | pattern=r'([a-z]+)', 76 | whitelist_patterns=[], 77 | blacklist_patterns=[], 78 | ) 79 | 80 | self.assertEqual( 81 | first=rules_manager.scan_file( 82 | file_path='', 83 | content='first line\nsecond line\nthird line', 84 | ), 85 | second=[ 86 | { 87 | 'match_text': 'first', 88 | 'rule_name': 'rule_one', 89 | }, 90 | { 91 | 'match_text': 'line', 92 | 'rule_name': 'rule_one', 93 | }, 94 | { 95 | 'match_text': 'second', 96 | 'rule_name': 'rule_one', 97 | }, 98 | { 99 | 'match_text': 'line', 100 | 'rule_name': 'rule_one', 101 | }, 102 | { 103 | 'match_text': 'third', 104 | 'rule_name': 'rule_one', 105 | }, 106 | { 107 | 'match_text': 'line', 108 | 'rule_name': 'rule_one', 109 | }, 110 | ], 111 | ) 112 | 113 
| def test_add_content_rule_two( 114 | self, 115 | ): 116 | rules_manager = pyrepscan.RulesManager() 117 | rules_manager.add_content_rule( 118 | name='rule_one', 119 | pattern=r'([a-z]+)', 120 | whitelist_patterns=[], 121 | blacklist_patterns=[ 122 | r'line', 123 | ], 124 | ) 125 | 126 | self.assertEqual( 127 | first=rules_manager.scan_file( 128 | file_path='', 129 | content='first line\nsecond line\nthird line', 130 | ), 131 | second=[ 132 | { 133 | 'match_text': 'first', 134 | 'rule_name': 'rule_one', 135 | }, 136 | { 137 | 'match_text': 'second', 138 | 'rule_name': 'rule_one', 139 | }, 140 | { 141 | 'match_text': 'third', 142 | 'rule_name': 'rule_one', 143 | }, 144 | ], 145 | ) 146 | 147 | def test_add_content_rule_three( 148 | self, 149 | ): 150 | rules_manager = pyrepscan.RulesManager() 151 | rules_manager.add_content_rule( 152 | name='rule_one', 153 | pattern=r'([a-z]+)', 154 | whitelist_patterns=[ 155 | 'second', 156 | 'third', 157 | ], 158 | blacklist_patterns=[], 159 | ) 160 | 161 | self.assertEqual( 162 | first=rules_manager.scan_file( 163 | file_path='', 164 | content='first line\nsecond line\nthird line', 165 | ), 166 | second=[ 167 | { 168 | 'match_text': 'second', 169 | 'rule_name': 'rule_one', 170 | }, 171 | { 172 | 'match_text': 'third', 173 | 'rule_name': 'rule_one', 174 | }, 175 | ], 176 | ) 177 | 178 | def test_add_content_rule_four( 179 | self, 180 | ): 181 | rules_manager = pyrepscan.RulesManager() 182 | rules_manager.add_content_rule( 183 | name='rule_one', 184 | pattern=r'([a-z]+)', 185 | whitelist_patterns=[ 186 | 'second', 187 | 'third', 188 | ], 189 | blacklist_patterns=[ 190 | r'nd$', 191 | ], 192 | ) 193 | 194 | self.assertEqual( 195 | first=rules_manager.scan_file( 196 | file_path='', 197 | content='first line\nsecond line\nthird line', 198 | ), 199 | second=[ 200 | { 201 | 'match_text': 'third', 202 | 'rule_name': 'rule_one', 203 | }, 204 | ], 205 | ) 206 | 207 | def test_add_content_rule_five( 208 | self, 209 | ): 210 | rules_manager = pyrepscan.RulesManager() 211 | rules_manager.add_content_rule( 212 | name='rule_one', 213 | pattern=r'(nothing)', 214 | whitelist_patterns=[], 215 | blacklist_patterns=[], 216 | ) 217 | 218 | self.assertIsNone( 219 | obj=rules_manager.scan_file( 220 | file_path='', 221 | content='first line\nsecond line\nthird line', 222 | ), 223 | ) 224 | 225 | def test_add_content_rule_exceptions( 226 | self, 227 | ): 228 | rules_manager = pyrepscan.RulesManager() 229 | 230 | with self.assertRaises( 231 | expected_exception=RuntimeError, 232 | ): 233 | rules_manager.add_content_rule( 234 | name='', 235 | pattern=r'regex', 236 | whitelist_patterns=[], 237 | blacklist_patterns=[], 238 | ) 239 | 240 | with self.assertRaises( 241 | expected_exception=RuntimeError, 242 | ): 243 | rules_manager.add_content_rule( 244 | name='rule_one', 245 | pattern=r'', 246 | whitelist_patterns=[], 247 | blacklist_patterns=[], 248 | ) 249 | 250 | with self.assertRaises( 251 | expected_exception=RuntimeError, 252 | ): 253 | rules_manager.add_content_rule( 254 | name='rule_one', 255 | pattern=r'(', 256 | whitelist_patterns=[], 257 | blacklist_patterns=[], 258 | ) 259 | 260 | with self.assertRaises( 261 | expected_exception=RuntimeError, 262 | ): 263 | rules_manager.add_content_rule( 264 | name='rule_one', 265 | pattern=r'regex_pattern_without_capturing_group', 266 | whitelist_patterns=[], 267 | blacklist_patterns=[], 268 | ) 269 | 270 | with self.assertRaises( 271 | expected_exception=RuntimeError, 272 | ): 273 | rules_manager.add_content_rule( 274 | name='rule_two', 
275 | pattern=r'(content)', 276 | whitelist_patterns=[], 277 | blacklist_patterns=[ 278 | '(', 279 | ], 280 | ) 281 | 282 | with self.assertRaises( 283 | expected_exception=RuntimeError, 284 | ): 285 | rules_manager.add_content_rule( 286 | name='rule_two', 287 | pattern=r'(content)', 288 | whitelist_patterns=[], 289 | blacklist_patterns=[ 290 | '(blacklist_regex_with_capturing_group)', 291 | ], 292 | ) 293 | 294 | with self.assertRaises( 295 | expected_exception=RuntimeError, 296 | ): 297 | rules_manager.add_content_rule( 298 | name='rule_two', 299 | pattern=r'(content)', 300 | whitelist_patterns=[ 301 | '(', 302 | ], 303 | blacklist_patterns=[], 304 | ) 305 | 306 | with self.assertRaises( 307 | expected_exception=RuntimeError, 308 | ): 309 | rules_manager.add_content_rule( 310 | name='rule_two', 311 | pattern=r'(content)', 312 | whitelist_patterns=[ 313 | '(whitelist_regex_with_capturing_group)', 314 | ], 315 | blacklist_patterns=[], 316 | ) 317 | 318 | def test_add_file_path_rule_one( 319 | self, 320 | ): 321 | rules_manager = pyrepscan.RulesManager() 322 | rules_manager.add_file_path_rule( 323 | name='rule_one', 324 | pattern=r'(prod|dev|stage).+key', 325 | ) 326 | 327 | self.assertIsNone( 328 | obj=rules_manager.scan_file( 329 | file_path='workdir/prod/some_file', 330 | content=None, 331 | ), 332 | ) 333 | self.assertEqual( 334 | first=rules_manager.scan_file( 335 | file_path='workdir/prod/some_file.key', 336 | content=None, 337 | ), 338 | second=[ 339 | { 340 | 'match_text': 'workdir/prod/some_file.key', 341 | 'rule_name': 'rule_one', 342 | }, 343 | ], 344 | ) 345 | 346 | rules_manager.add_file_path_rule( 347 | name='rule_two', 348 | pattern=r'prod.+key', 349 | ) 350 | 351 | self.assertIsNone( 352 | obj=rules_manager.scan_file( 353 | file_path='workdir/prod/some_file', 354 | content=None, 355 | ), 356 | ) 357 | self.assertEqual( 358 | first=rules_manager.scan_file( 359 | file_path='workdir/prod/some_file.key', 360 | content=None, 361 | ), 362 | second=[ 363 | { 364 | 'match_text': 'workdir/prod/some_file.key', 365 | 'rule_name': 'rule_one', 366 | }, 367 | { 368 | 'match_text': 'workdir/prod/some_file.key', 369 | 'rule_name': 'rule_two', 370 | }, 371 | ], 372 | ) 373 | 374 | def test_add_file_path_rule_exceptions( 375 | self, 376 | ): 377 | rules_manager = pyrepscan.RulesManager() 378 | 379 | with self.assertRaises( 380 | expected_exception=RuntimeError, 381 | ): 382 | rules_manager.add_file_path_rule( 383 | name='', 384 | pattern=r'regex', 385 | ) 386 | 387 | with self.assertRaises( 388 | expected_exception=RuntimeError, 389 | ): 390 | rules_manager.add_file_path_rule( 391 | name='rule_one', 392 | pattern=r'', 393 | ) 394 | 395 | def test_add_file_extension_to_skip_exceptions( 396 | self, 397 | ): 398 | rules_manager = pyrepscan.RulesManager() 399 | 400 | with self.assertRaises( 401 | expected_exception=RuntimeError, 402 | ): 403 | rules_manager.add_file_extension_to_skip( 404 | file_extension='', 405 | ) 406 | 407 | def test_add_file_path_to_skip_exceptions( 408 | self, 409 | ): 410 | rules_manager = pyrepscan.RulesManager() 411 | 412 | with self.assertRaises( 413 | expected_exception=RuntimeError, 414 | ): 415 | rules_manager.add_file_path_to_skip( 416 | file_path='', 417 | ) 418 | 419 | def test_scan_file_one( 420 | self, 421 | ): 422 | rules_manager = pyrepscan.RulesManager() 423 | 424 | self.assertIsNone( 425 | obj=rules_manager.scan_file( 426 | file_path='/path/to/file.txt', 427 | content=None, 428 | ), 429 | ) 430 | 431 | def test_scan_file_two( 432 | self, 433 | ): 434 | 
rules_manager = pyrepscan.RulesManager() 435 | 436 | rules_manager.add_content_rule( 437 | name='rule_one', 438 | pattern=r'(some_text)', 439 | whitelist_patterns=[], 440 | blacklist_patterns=[], 441 | ) 442 | self.assertIsNone( 443 | obj=rules_manager.scan_file( 444 | file_path='/path/to/file.txt', 445 | content=None, 446 | ), 447 | ) 448 | self.assertIsNone( 449 | obj=rules_manager.scan_file( 450 | file_path='/path/to/file.txt', 451 | content='', 452 | ), 453 | ) 454 | self.assertIsNone( 455 | obj=rules_manager.scan_file( 456 | file_path='/path/to/file.txt', 457 | content='other_text', 458 | ), 459 | ) 460 | self.assertEqual( 461 | first=rules_manager.scan_file( 462 | file_path='/path/to/file.txt', 463 | content='some_text', 464 | ), 465 | second=[ 466 | { 467 | 'rule_name': 'rule_one', 468 | 'match_text': 'some_text', 469 | }, 470 | ], 471 | ) 472 | 473 | rules_manager.add_content_rule( 474 | name='rule_two', 475 | pattern=r'(some)', 476 | whitelist_patterns=[], 477 | blacklist_patterns=[], 478 | ) 479 | self.assertEqual( 480 | first=rules_manager.scan_file( 481 | file_path='/path/to/file.txt', 482 | content='some_text', 483 | ), 484 | second=[ 485 | { 486 | 'rule_name': 'rule_one', 487 | 'match_text': 'some_text', 488 | }, 489 | { 490 | 'rule_name': 'rule_two', 491 | 'match_text': 'some', 492 | }, 493 | ], 494 | ) 495 | 496 | def test_scan_file_three( 497 | self, 498 | ): 499 | rules_manager = pyrepscan.RulesManager() 500 | 501 | rules_manager.add_content_rule( 502 | name='rule_one', 503 | pattern=r'(some_.+)', 504 | whitelist_patterns=[], 505 | blacklist_patterns=[ 506 | r'text', 507 | ], 508 | ) 509 | self.assertIsNone( 510 | obj=rules_manager.scan_file( 511 | file_path='/path/to/file.txt', 512 | content='some_text', 513 | ), 514 | ) 515 | self.assertEqual( 516 | first=rules_manager.scan_file( 517 | file_path='/path/to/file.txt', 518 | content='some_other', 519 | ), 520 | second=[ 521 | { 522 | 'rule_name': 'rule_one', 523 | 'match_text': 'some_other', 524 | }, 525 | ], 526 | ) 527 | 528 | def test_scan_file_four( 529 | self, 530 | ): 531 | rules_manager = pyrepscan.RulesManager() 532 | 533 | rules_manager.add_content_rule( 534 | name='rule_one', 535 | pattern=r'(some_.+)', 536 | whitelist_patterns=[], 537 | blacklist_patterns=[ 538 | r'text', 539 | r'other', 540 | ], 541 | ) 542 | self.assertIsNone( 543 | obj=rules_manager.scan_file( 544 | file_path='/path/to/file.txt', 545 | content='some_text', 546 | ), 547 | ) 548 | self.assertIsNone( 549 | obj=rules_manager.scan_file( 550 | file_path='/path/to/file.txt', 551 | content='some_other', 552 | ), 553 | ) 554 | self.assertEqual( 555 | first=rules_manager.scan_file( 556 | file_path='/path/to/file.txt', 557 | content='some_diff', 558 | ), 559 | second=[ 560 | { 561 | 'rule_name': 'rule_one', 562 | 'match_text': 'some_diff', 563 | }, 564 | ], 565 | ) 566 | 567 | def test_scan_file_five( 568 | self, 569 | ): 570 | rules_manager = pyrepscan.RulesManager() 571 | 572 | rules_manager.add_content_rule( 573 | name='rule_one', 574 | pattern=r'(some_.+)', 575 | whitelist_patterns=[ 576 | 'diff', 577 | ], 578 | blacklist_patterns=[], 579 | ) 580 | self.assertIsNone( 581 | obj=rules_manager.scan_file( 582 | file_path='/path/to/file.txt', 583 | content='some_text', 584 | ), 585 | ) 586 | self.assertIsNone( 587 | obj=rules_manager.scan_file( 588 | file_path='/path/to/file.txt', 589 | content='some_other', 590 | ), 591 | ) 592 | self.assertEqual( 593 | first=rules_manager.scan_file( 594 | file_path='/path/to/file.txt', 595 | content='some_diff', 596 | 
), 597 | second=[ 598 | { 599 | 'rule_name': 'rule_one', 600 | 'match_text': 'some_diff', 601 | }, 602 | ], 603 | ) 604 | 605 | def test_scan_file_six( 606 | self, 607 | ): 608 | rules_manager = pyrepscan.RulesManager() 609 | 610 | rules_manager.add_content_rule( 611 | name='rule_one', 612 | pattern=r'(some_.+)', 613 | whitelist_patterns=[ 614 | 'diff', 615 | 'other', 616 | ], 617 | blacklist_patterns=[], 618 | ) 619 | self.assertIsNone( 620 | obj=rules_manager.scan_file( 621 | file_path='/path/to/file.txt', 622 | content='some_text', 623 | ), 624 | ) 625 | self.assertEqual( 626 | first=rules_manager.scan_file( 627 | file_path='/path/to/file.txt', 628 | content='some_other', 629 | ), 630 | second=[ 631 | { 632 | 'rule_name': 'rule_one', 633 | 'match_text': 'some_other', 634 | }, 635 | ], 636 | ) 637 | self.assertEqual( 638 | first=rules_manager.scan_file( 639 | file_path='/path/to/file.txt', 640 | content='some_diff', 641 | ), 642 | second=[ 643 | { 644 | 'rule_name': 'rule_one', 645 | 'match_text': 'some_diff', 646 | }, 647 | ], 648 | ) 649 | 650 | def test_scan_file_seven( 651 | self, 652 | ): 653 | rules_manager = pyrepscan.RulesManager() 654 | 655 | rules_manager.add_file_path_rule( 656 | name='rule_one', 657 | pattern=r'dev\.txt', 658 | ) 659 | self.assertIsNone( 660 | obj=rules_manager.scan_file( 661 | file_path='/path/to/file.txt', 662 | content=None, 663 | ), 664 | ) 665 | self.assertIsNone( 666 | obj=rules_manager.scan_file( 667 | file_path='/path/to/file.txt', 668 | content='', 669 | ), 670 | ) 671 | self.assertIsNone( 672 | obj=rules_manager.scan_file( 673 | file_path='/path/to/file.txt', 674 | content='other_text', 675 | ), 676 | ) 677 | self.assertEqual( 678 | first=rules_manager.scan_file( 679 | file_path='/path/to/dev.txt', 680 | content='', 681 | ), 682 | second=[ 683 | { 684 | 'rule_name': 'rule_one', 685 | 'match_text': '/path/to/dev.txt', 686 | }, 687 | ], 688 | ) 689 | 690 | rules_manager.add_file_path_rule( 691 | name='rule_two', 692 | pattern=r'(\.txt)', 693 | ) 694 | self.assertEqual( 695 | first=rules_manager.scan_file( 696 | file_path='/path/to/dev.txt', 697 | content='some_text', 698 | ), 699 | second=[ 700 | { 701 | 'rule_name': 'rule_one', 702 | 'match_text': '/path/to/dev.txt', 703 | }, 704 | { 705 | 'rule_name': 'rule_two', 706 | 'match_text': '/path/to/dev.txt', 707 | }, 708 | ], 709 | ) 710 | 711 | def test_check_pattern( 712 | self, 713 | ): 714 | rules_manager = pyrepscan.RulesManager() 715 | 716 | with self.assertRaises( 717 | expected_exception=RuntimeError, 718 | ): 719 | rules_manager.check_pattern( 720 | content='', 721 | pattern=r'(', 722 | ) 723 | 724 | with self.assertRaises( 725 | expected_exception=RuntimeError, 726 | ): 727 | rules_manager.check_pattern( 728 | content='', 729 | pattern=r'no_capturing_group', 730 | ) 731 | 732 | with self.assertRaises( 733 | expected_exception=RuntimeError, 734 | ): 735 | rules_manager.check_pattern( 736 | content='', 737 | pattern=r'(?:\:)', 738 | ) 739 | 740 | self.assertEqual( 741 | first=rules_manager.check_pattern( 742 | content='some sentence', 743 | pattern=r'([^ ]+)', 744 | ), 745 | second=[ 746 | 'some', 747 | 'sentence', 748 | ] 749 | ) 750 | --------------------------------------------------------------------------------
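Usage sketch (not part of the repository): the tests above exercise the public pyrepscan API end to end, and the snippet below condenses the calls they make into a minimal example. The rule name, the regex pattern for the content rule, and the '/path/to/repository' path are illustrative placeholders; the method names and arguments follow what the tests invoke.

import pyrepscan

grs = pyrepscan.GitRepositoryScanner()

# A content rule's pattern must contain exactly one capturing group;
# whitelist/blacklist patterns must not contain capturing groups of their own.
grs.add_content_rule(
    name='Generic Secret',          # illustrative rule name
    pattern=r'(key=[0-9a-f]{32})',  # illustrative pattern
    whitelist_patterns=[],
    blacklist_patterns=[],
)

# File path rules match against the path itself; file content is not inspected.
grs.add_file_path_rule(
    name='Key File',
    pattern=r'(prod|dev|stage).+key',
)

# Skip noisy extensions and vendored directories.
grs.add_file_extension_to_skip('pdf')
grs.add_file_path_to_skip('site-packages')

results = grs.scan(
    repository_path='/path/to/repository',  # placeholder path
    branch_glob_pattern='*',
    from_timestamp=0,
)
for result in results:
    # Each match carries rule_name, match_text, file_path, file_oid,
    # commit_id, commit_message, commit_time, author_name and author_email.
    print(result['rule_name'], result['file_path'], result['match_text'])

    # The matched blob can be fetched by its oid for manual review (returns bytes).
    blob = grs.get_file_content(
        repository_path='/path/to/repository',
        file_oid=result['file_oid'],
    )

As the tests in test_rules_manager.py show, RulesManager exposes the same rule-building calls plus scan_file and check_pattern, which are useful for checking a single blob or validating a pattern without walking a repository.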