├── .github
│   └── workflows
│       ├── build.yml
│       └── deploy.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── benchmarks
│   ├── gitleaks.sh
│   ├── gitleaks.toml
│   └── pyrepscan_bench.py
├── cortex.yaml
├── images
│   └── logo.png
├── pyproject.toml
├── pyrepscan
│   ├── __init__.py
│   ├── py.typed
│   └── pyrepscan.pyi
├── src
│   ├── git_repository_scanner.rs
│   ├── lib.rs
│   └── rules_manager.rs
└── tests
    ├── __init__.py
    ├── test_git_repository_scanner.py
    └── test_rules_manager.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 |   - push
4 |   - pull_request
5 | jobs:
6 |   lint:
7 |     if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
8 |     runs-on: ubuntu-latest
9 |     steps:
10 |       - name: Checkout
11 |         uses: actions/checkout@v3
12 |       - name: Install latest rust
13 |         uses: actions-rs/toolchain@v1
14 |         with:
15 |           toolchain: stable
16 |           profile: minimal
17 |           override: true
18 |           components: clippy
19 |       - name: Lint with clippy
20 |         uses: actions-rs/cargo@v1
21 |         with:
22 |           command: clippy
23 |           args: --all-targets --all-features
24 |   test:
25 |     runs-on: ${{ matrix.os }}
26 |     needs: lint
27 |     strategy:
28 |       fail-fast: false
29 |       matrix:
30 |         python-version:
31 |           - '3.7'
32 |           - '3.8'
33 |           - '3.9'
34 |           - '3.10'
35 |           - '3.11'
36 |         os:
37 |           - ubuntu-latest
38 |           - macos-latest
39 |           - windows-latest
40 |     steps:
41 |       - name: Checkout
42 |         uses: actions/checkout@v3
43 |       - name: Set up Python ${{ matrix.python-version }}
44 |         uses: actions/setup-python@v3
45 |         with:
46 |           python-version: ${{ matrix.python-version }}
47 |       - name: Install Poetry
48 |         uses: abatilo/actions-poetry@v2.1.3
49 |       - name: Install Rust
50 |         uses: actions-rs/toolchain@v1
51 |         with:
52 |           profile: minimal
53 |           toolchain: stable
54 |           override: true
55 |       - name: Install dependencies
56 |         run: poetry install
57 |       - name: Build Python package
58 |         run: poetry run maturin develop
59 |       - name: Test
60 |         run: poetry run pytest -Werror tests
61 | 
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 | on:
3 |   release:
4 |     types:
5 |       - released
6 | jobs:
7 |   deploy:
8 |     runs-on: ${{ matrix.os }}
9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         python-version:
13 |           - '3.7'
14 |           - '3.8'
15 |           - '3.9'
16 |           - '3.10'
17 |           - '3.11'
18 |         os:
19 |           - ubuntu-latest
20 |           - macos-latest
21 |           - windows-latest
22 |     steps:
23 |       - name: Checkout
24 |         uses: actions/checkout@v3
25 |       - name: Set up Python ${{ matrix.python-version }}
26 |         uses: actions/setup-python@v3
27 |         with:
28 |           python-version: ${{ matrix.python-version }}
29 |       - name: Install Rust
30 |         uses: actions-rs/toolchain@v1
31 |         with:
32 |           profile: minimal
33 |           toolchain: stable
34 |           override: true
35 |       - name: Install Cross-compilers (macOS)
36 |         if: matrix.os == 'macos-latest'
37 |         run: |
38 |           rustup target add x86_64-apple-darwin
39 |           rustup target add aarch64-apple-darwin
40 |       - name: Publish Package
41 |         uses: messense/maturin-action@v1
42 |         if: matrix.os != 'macos-latest'
43 |         with:
44 |           command: publish
45 |           args: --username=__token__ --no-sdist --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
46 |         env:
47 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
48 |       - name: Publish macOS (x86_64) Package
49 |         if: matrix.os == 'macos-latest'
50 |         uses: PyO3/maturin-action@v1
51 |         with:
52 |           command: publish
53 |           args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist
54 |         env:
55 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
56 |       - name: Publish macOS (arm64) Package
57 |         if: matrix.os == 'macos-latest'
58 |         uses: PyO3/maturin-action@v1
59 |         with:
60 |           command: publish
61 |           args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist
62 |         env:
63 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
64 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,rust,python
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,rust,python
4 |
5 | ### Python ###
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | .vscode/
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | cover/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | .pybuilder/
82 | target/
83 |
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 |
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 |
91 | # pyenv
92 | # For a library or package, you might want to ignore these files since the code is
93 | # intended to run in multiple environments; otherwise, check them in:
94 | # .python-version
95 |
96 | # pipenv
97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
100 | # install all needed dependencies.
101 | #Pipfile.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 |
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 |
117 | # SageMath parsed files
118 | *.sage.py
119 |
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 |
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 |
133 | # Rope project settings
134 | .ropeproject
135 |
136 | # mkdocs documentation
137 | /site
138 |
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 |
144 | # Pyre type checker
145 | .pyre/
146 |
147 | # pytype static type analyzer
148 | .pytype/
149 |
150 | # Cython debug symbols
151 | cython_debug/
152 |
153 | # PyCharm
154 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
155 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
156 | # and can be added to the global gitignore or merged into this file. For a more nuclear
157 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
158 | #.idea/
159 |
160 | ### Rust ###
161 | # Generated by Cargo
162 | # will have compiled files and executables
163 | debug/
164 |
165 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
166 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
167 | Cargo.lock
168 |
169 | # These are backup files generated by rustfmt
170 | **/*.rs.bk
171 |
172 | # MSVC Windows builds of rustc generate these, which store debugging information
173 | *.pdb
174 |
175 | ### VisualStudioCode ###
176 | .vscode/*
177 | !.vscode/settings.json
178 | !.vscode/tasks.json
179 | !.vscode/launch.json
180 | !.vscode/extensions.json
181 | !.vscode/*.code-snippets
182 |
183 | # Local History for Visual Studio Code
184 | .history/
185 |
186 | # Built Visual Studio Code Extensions
187 | *.vsix
188 |
189 | ### VisualStudioCode Patch ###
190 | # Ignore all local history of files
191 | .history
192 | .ionide
193 |
194 | # Support for Project snippet scope
195 |
196 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,rust,python
197 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "pyrepscan"
3 | version = "0.12.0"
4 | authors = ["Gal Ben David <gal@intsights.com>"]
5 | edition = "2021"
6 | description = "A Git Repository Secrets Scanner written in Rust"
7 | readme = "README.md"
8 | repository = "https://github.com/intsights/pyrepscan"
9 | homepage = "https://github.com/intsights/pyrepscan"
10 | license = "MIT"
11 | keywords = [
12 | "git",
13 | "secrets",
14 | "scanner",
15 | "rust",
16 | "pyo3",
17 | ]
18 |
19 | [lib]
20 | name = "pyrepscan"
21 | crate-type = ["cdylib"]
22 |
23 | [dependencies]
24 | aho-corasick = "0.7.18"
25 | chrono = "0.4.19"
26 | crossbeam = "0.8.1"
27 | crossbeam-utils = "0.8.10"
28 | parking_lot = "0.12.1"
29 | regex = "1.6.0"
30 |
31 | [dependencies.libgit2-sys]
32 | version = "0.13.4"
33 | features = ["https"]
34 |
35 | [dependencies.git2]
36 | version = "0.14.4"
37 | features = ["vendored-openssl"]
38 |
39 | [dependencies.pyo3]
40 | version = "0.16.5"
41 | features = ["extension-module"]
42 |
43 | [profile.release]
44 | lto = true
45 | panic = "abort"
46 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Gal Ben David
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyRepScan
2 | 
3 | 
4 | 
5 | 
6 | A Git Repository Secrets Scanner written in Rust
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 
13 | [PyPI](https://pypi.org/project/PyRepScan/)
14 | 
15 | ## Table of Contents
16 |
17 | - [Table of Contents](#table-of-contents)
18 | - [About The Project](#about-the-project)
19 | - [Built With](#built-with)
20 | - [Performance](#performance)
21 | - [CPU](#cpu)
22 | - [Installation](#installation)
23 | - [Documentation](#documentation)
24 | - [Usage](#usage)
25 | - [License](#license)
26 | - [Contact](#contact)
27 |
28 |
29 | ## About The Project
30 |
31 | PyRepScan is a Python library written in Rust. The library uses [git2-rs](https://github.com/rust-lang/git2-rs) for repository parsing and traversing, [regex](https://github.com/rust-lang/regex) for regex pattern matching and [crossbeam](https://github.com/crossbeam-rs/crossbeam) for concurrency. The library was written in Rust to achieve high performance while exposing Python bindings.
32 |
33 |
34 | ### Built With
35 |
36 | * [git2-rs](https://github.com/rust-lang/git2-rs)
37 | * [regex](https://github.com/rust-lang/regex)
38 | * [crossbeam](https://github.com/crossbeam-rs/crossbeam)
39 | * [parking-lot](https://github.com/Amanieu/parking_lot)
40 |
41 |
42 | ### Performance
43 |
44 | #### CPU
45 | | Library | Time | Peak Memory |
46 | | ------------- | ------------- | ------------- |
47 | | [PyRepScan](https://github.com/intsights/PyRepScan) | 8.74s | 1,149,152 kb |
48 | | [gitleaks](https://github.com/zricethezav/gitleaks) | 1118s | 1,146,300 kb |
49 |
50 |
51 | ### Installation
52 |
53 | ```sh
54 | pip3 install PyRepScan
55 | ```
56 |
57 |
58 | ## Documentation
59 |
60 | ```python
61 | class GitRepositoryScanner:
62 |     def __init__(
63 |         self,
64 |     ) -> None
65 | ```
66 | This class holds all the added rules for fast reuse.
67 |
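Since the rules live on the scanner object, the same instance can be reused against multiple repositories, for example (a minimal sketch; the repository paths are placeholders):

```python
import pyrepscan

grs = pyrepscan.GitRepositoryScanner()
# rules are added once (see add_content_rule / add_file_path_rule below)
# and the scanner can then be reused:
results_one = grs.scan(repository_path='/first/repository/path', branch_glob_pattern='*')
results_two = grs.scan(repository_path='/second/repository/path', branch_glob_pattern='*')
```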
68 |
69 | ```python
70 | def add_content_rule(
71 |     self,
72 |     name: str,
73 |     pattern: str,
74 |     whitelist_patterns: typing.List[str],
75 |     blacklist_patterns: typing.List[str],
76 | ) -> None
77 | ```
78 | The `add_content_rule` function adds a new rule to an internal list of rules that can be reused multiple times against different repositories. The same name can be used by more than one rule, in which case different results may hold the same name. A content rule means that the regex pattern is tested against the content of the committed files.
79 | - `name` - The name of the rule so it can be identified.
80 | - `pattern` - The regex pattern (Rust Regex syntax) to match against the content of the committed files.
81 | - `whitelist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter results in. At least one of the patterns has to match for the result to be kept (an OR relation between the patterns).
82 | - `blacklist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter results out. If at least one of the patterns matches, the result is omitted (an OR relation between the patterns).
83 |
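For example, a rule can use blacklist patterns to drop obvious placeholder values. A short sketch based on the rule shown in the library's inline documentation (the rule name and patterns are illustrative):

```python
grs.add_content_rule(
    name='Password Assignment',
    pattern=r'password=([\d\w]+)',          # exactly one capturing group is required
    whitelist_patterns=[],
    blacklist_patterns=[
        r'(?:test|example|xxx|empty)',       # non-capturing: whitelist/blacklist patterns must not capture
    ],
)
```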
84 |
85 | ```python
86 | def add_file_path_rule(
87 |     self,
88 |     name: str,
89 |     pattern: str,
90 | ) -> None
91 | ```
92 | The `add_file_path_rule` function adds a new rule to an internal list of rules that can be reused multiple times against different repositories. The same name can be used by more than one rule, in which case different results may hold the same name. A file path rule means that the regex pattern is tested against the file paths.
93 | - `name` - The name of the rule so it can be identified.
94 | - `pattern` - The regex pattern (Rust Regex syntax) to match against the file paths of the committed files.
95 |
96 |
97 | ```python
98 | def add_file_extension_to_skip(
99 |     self,
100 |     file_extension: str,
101 | ) -> None
102 | ```
103 | The `add_file_extension_to_skip` function adds a new file extension to the filtering phase to reduce the number of inspected files and to increase the performance of the scan.
104 | - `file_extension` - A file extension, without a leading dot, to filter out from the scan.
105 |
106 |
107 | ```python
108 | def add_file_path_to_skip(
109 |     self,
110 |     file_path: str,
111 | ) -> None
112 | ```
113 | The `add_file_path_to_skip` function adds a new file path pattern to the filtering phase to reduce the number of inspected files and to increase the performance of the scan. Every file path that includes the `file_path` substring is left out of the scanned files.
114 | - `file_path` - If the inspected file path includes this substring, it won't be scanned. This parameter is free text.
115 |
116 |
117 | ```python
118 | def scan(
119 |     self,
120 |     repository_path: str,
121 |     branch_glob_pattern: typing.Optional[str],
122 |     from_timestamp: typing.Optional[int],
123 | ) -> typing.List[typing.Dict[str, str]]
124 | ```
125 | The `scan` function is the main function in the library. Calling this function triggers a new scan that returns a list of matches. The scan is a multithreaded operation that utilizes all the available cores in the system. The results do not include the file content, only the regex matching group. To retrieve the full file content, take the `file_oid` value from a result and call the `get_file_content` function.
126 | - `repository_path` - The git repository folder path.
127 | - `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`.
128 | - `from_timestamp` - A UTC timestamp (int); only commits created after this timestamp are included in the scan. If None is sent, defaults to `0`.
129 |
130 | A sample result would look like this:
131 | ```python
132 | {
133 |     'rule_name': 'First Rule',
134 |     'author_email': 'author@email.email',
135 |     'author_name': 'Author Name',
136 |     'commit_id': '1111111111111111111111111111111111111111',
137 |     'commit_message': 'The commit message',
138 |     'commit_time': '2020-01-01T00:00:00',
139 |     'file_path': 'full/file/path',
140 |     'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a',
141 |     'match_text': 'The matched group',
142 | }
143 | ```
144 |
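To limit a scan to recent history, `from_timestamp` can be derived from a regular UTC timestamp. A minimal sketch (the 30-day window and repository path are only examples):

```python
import datetime

import pyrepscan

grs = pyrepscan.GitRepositoryScanner()
# rules should be added here, as shown above

thirty_days_ago = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)
results = grs.scan(
    repository_path='/repository/path',
    branch_glob_pattern='*',
    from_timestamp=int(thirty_days_ago.timestamp()),
)
```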
145 |
146 | ```python
147 | def scan_from_url(
148 |     self,
149 |     url: str,
150 |     repository_path: str,
151 |     branch_glob_pattern: typing.Optional[str],
152 |     from_timestamp: typing.Optional[int],
153 | ) -> typing.List[typing.Dict[str, str]]
154 | ```
155 | The same as the `scan` function, but first clones the repository from the given URL into the provided repository path.
156 | - `url` - URL of a git repository.
157 | - `repository_path` - The path to clone the repository to.
158 | - `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`.
159 | - `from_timestamp` - A UTC timestamp (int); only commits created after this timestamp are included in the scan. If None is sent, defaults to `0`.
160 |
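A short sketch (the destination path is illustrative; the URL is the one used in the library's inline documentation):

```python
import pyrepscan

grs = pyrepscan.GitRepositoryScanner()
results = grs.scan_from_url(
    url='https://github.com/rust-lang/git2-rs',
    repository_path='/tmp/cloned_repository',
    branch_glob_pattern='*',
    from_timestamp=0,
)
```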
161 |
162 | ```python
163 | def get_file_content(
164 |     self,
165 |     repository_path: str,
166 |     file_oid: str,
167 | ) -> bytes
168 | ```
169 | The `get_file_content` function exists to retrieve the content of a file that was previously matched. The full file content is omitted from the results to reduce the results list size and to deliver better performance.
170 | - `repository_path` - The git repository folder path.
171 | - `file_oid` - A string representing the file oid. This parameter exists in the results dictionary returned by the `scan` function.
172 |
173 |
174 | ## Usage
175 |
176 | ```python
177 | import pyrepscan
178 |
179 | grs = pyrepscan.GitRepositoryScanner()
180 |
181 | # Add a specific rule, can be called multiple times or not at all
182 | grs.add_content_rule(
183 |     name='First Rule',
184 |     pattern=r'(-----BEGIN PRIVATE KEY-----)',
185 |     whitelist_patterns=[],
186 |     blacklist_patterns=[],
187 | )
188 | grs.add_file_path_rule(
189 |     name='Second Rule',
190 |     pattern=r'.+\.pem',
191 | )
192 | grs.add_file_path_rule(
193 |     name='Third Rule',
194 |     pattern=r'(prod|dev|stage).+key',
195 | )
196 | 
197 | # Add file extensions to ignore during the search
198 | grs.add_file_extension_to_skip(
199 |     file_extension='bin',
200 | )
201 | grs.add_file_extension_to_skip(
202 |     file_extension='jpg',
203 | )
204 | 
205 | # Add file paths to ignore during the search. Free text is allowed
206 | grs.add_file_path_to_skip(
207 |     file_path='site-packages',
208 | )
209 | grs.add_file_path_to_skip(
210 |     file_path='node_modules',
211 | )
212 | 
213 | # Scan a repository
214 | results = grs.scan(
215 |     repository_path='/repository/path',
216 |     branch_glob_pattern='*',
217 | )
218 |
219 | # Results is a list of dicts. Each dict is in the following format:
220 | {
221 |     'rule_name': 'First Rule',
222 |     'author_email': 'author@email.email',
223 |     'author_name': 'Author Name',
224 |     'commit_id': '1111111111111111111111111111111111111111',
225 |     'commit_message': 'The commit message',
226 |     'commit_time': '2020-01-01T00:00:00',
227 |     'file_path': 'full/file/path',
228 |     'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a',
229 |     'match_text': 'The matched group',
230 | }
231 |
232 | # Fetch the file_oid full content
233 | file_content = grs.get_file_content(
234 |     repository_path='/repository/path',
235 |     file_oid='47d2739ba2c34690248c8f91b84bb54e8936899a',
236 | )
237 |
238 | # file_content
239 | b'binary data'
240 |
241 | # Creating a RulesManager directly
242 | rules_manager = pyrepscan.RulesManager()
243 |
244 | # For testing purposes, check your regex patterns using the check_pattern function
245 | rules_manager.check_pattern(
246 |     content='some content1 to check, another content2 in the same line\nanother content3 in another line\n',
247 |     pattern=r'(content\d)',
248 | )
249 | 
250 | # Results are the list of captured matches
251 | [
252 |     'content1',
253 |     'content2',
254 |     'content3',
255 | ]
256 | ```
257 |
258 |
259 | ## License
260 |
261 | Distributed under the MIT License. See `LICENSE` for more information.
262 |
263 |
264 | ## Contact
265 |
266 | Gal Ben David - gal@intsights.com
267 |
268 | Project Link: [https://github.com/intsights/PyRepScan](https://github.com/intsights/PyRepScan)
269 |
--------------------------------------------------------------------------------
/benchmarks/gitleaks.sh:
--------------------------------------------------------------------------------
1 | docker pull zricethezav/gitleaks:latest
2 | docker run -v ${FOLDER_TO_SCAN}:/path -v ${PWD}/benchmarks/gitleaks.toml:/gitleaks.toml zricethezav/gitleaks:latest detect --source="/path" --config=/gitleaks.toml
3 |
--------------------------------------------------------------------------------
/benchmarks/gitleaks.toml:
--------------------------------------------------------------------------------
1 | [[rules]]
2 | description = "AWS Manager ID"
3 | regex = '''(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}'''
4 | tags = ["key", "AWS"]
5 |
--------------------------------------------------------------------------------
/benchmarks/pyrepscan_bench.py:
--------------------------------------------------------------------------------
1 | import pyrepscan
2 |
3 |
4 | grs = pyrepscan.GitRepositoryScanner()
5 | grs.add_content_rule(
6 |     name='AWS Manager ID',
7 |     pattern=r'(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}',
8 |     whitelist_patterns=[],
9 |     blacklist_patterns=[],
10 | )
11 | results = grs.scan(
12 |     repository_path='/path/to/repository',
13 |     branch_glob_pattern='*',
14 | )
15 | print(len(results))
16 |
--------------------------------------------------------------------------------
/cortex.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | info:
3 |   title: Pyrepscan
4 |   description: A Git Repository Secrets Scanner written in Rust
5 |   x-cortex-git:
6 |     github:
7 |       alias: intsightsorg
8 |       repository: Intsights/PyRepScan
9 |   x-cortex-tag: pyrepscan
10 |   x-cortex-type: service
11 |   x-cortex-domain-parents:
12 |     - tag: threatintel-brand-security
13 |   x-cortex-groups:
14 |     - exposure:external-ship
15 |     - target:library
16 | openapi: 3.0.1
17 | servers:
18 |   - url: "/"
19 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/images/logo.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pyrepscan"
3 | version = "0.12.0"
4 | description = "A Git Repository Secrets Scanner written in Rust"
5 | authors = [
6 | {email = "gal@intsights.com"},
7 | {name = "Gal Ben David"}
8 | ]
9 | requires-python = ">=3.7"
10 | license = {file = "LICENSE"}
11 | classifiers = [
12 | "License :: OSI Approved :: MIT License",
13 | "Operating System :: MacOS",
14 | "Operating System :: Microsoft",
15 | "Operating System :: POSIX :: Linux",
16 | "Programming Language :: Python :: 3.7",
17 | "Programming Language :: Python :: 3.8",
18 | "Programming Language :: Python :: 3.9",
19 | "Programming Language :: Python :: 3.10",
20 | "Programming Language :: Python :: 3.11",
21 | "Programming Language :: Rust",
22 | ]
23 |
24 | [build-system]
25 | requires = ["maturin>=0.12,<0.13"]
26 | build-backend = "maturin"
27 |
28 | [tool.maturin]
29 | sdist-include = [
30 | "Cargo.toml",
31 | "pyproject.toml",
32 | "pyrepscan/*.py",
33 | "pyrepscan/*.pyi",
34 | "src/*",
35 | ]
36 |
37 | [tool.poetry]
38 | name = "pyrepscan"
39 | version = "0.11.0"
40 | authors = ["Gal Ben David <gal@intsights.com>"]
41 | description = "A Git Repository Secrets Scanner written in Rust"
42 | readme = "README.md"
43 | repository = "https://github.com/intsights/pyrepscan"
44 | homepage = "https://github.com/intsights/pyrepscan"
45 | license = "MIT"
46 | keywords = [
47 | "git",
48 | "secrets",
49 | "scanner",
50 | "rust",
51 | "pyo3"
52 | ]
53 | classifiers = [
54 | "License :: OSI Approved :: MIT License",
55 | "Operating System :: MacOS",
56 | "Operating System :: Microsoft",
57 | "Operating System :: POSIX :: Linux",
58 | "Programming Language :: Python :: 3.7",
59 | "Programming Language :: Python :: 3.8",
60 | "Programming Language :: Python :: 3.9",
61 | "Programming Language :: Python :: 3.10",
62 | "Programming Language :: Python :: 3.11",
63 | "Programming Language :: Rust",
64 | ]
65 |
66 | [tool.poetry.dependencies]
67 | python = "^3.7"
68 |
69 | [tool.poetry.dev-dependencies]
70 | pytest = "*"
71 | gitpython = { git = "https://github.com/gitpython-developers/GitPython" }
72 | wheel = "*"
73 | pytest-runner = "*"
74 | maturin = "*"
75 |
76 | [tool.pytest.ini_options]
77 | minversion = "6.0"
78 | addopts = [
79 | "--tb=native",
80 | "--pythonwarnings=all",
81 | ]
82 | testpaths = [
83 | "tests",
84 | ]
85 |
--------------------------------------------------------------------------------
/pyrepscan/__init__.py:
--------------------------------------------------------------------------------
1 | from . import pyrepscan
2 |
3 |
4 | GitRepositoryScanner = pyrepscan.GitRepositoryScanner
5 | RulesManager = pyrepscan.RulesManager
6 |
--------------------------------------------------------------------------------
/pyrepscan/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/pyrepscan/py.typed
--------------------------------------------------------------------------------
/pyrepscan/pyrepscan.pyi:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 |
4 | class GitRepositoryScanner:
5 |     def __init__(
6 |         self,
7 |     ) -> None: ...
8 | 
9 |     def add_content_rule(
10 |         self,
11 |         name: str,
12 |         pattern: str,
13 |         whitelist_patterns: typing.List[str],
14 |         blacklist_patterns: typing.List[str],
15 |     ) -> None: ...
16 | 
17 |     def add_file_path_rule(
18 |         self,
19 |         name: str,
20 |         pattern: str,
21 |     ) -> None: ...
22 | 
23 |     def add_file_extension_to_skip(
24 |         self,
25 |         file_extension: str,
26 |     ) -> None: ...
27 | 
28 |     def add_file_path_to_skip(
29 |         self,
30 |         file_path: str,
31 |     ) -> None: ...
32 | 
33 |     def scan(
34 |         self,
35 |         repository_path: str,
36 |         branch_glob_pattern: typing.Optional[str],
37 |         from_timestamp: typing.Optional[int],
38 |     ) -> typing.List[typing.Dict[str, str]]: ...
39 | 
40 |     def scan_from_url(
41 |         self,
42 |         url: str,
43 |         repository_path: str,
44 |         branch_glob_pattern: typing.Optional[str],
45 |         from_timestamp: typing.Optional[int],
46 |     ) -> typing.List[typing.Dict[str, str]]: ...
47 | 
48 |     def get_file_content(
49 |         self,
50 |         repository_path: str,
51 |         file_oid: str,
52 |     ) -> bytes: ...
53 | 
54 | 
55 | class RulesManager:
56 |     def __init__(
57 |         self,
58 |     ) -> None: ...
59 | 
60 |     def add_content_rule(
61 |         self,
62 |         name: str,
63 |         pattern: str,
64 |         whitelist_patterns: typing.List[str],
65 |         blacklist_patterns: typing.List[str],
66 |     ) -> None: ...
67 | 
68 |     def add_file_path_rule(
69 |         self,
70 |         name: str,
71 |         pattern: str,
72 |     ) -> None: ...
73 | 
74 |     def add_file_extension_to_skip(
75 |         self,
76 |         file_extension: str,
77 |     ) -> None: ...
78 | 
79 |     def add_file_path_to_skip(
80 |         self,
81 |         file_path: str,
82 |     ) -> None: ...
83 | 
84 |     def should_scan_file_path(
85 |         self,
86 |         file_path: str,
87 |     ) -> bool: ...
88 | 
89 |     def scan_file(
90 |         self,
91 |         file_path: str,
92 |         content: typing.Optional[str],
93 |     ) -> typing.Optional[typing.List[typing.Dict[str, str]]]: ...
94 | 
95 |     def check_pattern(
96 |         self,
97 |         content: str,
98 |         pattern: str,
99 |     ) -> typing.List[str]: ...
100 |
--------------------------------------------------------------------------------
/src/git_repository_scanner.rs:
--------------------------------------------------------------------------------
1 | use crate::rules_manager;
2 |
3 | use chrono::prelude::*;
4 | use crossbeam_utils::atomic::AtomicCell;
5 | use crossbeam_utils::thread as crossbeam_thread;
6 | use crossbeam::queue::ArrayQueue;
7 | use git2::{Oid, Repository, Delta};
8 | use parking_lot::Mutex;
9 | use pyo3::exceptions::PyRuntimeError;
10 | use pyo3::prelude::*;
11 | use std::collections::HashMap;
12 | use std::path::Path;
13 | use std::sync::Arc;
14 | use std::thread;
15 | use std::time;
16 |
17 | fn scan_commit_oid(
18 | should_stop: &AtomicCell<bool>,
19 | git_repo: &Repository,
20 | oid: &Oid,
21 | rules_manager: &rules_manager::RulesManager,
22 | output_matches: Arc<Mutex<Vec<HashMap<&str, String>>>>,
23 | ) -> Result<(), git2::Error> {
24 | let commit = git_repo.find_commit(*oid)?;
25 |
26 | let commit_parent_count = commit.parent_count();
27 | if commit_parent_count > 1 {
28 | return Ok(());
29 | }
30 |
31 | let commit_tree = commit.tree()?;
32 |
33 | let commit_diff = if commit_parent_count == 0 {
34 | git_repo.diff_tree_to_tree(None, Some(&commit_tree), None)?
35 | } else {
36 | let parent_commit = commit.parent(0)?;
37 | let parent_commit_tree = parent_commit.tree()?;
38 |
39 | git_repo.diff_tree_to_tree(Some(&parent_commit_tree), Some(&commit_tree), None)?
40 | };
41 |
42 | for delta in commit_diff.deltas() {
43 | if should_stop.load() {
44 | break;
45 | }
46 |
47 | match delta.status() {
48 | Delta::Added | Delta::Modified => {},
49 | _ => continue,
50 | }
51 |
52 | let new_file = delta.new_file();
53 |
54 | let delta_new_file_path = match new_file.path() {
55 | Some(path) => path.to_string_lossy().to_string(),
56 | None => continue,
57 | };
58 | if !rules_manager.should_scan_file_path(&delta_new_file_path.to_ascii_lowercase()) {
59 | continue;
60 | }
61 |
62 | let delta_new_file_blob = match git_repo.find_blob(new_file.id()) {
63 | Ok(blob) => blob,
64 | Err(_) => continue,
65 | };
66 |
67 | if delta_new_file_blob.size() < 2 {
68 | continue;
69 | }
70 |
71 | let delta_new_file_content = if delta_new_file_blob.is_binary() || delta_new_file_blob.size() > 5000000 {
72 | None
73 | } else {
74 | match std::str::from_utf8(delta_new_file_blob.content()) {
75 | Ok(content) => Some(content),
76 | Err(_) => None,
77 | }
78 | };
79 |
80 | let scan_matches = rules_manager.scan_file(&delta_new_file_path, delta_new_file_content);
81 | if let Some(scan_matches) = scan_matches {
82 | for scan_match in scan_matches.iter() {
83 | let mut match_hashmap = HashMap::with_capacity(9);
84 | match_hashmap.insert(
85 | "commit_id",
86 | commit.id().to_string(),
87 | );
88 | match_hashmap.insert(
89 | "commit_message",
90 | commit.message().unwrap_or("").to_string(),
91 | );
92 | match_hashmap.insert(
93 | "commit_time",
94 | Utc.timestamp(commit.time().seconds(), 0).format("%Y-%m-%dT%H:%M:%S").to_string(),
95 | );
96 | match_hashmap.insert(
97 | "author_name",
98 | commit.author().name().unwrap_or("").to_string(),
99 | );
100 | match_hashmap.insert(
101 | "author_email",
102 | commit.author().email().unwrap_or("").to_string(),
103 | );
104 | match_hashmap.insert(
105 | "file_path",
106 | new_file.path().unwrap_or_else(|| Path::new("")).to_string_lossy().to_string(),
107 | );
108 | match_hashmap.insert(
109 | "file_oid",
110 | new_file.id().to_string(),
111 | );
112 | match_hashmap.insert(
113 | "rule_name",
114 | scan_match.get("rule_name").unwrap_or(&String::from("")).to_string(),
115 | );
116 | match_hashmap.insert(
117 | "match_text",
118 | scan_match.get("match_text").unwrap_or(&String::from("")).to_string(),
119 | );
120 | output_matches.lock().push(match_hashmap);
121 | }
122 | }
123 | }
124 |
125 | Ok(())
126 | }
127 |
128 | fn get_commit_oids(
129 | repository_path: &str,
130 | branch_glob_pattern: &str,
131 | from_timestamp: i64,
132 | ) -> Result<Vec<Oid>, git2::Error> {
133 | let git_repo = Repository::open(repository_path)?;
134 |
135 | let mut revwalk = git_repo.revwalk()?;
136 | revwalk.push_head()?;
137 | revwalk.set_sorting(git2::Sort::TIME)?;
138 | revwalk.push_glob(branch_glob_pattern)?;
139 |
140 | let mut oids = Vec::new();
141 | for oid in revwalk.flatten() {
142 | if let Ok(commit) = git_repo.find_commit(oid) {
143 | if commit.time().seconds() >= from_timestamp {
144 | oids.push(oid);
145 | }
146 | }
147 | }
148 |
149 | Ok(oids)
150 | }
151 |
152 | pub fn scan_repository(
153 | py: &Python,
154 | repository_path: &str,
155 | branch_glob_pattern: &str,
156 | from_timestamp: i64,
157 | rules_manager: &rules_manager::RulesManager,
158 | output_matches: Arc<Mutex<Vec<HashMap<&str, String>>>>,
159 | ) -> PyResult<()> {
160 | let commit_oids_queue;
161 |
162 | match get_commit_oids(
163 | repository_path,
164 | branch_glob_pattern,
165 | from_timestamp
166 | ) {
167 | Ok(commit_oids) => {
168 | if commit_oids.is_empty() {
169 | return Ok(());
170 | }
171 |
172 | commit_oids_queue = ArrayQueue::new(commit_oids.len());
173 | for commit_oid in commit_oids {
174 | commit_oids_queue.push(commit_oid).unwrap();
175 | }
176 | },
177 | Err(error) => {
178 | return Err(PyRuntimeError::new_err(error.to_string()))
179 | },
180 | }
181 |
182 | let mut py_signal_error: PyResult<()> = Ok(());
183 |
184 | let should_stop = AtomicCell::new(false);
185 | let number_of_cores = std::thread::available_parallelism().unwrap().get();
186 |
187 | crossbeam_thread::scope(
188 | |scope| {
189 | for _ in 0..number_of_cores {
190 | scope.spawn(
191 | |_| {
192 | if let Ok(git_repo) = Repository::open(repository_path) {
193 | while !should_stop.load() {
194 | if let Some(commit_oid) = commit_oids_queue.pop() {
195 | scan_commit_oid(
196 | &should_stop,
197 | &git_repo,
198 | &commit_oid,
199 | rules_manager,
200 | output_matches.clone(),
201 | ).unwrap_or(());
202 | } else {
203 | break;
204 | }
205 | }
206 | };
207 | }
208 | );
209 | }
210 |
211 | while !commit_oids_queue.is_empty() {
212 | py_signal_error = py.check_signals();
213 | if py_signal_error.is_err() {
214 | should_stop.store(true);
215 |
216 | break;
217 | }
218 |
219 | thread::sleep(time::Duration::from_millis(100));
220 | }
221 | }
222 | ).unwrap_or_default();
223 |
224 | py_signal_error?;
225 |
226 | Ok(())
227 | }
228 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | mod git_repository_scanner;
2 | mod rules_manager;
3 |
4 | use git2::{Oid, Repository};
5 | use parking_lot::Mutex;
6 | use pyo3::exceptions;
7 | use pyo3::prelude::*;
8 | use pyo3::types::PyBytes;
9 | use std::collections::HashMap;
10 | use std::path::Path;
11 | use std::sync::Arc;
12 |
13 | /// GitRepositoryScanner class
14 | /// A git repository scanner object
15 | ///
16 | /// input:
17 | /// None
18 | ///
19 | /// example:
20 | /// grs = pyrepscan.GitRepositoryScanner()
21 | #[pyclass]
22 | #[derive(Default)]
23 | struct GitRepositoryScanner {
24 | rules_manager: rules_manager::RulesManager,
25 | }
26 |
27 | #[pymethods]
28 | impl GitRepositoryScanner {
29 | #[new]
30 | fn new() -> Self {
31 | Self::default()
32 | }
33 |
34 | /// Adding a new content rule. A content rule is a rule that will be applied to the content of
35 | /// the commit changes. For every commit, each file will be scanned and its content will be scanned
36 | /// with the content rules.
37 | ///
38 | /// input:
39 | /// name: str -> The name of the rule. This will help to identify which rule has been matched.
40 | /// pattern: str -> The regex pattern. The pattern should be in Rust regex syntax.
41 | /// whitelist_patterns: list[str] -> A list of regex patterns. If this list is empty nothing happens.
42 | /// If the list contains one or more regex patterns, each regex pattern will be applied to the
43 | /// matched content. At least one of the patterns has to match to approve the secret.
44 | /// blacklist_patterns: list[str] -> A list of regex patterns. If this list is empty nothing happens.
45 | /// If the list contains one or more regex patterns, each regex pattern will be applied to the
46 | /// matched content. At least one of the patterns has to match to reject the secret.
47 | ///
48 | /// returns:
49 | /// None
50 | ///
51 | /// example:
52 | /// grs.add_content_rule(
53 | /// name="Rule #1",
54 | /// pattern=r"password=([\d\w]+)",
55 | /// whitelist_patterns=[],
56 | /// blacklist_patterns=[
57 | /// "(?:test|example|xxx|empty)",
58 | /// ],
59 | /// )
60 | fn add_content_rule(
61 | &mut self,
62 | name: String,
63 | pattern: String,
64 | whitelist_patterns: Vec<String>,
65 | blacklist_patterns: Vec<String>,
66 | ) -> PyResult<()> {
67 | self.rules_manager.add_content_rule(
68 | name,
69 | pattern,
70 | whitelist_patterns,
71 | blacklist_patterns,
72 | )
73 | }
74 |
75 | /// Adding a new file path rule. A file path rule is a rule that will be applied to the file path of
76 | /// the commit changes. For every commit, each file will be scanned.
77 | ///
78 | /// input:
79 | /// name: str -> The name of the rule. This will help to identify which rule has been matched.
80 | /// pattern: str -> The regex pattern. The pattern should be in Rust regex syntax.
81 | ///
82 | /// returns:
83 | /// None
84 | ///
85 | /// example:
86 | /// grs.add_file_path_rule(
87 | /// name="Rule #2",
88 | /// pattern=r".*\.(?:pem|cer)",
89 | /// )
90 | fn add_file_path_rule(
91 | &mut self,
92 | name: String,
93 | pattern: String
94 | ) -> PyResult<()> {
95 | self.rules_manager.add_file_path_rule(
96 | name,
97 | pattern,
98 | )
99 | }
100 |
101 | /// Adding a file extension to ignore during the scan.
102 | /// Every file with this extension would not be scanned.
103 | ///
104 | /// input:
105 | /// file_extension: str -> A file extension string. During a scan, the file path will be matched
106 | /// using an ends_with function meaning that it can be partial extension, with a dot, or without
107 | ///
108 | /// returns:
109 | /// None
110 | ///
111 | /// example:
112 | /// grs.add_file_extension_to_skip(
113 | /// file_extension="rar",
114 | /// )
115 | /// grs.add_file_extension_to_skip(
116 | /// file_extension="tar.gz",
117 | /// )
118 | fn add_file_extension_to_skip(
119 | &mut self,
120 | file_extension: String,
121 | ) -> PyResult<()> {
122 | self.rules_manager.add_file_extension_to_skip(file_extension)
123 | }
124 |
125 | /// Adding a file path pattern to skip during the scan. The pattern should be in a free text format.
126 | ///
127 | /// input:
128 | /// file_path: str -> A free text pattern to skip during the scan. If the scanned file path would contain
129 | /// this pattern, the scan will skip the file.
130 | ///
131 | /// returns:
132 | /// None
133 | ///
134 | /// example:
135 | /// grs.add_file_path_to_skip(
136 | /// file_path="test",
137 | /// )
138 | /// grs.add_file_path_to_skip(
139 | /// file_path="example",
140 | /// )
141 | fn add_file_path_to_skip(
142 | &mut self,
143 | file_path: String,
144 | ) -> PyResult<()> {
145 | self.rules_manager.add_file_path_to_skip(file_path)
146 | }
147 |
148 | /// Retrieves a file content using its ObjectID.
149 | ///
150 | /// input:
151 | /// repository_path: str -> Absolute path of the git repository directory.
152 | /// file_oid: str -> The file OID in a string representation
153 | ///
154 | /// returns:
155 | /// bytes -> The file content in a binary representation
156 | ///
157 | /// example:
158 | /// grs.get_file_content(
159 | /// repository_path="/path/to/repository",
160 | /// file_oid="6b584e8ece562ebffc15d38808cd6b98fc3d97ea",
161 | /// )
162 | fn get_file_content<'py>(
163 | &mut self,
164 | py: Python<'py>,
165 | repository_path: String,
166 | file_oid: String,
167 | ) -> PyResult<&'py PyBytes> {
168 | let git_repo = Repository::open(repository_path).map_err(
169 | |error| exceptions::PyRuntimeError::new_err(error.to_string())
170 | )?;
171 |
172 | let oid = Oid::from_str(&file_oid).map_err(
173 | |error| exceptions::PyRuntimeError::new_err(error.to_string())
174 | )?;
175 |
176 | let blob = git_repo.find_blob(oid).map_err(
177 | |error| exceptions::PyRuntimeError::new_err(error.to_string())
178 | )?;
179 |
180 | let content = PyBytes::new(py, blob.content());
181 |
182 | Ok(content)
183 | }
184 |
185 | /// Scan a git repository for secrets. Rules should be loaded before calling this function.
186 | ///
187 | /// input:
188 | /// repository_path: str -> Absolute path of the git repository directory.
189 | /// branch_glob_pattern: str -> A glob pattern to match against the git branch names.
190 | /// Only matched branches will be scanned.
191 | /// from_timestamp: int = 0 -> Unix epoch timestamp to start the scan from.
192 | ///
193 | /// returns:
194 | /// list[dict] -> List of matches
195 | ///
196 | /// example:
197 | /// grs.scan(
198 | /// repository_path="/path/to/repository",
199 | /// branch_glob_pattern="*",
200 | /// )
201 | fn scan(
202 | &self,
203 | py: Python,
204 | repository_path: &str,
205 | branch_glob_pattern: Option<&str>,
206 | from_timestamp: Option<i64>,
207 | ) -> PyResult<PyObject> {
208 | let matches = Arc::new(Mutex::new(Vec::<HashMap<&str, String>>::with_capacity(10000)));
209 | match git_repository_scanner::scan_repository(
210 | &py,
211 | repository_path,
212 | branch_glob_pattern.unwrap_or("*"),
213 | from_timestamp.unwrap_or(0),
214 | &self.rules_manager,
215 | matches.clone(),
216 | ) {
217 | Ok(_) => Ok(matches.lock().to_object(py)),
218 | Err(error) => Err(error),
219 | }
220 | }
221 |
222 | /// Scan a git repository for secrets. Rules should be loaded before calling this function.
223 | ///
224 | /// input:
225 | /// url: str -> URL of a git repository
226 | /// repository_path: str -> The path to clone the repository to
227 | /// branch_glob_pattern: str -> A glob pattern to match against the git branch names.
228 | /// Only matched branches will be scanned.
229 | /// from_timestamp: int = 0 -> Unix epoch timestamp to start the scan from.
230 | ///
231 | /// returns:
232 | /// list[dict] -> List of matches
233 | ///
234 | /// example:
235 | /// grs.scan_from_url(
236 | /// url="https://github.com/rust-lang/git2-rs",
237 | /// repository_path="/path/to/repository",
238 | /// branch_glob_pattern="*",
239 | /// )
240 | fn scan_from_url(
241 | &self,
242 | py: Python,
243 | url: &str,
244 | repository_path: &str,
245 | branch_glob_pattern: Option<&str>,
246 | from_timestamp: Option<i64>,
247 | ) -> PyResult<PyObject> {
248 | let mut builder = git2::build::RepoBuilder::new();
249 | builder.bare(true);
250 |
251 | if let Err(error) = builder.clone(url, Path::new(repository_path).as_ref()) {
252 | return Err(exceptions::PyRuntimeError::new_err(error.to_string()));
253 | };
254 |
255 | self.scan(py, repository_path, branch_glob_pattern, from_timestamp)
256 | }
257 | }
258 |
259 | /// PyRepScan is a Python library written in Rust. The library provides an API to scan git repositories
260 | /// for leaked secrets via usage of rules. There are multiple types of rules that can be used to find
261 | /// leaked files and content.
262 | #[pymodule]
263 | fn pyrepscan(
264 | _py: Python,
265 | m: &PyModule,
266 | ) -> PyResult<()> {
267 | m.add_class::<GitRepositoryScanner>()?;
268 | m.add_class::<RulesManager>()?;
269 |
270 | Ok(())
271 | }
272 |
--------------------------------------------------------------------------------
/src/rules_manager.rs:
--------------------------------------------------------------------------------
1 | use std::path::Path;
2 | use std::collections::{HashMap, HashSet};
3 | use regex::Regex;
4 | use pyo3::prelude::*;
5 | use pyo3::exceptions::PyRuntimeError;
6 | use aho_corasick::AhoCorasick;
7 |
8 | struct ContentRule {
9 | name: String,
10 | regex: Regex,
11 | whitelist_regexes: Vec<Regex>,
12 | blacklist_regexes: Vec<Regex>,
13 | }
14 |
15 | struct FilePathRule {
16 | name: String,
17 | regex: Regex,
18 | }
19 |
20 | #[pyclass]
21 | pub struct RulesManager {
22 | file_extensions_to_skip: HashSet<String>,
23 | file_paths_to_skip: Vec<String>,
24 | file_paths_to_skip_ac: Option<AhoCorasick>,
25 | content_rules: Vec<ContentRule>,
26 | file_path_rules: Vec<FilePathRule>,
27 | }
28 |
29 | impl Default for RulesManager {
30 | fn default() -> Self {
31 | Self::new()
32 | }
33 | }
34 |
35 | #[pymethods]
36 | impl RulesManager {
37 | #[new]
38 | pub fn new() -> Self {
39 | RulesManager {
40 | file_extensions_to_skip: HashSet::default(),
41 | file_paths_to_skip: Vec::default(),
42 | file_paths_to_skip_ac: None,
43 | content_rules: Vec::default(),
44 | file_path_rules: Vec::default(),
45 | }
46 | }
47 |
48 | pub fn add_content_rule(
49 | &mut self,
50 | name: String,
51 | pattern: String,
52 | whitelist_patterns: Vec<String>,
53 | blacklist_patterns: Vec<String>,
54 | ) -> PyResult<()> {
55 | if name.is_empty() || pattern.is_empty() {
56 | return Err(
57 | PyRuntimeError::new_err("Rule name and pattern can not be empty")
58 | )
59 | }
60 |
61 | let regex = match Regex::new(&pattern) {
62 | Ok(regex) => regex,
63 | Err(error) => {
64 | return Err(
65 | PyRuntimeError::new_err(
66 | format!("Invalid regex pattern: {error}")
67 | )
68 | )
69 | },
70 | };
71 | if regex.captures_len() != 2 {
72 | return Err(
73 | PyRuntimeError::new_err(
74 | format!("Matching regex pattern must have exactly one capturing group: {pattern}")
75 | )
76 | );
77 | }
78 |
79 | let mut whitelist_regexes = Vec::new();
80 | for whitelist_pattern in whitelist_patterns.iter() {
81 | let whitelist_regex = match Regex::new(whitelist_pattern) {
82 | Ok(whitelist_regex) => whitelist_regex,
83 | Err(error) => {
84 | return Err(
85 | PyRuntimeError::new_err(
86 | format!("Invalid whitelist regex pattern: {error}")
87 | )
88 | )
89 | },
90 | };
91 | if whitelist_regex.captures_len() != 1 {
92 | return Err(
93 | PyRuntimeError::new_err(
94 | format!("Whitelist regex pattern must not have a capturing group: {whitelist_pattern}")
95 | )
96 | );
97 | }
98 | whitelist_regexes.push(whitelist_regex);
99 | }
100 |
101 | let mut blacklist_regexes = Vec::new();
102 | for blacklist_pattern in blacklist_patterns.iter() {
103 | let blacklist_regex = match Regex::new(blacklist_pattern) {
104 | Ok(blacklist_regex) => blacklist_regex,
105 | Err(error) => {
106 | return Err(
107 | PyRuntimeError::new_err(
108 | format!("Invalid blacklist regex pattern: {error}")
109 | )
110 | )
111 | },
112 | };
113 | if blacklist_regex.captures_len() != 1 {
114 | return Err(
115 | PyRuntimeError::new_err(
116 | format!("Blacklist regex pattern must not have a capturing group: {blacklist_pattern}")
117 | )
118 | );
119 | }
120 | blacklist_regexes.push(blacklist_regex);
121 | }
122 |
123 | let content_rule = ContentRule {
124 | name,
125 | regex,
126 | whitelist_regexes,
127 | blacklist_regexes,
128 | };
129 | self.content_rules.push(content_rule);
130 |
131 | Ok(())
132 | }
133 |
134 | pub fn add_file_path_rule(
135 | &mut self,
136 | name: String,
137 | pattern: String,
138 | ) -> PyResult<()> {
139 | if name.is_empty() || pattern.is_empty() {
140 | return Err(
141 | PyRuntimeError::new_err("Rule name and pattern can not be empty")
142 | )
143 | }
144 |
145 | let regex = match Regex::new(&pattern) {
146 | Ok(regex) => regex,
147 | Err(error) => {
148 | return Err(
149 | PyRuntimeError::new_err(
150 | format!("Invalid regex pattern: {error}")
151 | )
152 | )
153 | }
154 | };
155 |
156 | let file_path_rule = FilePathRule { name, regex };
157 | self.file_path_rules.push(file_path_rule);
158 |
159 | Ok(())
160 | }
161 |
162 | pub fn add_file_extension_to_skip(
163 | &mut self,
164 | file_extension: String,
165 | ) -> PyResult<()> {
166 | if file_extension.is_empty() {
167 | return Err(
168 | PyRuntimeError::new_err("File extension can not be empty")
169 | )
170 | }
171 | self.file_extensions_to_skip.insert(file_extension.to_ascii_lowercase());
172 |
173 | Ok(())
174 | }
175 |
176 | pub fn add_file_path_to_skip(
177 | &mut self,
178 | file_path: String,
179 | ) -> PyResult<()> {
180 | if file_path.is_empty() {
181 | return Err(
182 | PyRuntimeError::new_err("File path can not be empty")
183 | )
184 | }
185 | self.file_paths_to_skip.push(file_path.to_ascii_lowercase());
186 | self.file_paths_to_skip_ac = Some(
187 | AhoCorasick::new_auto_configured(
188 | self.file_paths_to_skip.as_slice()
189 | )
190 | );
191 |
192 | Ok(())
193 | }
194 |
195 | pub fn should_scan_file_path(
196 | &self,
197 | file_path: &str,
198 | ) -> bool {
199 | if let Some(file_extension) = Path::new(file_path).extension() {
200 | if self.file_extensions_to_skip.contains(file_extension.to_string_lossy().as_ref()) {
201 | return false;
202 | }
203 | }
204 |
205 | if let Some(file_paths_to_skip_patterns) = &self.file_paths_to_skip_ac {
206 | if file_paths_to_skip_patterns.is_match(file_path) {
207 | return false;
208 | }
209 | }
210 |
211 | true
212 | }
213 |
214 | pub fn scan_file(
215 | &self,
216 | file_path: &str,
217 | content: Option<&str>,
218 | ) -> Option<Vec<HashMap<&str, String>>> {
219 | let mut scan_matches = Vec::new();
220 |
221 | for file_path_rule in self.file_path_rules.iter() {
222 | if file_path_rule.regex.is_match(file_path) {
223 | let mut scan_match = HashMap::<&str, String>::new();
224 | scan_match.insert("rule_name", file_path_rule.name.clone());
225 | scan_match.insert("match_text", file_path.to_string());
226 | scan_matches.push(scan_match);
227 | }
228 | }
229 |
230 | if let Some(content) = content {
231 | for content_rule in self.content_rules.iter() {
232 | for match_text in content_rule.regex.find_iter(content) {
233 | if content_rule.blacklist_regexes.iter().any(
234 | |blacklist_regex| blacklist_regex.is_match(match_text.as_str())
235 | ) {
236 | continue;
237 | }
238 | if !content_rule.whitelist_regexes.is_empty() && !content_rule.whitelist_regexes.iter().any(
239 | |whitelist_regex| whitelist_regex.is_match(match_text.as_str())
240 | ) {
241 | continue;
242 | }
243 |
244 | let mut scan_match = HashMap::<&str, String>::new();
245 | scan_match.insert("rule_name", content_rule.name.clone());
246 | scan_match.insert("match_text", match_text.as_str().to_string());
247 | scan_matches.push(scan_match);
248 | }
249 | }
250 | }
251 |
252 | if scan_matches.is_empty() {
253 | None
254 | } else {
255 | Some(scan_matches)
256 | }
257 | }
258 |
259 | pub fn check_pattern(
260 | &mut self,
261 | content: String,
262 | pattern: String
263 | ) -> PyResult<Vec<String>> {
264 | let regex = match Regex::new(&pattern) {
265 | Ok(regex) => regex,
266 | Err(error) => {
267 | return Err(
268 | PyRuntimeError::new_err(
269 | format!("Invalid regex pattern: {error}")
270 | )
271 | )
272 | },
273 | };
274 | if regex.captures_len() != 2 {
275 | return Err(
276 | PyRuntimeError::new_err(
277 | format!("Matching regex pattern must have exactly one capturing group: {pattern}")
278 | )
279 | );
280 | }
281 |
282 | let mut matches = Vec::new();
283 | for matched in regex.find_iter(&content) {
284 | matches.push(matched.as_str().to_string());
285 | }
286 |
287 | Ok(matches)
288 | }
289 | }
290 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyRepScan/6c7c73a73af2a759a8f73441efb58b37a086b494/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_git_repository_scanner.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import tempfile
3 | import git
4 | import datetime
5 |
6 | import pyrepscan
7 |
8 |
9 | class GitRepositoryScannerTestCase(
10 | unittest.TestCase,
11 | ):
12 | def setUp(
13 | self,
14 | ):
15 | self.tmpdir = tempfile.TemporaryDirectory()
16 | self.addCleanup(self.tmpdir.cleanup)
17 |
18 | bare_repo = git.Repo.init(
19 | path=self.tmpdir.name,
20 | )
21 | test_author = git.Actor(
22 | name='Author Name',
23 | email='test@author.email',
24 | )
25 |
26 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile:
27 | tmpfile.write('content')
28 | with open(f'{self.tmpdir.name}/file.py', 'w') as tmpfile:
29 | tmpfile.write('content')
30 | with open(f'{self.tmpdir.name}/prod_env.key', 'w') as tmpfile:
31 | tmpfile.write('')
32 | with open(f'{self.tmpdir.name}/prod_env_with_content.key', 'w') as tmpfile:
33 | tmpfile.write('some_key')
34 | with open(f'{self.tmpdir.name}/file.other', 'w') as tmpfile:
35 | tmpfile.write('nothing special')
36 | with open(f'{self.tmpdir.name}/test_file.cpp', 'w') as tmpfile:
37 | tmpfile.write('content')
38 | bare_repo.index.add(
39 | items=[
40 | f'{self.tmpdir.name}/file.txt',
41 | f'{self.tmpdir.name}/file.py',
42 | f'{self.tmpdir.name}/prod_env.key',
43 | f'{self.tmpdir.name}/prod_env_with_content.key',
44 | f'{self.tmpdir.name}/file.other',
45 | f'{self.tmpdir.name}/test_file.cpp',
46 | ],
47 | )
48 |
49 | bare_repo.index.commit(
50 | message='initial commit',
51 | author=test_author,
52 | commit_date='2000-01-01T00:00:00',
53 | author_date='2000-01-01T00:00:00',
54 | )
55 |
56 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile:
57 | tmpfile.write('new content')
58 | bare_repo.index.add(
59 | items=[
60 | f'{self.tmpdir.name}/file.txt',
61 | ],
62 | )
63 | bare_repo.index.commit(
64 | message='edited file',
65 | author=test_author,
66 | commit_date='2001-01-01T00:00:00',
67 | author_date='2001-01-01T00:00:00',
68 | )
69 |
70 | new_branch = bare_repo.create_head('new_branch')
71 | bare_repo.head.reference = bare_repo.heads[1]
72 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile:
73 | tmpfile.write('new content from new branch')
74 | bare_repo.index.add(
75 | items=[
76 | f'{self.tmpdir.name}/file.txt',
77 | ],
78 | )
79 | bare_repo.index.commit(
80 | message='edited file in new branch',
81 | author=test_author,
82 | commit_date='2002-01-01T00:00:00',
83 | author_date='2002-01-01T00:00:00',
84 | )
85 | bare_repo.head.reference = bare_repo.heads.master
86 | bare_repo.head.reset(
87 | index=True,
88 | working_tree=True,
89 | )
90 |
91 | merge_base = bare_repo.merge_base(
92 | new_branch,
93 | bare_repo.heads.master,
94 | )
95 | bare_repo.index.merge_tree(
96 | rhs=bare_repo.heads.master,
97 | base=merge_base,
98 | )
99 | bare_repo.index.commit(
100 | message='merge from new branch',
101 | author=test_author,
102 | commit_date='2003-01-01T00:00:00',
103 | author_date='2003-01-01T00:00:00',
104 | parent_commits=(
105 | new_branch.commit,
106 | bare_repo.heads.master.commit,
107 | ),
108 | )
109 |
110 | new_branch = bare_repo.create_head('non_merged_branch')
111 | bare_repo.head.reference = bare_repo.heads[2]
112 | with open(f'{self.tmpdir.name}/file.txt', 'w') as tmpfile:
113 | tmpfile.write('new content from non_merged_branch')
114 | bare_repo.index.add(
115 | items=[
116 | f'{self.tmpdir.name}/file.txt',
117 | ],
118 | )
119 | bare_repo.index.commit(
120 | message='edited file in non_merged_branch',
121 | author=test_author,
122 | commit_date='2004-01-01T00:00:00',
123 | author_date='2004-01-01T00:00:00',
124 | )
125 |
126 | bare_repo.head.reference = bare_repo.heads.master
127 | bare_repo.close()
128 |
129 | def test_scan_regular(
130 | self,
131 | ):
132 | grs = pyrepscan.GitRepositoryScanner()
133 | grs.add_content_rule(
134 | name='First Rule',
135 | pattern=r'''(content)''',
136 | whitelist_patterns=[],
137 | blacklist_patterns=[],
138 | )
139 |
140 | grs.add_file_extension_to_skip('py')
141 | grs.add_file_path_to_skip('test_')
142 |
143 | results = grs.scan(
144 | repository_path=self.tmpdir.name,
145 | branch_glob_pattern='*master',
146 | from_timestamp=0,
147 | )
148 | for result in results:
149 | result.pop('commit_id')
150 | self.assertCountEqual(
151 | first=results,
152 | second=[
153 | {
154 | 'author_email': 'test@author.email',
155 | 'author_name': 'Author Name',
156 | 'commit_message': 'edited file',
157 | 'commit_time': '2001-01-01T00:00:00',
158 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a',
159 | 'file_path': 'file.txt',
160 | 'match_text': 'content',
161 | 'rule_name': 'First Rule'
162 | },
163 | {
164 | 'author_email': 'test@author.email',
165 | 'author_name': 'Author Name',
166 | 'commit_message': 'edited file in new branch',
167 | 'commit_time': '2002-01-01T00:00:00',
168 | 'file_oid': '0407a18f7c6802c7e7ddc5c9e8af4a34584383ff',
169 | 'file_path': 'file.txt',
170 | 'match_text': 'content',
171 | 'rule_name': 'First Rule'
172 | },
173 | {
174 | 'author_email': 'test@author.email',
175 | 'author_name': 'Author Name',
176 | 'commit_message': 'initial commit',
177 | 'commit_time': '2000-01-01T00:00:00',
178 | 'file_oid': '6b584e8ece562ebffc15d38808cd6b98fc3d97ea',
179 | 'file_path': 'file.txt',
180 | 'match_text': 'content',
181 | 'rule_name': 'First Rule'
182 | },
183 | ],
184 | )
185 |
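   | # With '*', every branch is scanned, so the commit that exists only on 'non_merged_branch' is reported as well.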
186 | results = grs.scan(
187 | repository_path=self.tmpdir.name,
188 | branch_glob_pattern='*',
189 | from_timestamp=0,
190 | )
191 | for result in results:
192 | result.pop('commit_id')
193 | self.assertCountEqual(
194 | first=results,
195 | second=[
196 | {
197 | 'author_email': 'test@author.email',
198 | 'author_name': 'Author Name',
199 | 'commit_message': 'edited file',
200 | 'commit_time': '2001-01-01T00:00:00',
201 | 'file_oid': '47d2739ba2c34690248c8f91b84bb54e8936899a',
202 | 'file_path': 'file.txt',
203 | 'match_text': 'content',
204 | 'rule_name': 'First Rule'
205 | },
206 | {
207 | 'author_email': 'test@author.email',
208 | 'author_name': 'Author Name',
209 | 'commit_message': 'edited file in new branch',
210 | 'commit_time': '2002-01-01T00:00:00',
211 | 'file_oid': '0407a18f7c6802c7e7ddc5c9e8af4a34584383ff',
212 | 'file_path': 'file.txt',
213 | 'match_text': 'content',
214 | 'rule_name': 'First Rule'
215 | },
216 | {
217 | 'author_email': 'test@author.email',
218 | 'author_name': 'Author Name',
219 | 'commit_message': 'initial commit',
220 | 'commit_time': '2000-01-01T00:00:00',
221 | 'file_oid': '6b584e8ece562ebffc15d38808cd6b98fc3d97ea',
222 | 'file_path': 'file.txt',
223 | 'match_text': 'content',
224 | 'rule_name': 'First Rule'
225 | },
226 | {
227 | 'author_email': 'test@author.email',
228 | 'author_name': 'Author Name',
229 | 'commit_message': 'edited file in non_merged_branch',
230 | 'commit_time': '2004-01-01T00:00:00',
231 | 'file_oid': '057032a2108721ad1de6a9240fd1a8f45bc3f2ef',
232 | 'file_path': 'file.txt',
233 | 'match_text': 'content',
234 | 'rule_name': 'First Rule'
235 | },
236 | ],
237 | )
238 |
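   | # The file_oid of each result can be passed to get_file_content() to retrieve the matching blob.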
239 | self.assertEqual(
240 | first=b'new content',
241 | second=grs.get_file_content(
242 | repository_path=self.tmpdir.name,
243 | file_oid='47d2739ba2c34690248c8f91b84bb54e8936899a',
244 | ),
245 | )
246 | self.assertEqual(
247 | first=b'new content from new branch',
248 | second=grs.get_file_content(
249 | repository_path=self.tmpdir.name,
250 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383ff',
251 | ),
252 | )
253 | self.assertEqual(
254 | first=b'content',
255 | second=grs.get_file_content(
256 | repository_path=self.tmpdir.name,
257 | file_oid='6b584e8ece562ebffc15d38808cd6b98fc3d97ea',
258 | ),
259 | )
260 |
261 | def test_scan_from_timestamp(
262 | self,
263 | ):
264 | grs = pyrepscan.GitRepositoryScanner()
265 | grs.add_content_rule(
266 | name='First Rule',
267 | pattern=r'''(content)''',
268 | whitelist_patterns=[],
269 | blacklist_patterns=[],
270 | )
271 |
272 | grs.add_file_extension_to_skip('py')
273 | grs.add_file_path_to_skip('test_')
274 |
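   | # Per the assertions below, from_timestamp is inclusive: starting exactly at the 2004 commit's time returns that single match.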
275 | results = grs.scan(
276 | repository_path=self.tmpdir.name,
277 | branch_glob_pattern='*',
278 | from_timestamp=int(
279 | datetime.datetime(
280 | year=2004,
281 | month=1,
282 | day=1,
283 | hour=0,
284 | minute=0,
285 | second=0,
286 | tzinfo=datetime.timezone.utc,
287 | ).timestamp()
288 | ),
289 | )
290 | for result in results:
291 | result.pop('commit_id')
292 | self.assertCountEqual(
293 | first=results,
294 | second=[
295 | {
296 | 'author_email': 'test@author.email',
297 | 'author_name': 'Author Name',
298 | 'commit_message': 'edited file in non_merged_branch',
299 | 'commit_time': '2004-01-01T00:00:00',
300 | 'file_oid': '057032a2108721ad1de6a9240fd1a8f45bc3f2ef',
301 | 'file_path': 'file.txt',
302 | 'match_text': 'content',
303 | 'rule_name': 'First Rule'
304 | },
305 | ],
306 | )
307 |
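   | # One second after the last commit's timestamp, no commit qualifies and the scan returns an empty list.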
308 | results = grs.scan(
309 | repository_path=self.tmpdir.name,
310 | branch_glob_pattern='*',
311 | from_timestamp=int(
312 | datetime.datetime(
313 | year=2004,
314 | month=1,
315 | day=1,
316 | hour=0,
317 | minute=0,
318 | second=1,
319 | tzinfo=datetime.timezone.utc,
320 | ).timestamp()
321 | ),
322 | )
323 | for result in results:
324 | result.pop('commit_id')
325 | self.assertListEqual(
326 | list1=results,
327 | list2=[],
328 | )
329 |
330 | def test_scan_file_name(
331 | self,
332 | ):
333 | grs = pyrepscan.GitRepositoryScanner()
334 | grs.add_file_path_rule(
335 | name='First Rule',
336 | pattern=r'(prod|dev|stage).+key',
337 | )
338 |
339 | results = grs.scan(
340 | repository_path=self.tmpdir.name,
341 | branch_glob_pattern='*',
342 | )
343 | for result in results:
344 | result.pop('commit_id')
345 | self.assertCountEqual(
346 | first=results,
347 | second=[
348 | {
349 | 'author_email': 'test@author.email',
350 | 'author_name': 'Author Name',
351 | 'commit_message': 'initial commit',
352 | 'commit_time': '2000-01-01T00:00:00',
353 | 'file_oid': 'ec3741ea9c00bc5cd88564e49fd81d2340a5582f',
354 | 'file_path': 'prod_env_with_content.key',
355 | 'match_text': 'prod_env_with_content.key',
356 | 'rule_name': 'First Rule'
357 | },
358 | ],
359 | )
360 |
361 | def test_get_file_content(
362 | self,
363 | ):
364 | grs = pyrepscan.GitRepositoryScanner()
365 |
366 | self.assertEqual(
367 | first=grs.get_file_content(
368 | repository_path=self.tmpdir.name,
369 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383ff',
370 | ),
371 | second=b'new content from new branch',
372 | )
373 |
374 | def test_scan_exceptions(
375 | self,
376 | ):
377 | grs = pyrepscan.GitRepositoryScanner()
378 |
379 | with self.assertRaises(
380 | expected_exception=RuntimeError,
381 | ):
382 | grs.scan(
383 | repository_path='/non/existent/path',
384 | )
385 |
386 | def test_add_content_rule_exceptions(
387 | self,
388 | ):
389 | grs = pyrepscan.GitRepositoryScanner()
390 |
391 | with self.assertRaises(
392 | expected_exception=RuntimeError,
393 | ):
394 | grs.add_content_rule(
395 | name='',
396 | pattern=r'regex',
397 | whitelist_patterns=[],
398 | blacklist_patterns=[],
399 | )
400 |
401 | def test_add_file_path_rule_exceptions(
402 | self,
403 | ):
404 | grs = pyrepscan.GitRepositoryScanner()
405 |
406 | with self.assertRaises(
407 | expected_exception=RuntimeError,
408 | ):
409 | grs.add_file_path_rule(
410 | name='',
411 | pattern=r'regex',
412 | )
413 |
414 | def test_add_file_extension_to_skip_exceptions(
415 | self,
416 | ):
417 | grs = pyrepscan.GitRepositoryScanner()
418 |
419 | with self.assertRaises(
420 | expected_exception=RuntimeError,
421 | ):
422 | grs.add_file_extension_to_skip(
423 | file_extension='',
424 | )
425 |
426 | def test_add_file_path_to_skip_exceptions(
427 | self,
428 | ):
429 | grs = pyrepscan.GitRepositoryScanner()
430 |
431 | with self.assertRaises(
432 | expected_exception=RuntimeError,
433 | ):
434 | grs.add_file_path_to_skip(
435 | file_path='',
436 | )
437 |
438 | def test_get_file_content_exceptions(
439 | self,
440 | ):
441 | grs = pyrepscan.GitRepositoryScanner()
442 |
443 | with self.assertRaises(
444 | expected_exception=RuntimeError,
445 | ):
446 | grs.get_file_content(
447 | repository_path=self.tmpdir.name,
448 | file_oid='',
449 | )
450 |
451 | with self.assertRaises(
452 | expected_exception=RuntimeError,
453 | ):
454 | grs.get_file_content(
455 | repository_path=self.tmpdir.name,
456 | file_oid='aaaaaaaaa',
457 | )
458 |
459 | with self.assertRaises(
460 | expected_exception=RuntimeError,
461 | ):
462 | grs.get_file_content(
463 | repository_path=self.tmpdir.name,
464 | file_oid='0407a18f7c6802c7e7ddc5c9e8af4a34584383fa',
465 | )
466 |
--------------------------------------------------------------------------------
/tests/test_rules_manager.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import pyrepscan
4 |
5 |
6 | class RulesManagerTestCase(
7 | unittest.TestCase,
8 | ):
9 | def test_should_scan_file_ignored_extensions(
10 | self,
11 | ):
12 | rules_manager = pyrepscan.RulesManager()
13 |
14 | self.assertTrue(
15 | expr=rules_manager.should_scan_file_path('file.txt'),
16 | )
17 | rules_manager.add_file_extension_to_skip('txt')
18 | self.assertFalse(
19 | expr=rules_manager.should_scan_file_path('file.txt'),
20 | )
21 |
22 | rules_manager.add_file_extension_to_skip('pdf')
23 | self.assertFalse(
24 | expr=rules_manager.should_scan_file_path('file.txt'),
25 | )
26 | self.assertFalse(
27 | expr=rules_manager.should_scan_file_path('file.pdf'),
28 | )
29 | self.assertFalse(
30 | expr=rules_manager.should_scan_file_path('file.other.pdf'),
31 | )
32 | self.assertTrue(
33 | expr=rules_manager.should_scan_file_path('file.pdf.other'),
34 | )
35 | self.assertTrue(
36 | expr=rules_manager.should_scan_file_path('file.doc'),
37 | )
38 |
39 | def test_should_scan_file_ignored_file_paths(
40 | self,
41 | ):
42 | rules_manager = pyrepscan.RulesManager()
43 |
44 | self.assertTrue(
45 | expr=rules_manager.should_scan_file_path('/site-packages/file.txt'),
46 | )
47 |
48 | rules_manager.add_file_path_to_skip('site-packages')
49 | self.assertFalse(
50 | expr=rules_manager.should_scan_file_path('/site-packages/file.txt'),
51 | )
52 | self.assertTrue(
53 | expr=rules_manager.should_scan_file_path('/folder_one/subfolder/file.txt'),
54 | )
55 |
56 | rules_manager.add_file_path_to_skip('folder_one/subfolder')
57 | self.assertFalse(
58 | expr=rules_manager.should_scan_file_path('/folder_one/subfolder/file.txt'),
59 | )
60 | self.assertTrue(
61 | expr=rules_manager.should_scan_file_path('/folder_one/sub/file.txt'),
62 | )
63 |
64 | rules_manager.add_file_path_to_skip('part/name')
65 | self.assertFalse(
66 | expr=rules_manager.should_scan_file_path('some_part/name_some'),
67 | )
68 |
69 | def test_add_content_rule_one(
70 | self,
71 | ):
72 | rules_manager = pyrepscan.RulesManager()
73 | rules_manager.add_content_rule(
74 | name='rule_one',
75 | pattern=r'([a-z]+)',
76 | whitelist_patterns=[],
77 | blacklist_patterns=[],
78 | )
79 |
80 | self.assertEqual(
81 | first=rules_manager.scan_file(
82 | file_path='',
83 | content='first line\nsecond line\nthird line',
84 | ),
85 | second=[
86 | {
87 | 'match_text': 'first',
88 | 'rule_name': 'rule_one',
89 | },
90 | {
91 | 'match_text': 'line',
92 | 'rule_name': 'rule_one',
93 | },
94 | {
95 | 'match_text': 'second',
96 | 'rule_name': 'rule_one',
97 | },
98 | {
99 | 'match_text': 'line',
100 | 'rule_name': 'rule_one',
101 | },
102 | {
103 | 'match_text': 'third',
104 | 'rule_name': 'rule_one',
105 | },
106 | {
107 | 'match_text': 'line',
108 | 'rule_name': 'rule_one',
109 | },
110 | ],
111 | )
112 |
113 | def test_add_content_rule_two(
114 | self,
115 | ):
116 | rules_manager = pyrepscan.RulesManager()
117 | rules_manager.add_content_rule(
118 | name='rule_one',
119 | pattern=r'([a-z]+)',
120 | whitelist_patterns=[],
121 | blacklist_patterns=[
122 | r'line',
123 | ],
124 | )
125 |
126 | self.assertEqual(
127 | first=rules_manager.scan_file(
128 | file_path='',
129 | content='first line\nsecond line\nthird line',
130 | ),
131 | second=[
132 | {
133 | 'match_text': 'first',
134 | 'rule_name': 'rule_one',
135 | },
136 | {
137 | 'match_text': 'second',
138 | 'rule_name': 'rule_one',
139 | },
140 | {
141 | 'match_text': 'third',
142 | 'rule_name': 'rule_one',
143 | },
144 | ],
145 | )
146 |
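   | # Whitelist patterns restrict results: only matches that also match a whitelist entry are reported.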
147 | def test_add_content_rule_three(
148 | self,
149 | ):
150 | rules_manager = pyrepscan.RulesManager()
151 | rules_manager.add_content_rule(
152 | name='rule_one',
153 | pattern=r'([a-z]+)',
154 | whitelist_patterns=[
155 | 'second',
156 | 'third',
157 | ],
158 | blacklist_patterns=[],
159 | )
160 |
161 | self.assertEqual(
162 | first=rules_manager.scan_file(
163 | file_path='',
164 | content='first line\nsecond line\nthird line',
165 | ),
166 | second=[
167 | {
168 | 'match_text': 'second',
169 | 'rule_name': 'rule_one',
170 | },
171 | {
172 | 'match_text': 'third',
173 | 'rule_name': 'rule_one',
174 | },
175 | ],
176 | )
177 |
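   | # Combined filters: a match must hit a whitelist pattern and must not hit a blacklist pattern ('second' is dropped by r'nd$').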
178 | def test_add_content_rule_four(
179 | self,
180 | ):
181 | rules_manager = pyrepscan.RulesManager()
182 | rules_manager.add_content_rule(
183 | name='rule_one',
184 | pattern=r'([a-z]+)',
185 | whitelist_patterns=[
186 | 'second',
187 | 'third',
188 | ],
189 | blacklist_patterns=[
190 | r'nd$',
191 | ],
192 | )
193 |
194 | self.assertEqual(
195 | first=rules_manager.scan_file(
196 | file_path='',
197 | content='first line\nsecond line\nthird line',
198 | ),
199 | second=[
200 | {
201 | 'match_text': 'third',
202 | 'rule_name': 'rule_one',
203 | },
204 | ],
205 | )
206 |
207 | def test_add_content_rule_five(
208 | self,
209 | ):
210 | rules_manager = pyrepscan.RulesManager()
211 | rules_manager.add_content_rule(
212 | name='rule_one',
213 | pattern=r'(nothing)',
214 | whitelist_patterns=[],
215 | blacklist_patterns=[],
216 | )
217 |
218 | self.assertIsNone(
219 | obj=rules_manager.scan_file(
220 | file_path='',
221 | content='first line\nsecond line\nthird line',
222 | ),
223 | )
224 |
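   | # Rejected content rules: empty name, empty pattern, malformed regex, a pattern without a capturing group, and white/blacklist patterns that contain capturing groups.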
225 | def test_add_content_rule_exceptions(
226 | self,
227 | ):
228 | rules_manager = pyrepscan.RulesManager()
229 |
230 | with self.assertRaises(
231 | expected_exception=RuntimeError,
232 | ):
233 | rules_manager.add_content_rule(
234 | name='',
235 | pattern=r'regex',
236 | whitelist_patterns=[],
237 | blacklist_patterns=[],
238 | )
239 |
240 | with self.assertRaises(
241 | expected_exception=RuntimeError,
242 | ):
243 | rules_manager.add_content_rule(
244 | name='rule_one',
245 | pattern=r'',
246 | whitelist_patterns=[],
247 | blacklist_patterns=[],
248 | )
249 |
250 | with self.assertRaises(
251 | expected_exception=RuntimeError,
252 | ):
253 | rules_manager.add_content_rule(
254 | name='rule_one',
255 | pattern=r'(',
256 | whitelist_patterns=[],
257 | blacklist_patterns=[],
258 | )
259 |
260 | with self.assertRaises(
261 | expected_exception=RuntimeError,
262 | ):
263 | rules_manager.add_content_rule(
264 | name='rule_one',
265 | pattern=r'regex_pattern_without_capturing_group',
266 | whitelist_patterns=[],
267 | blacklist_patterns=[],
268 | )
269 |
270 | with self.assertRaises(
271 | expected_exception=RuntimeError,
272 | ):
273 | rules_manager.add_content_rule(
274 | name='rule_two',
275 | pattern=r'(content)',
276 | whitelist_patterns=[],
277 | blacklist_patterns=[
278 | '(',
279 | ],
280 | )
281 |
282 | with self.assertRaises(
283 | expected_exception=RuntimeError,
284 | ):
285 | rules_manager.add_content_rule(
286 | name='rule_two',
287 | pattern=r'(content)',
288 | whitelist_patterns=[],
289 | blacklist_patterns=[
290 | '(blacklist_regex_with_capturing_group)',
291 | ],
292 | )
293 |
294 | with self.assertRaises(
295 | expected_exception=RuntimeError,
296 | ):
297 | rules_manager.add_content_rule(
298 | name='rule_two',
299 | pattern=r'(content)',
300 | whitelist_patterns=[
301 | '(',
302 | ],
303 | blacklist_patterns=[],
304 | )
305 |
306 | with self.assertRaises(
307 | expected_exception=RuntimeError,
308 | ):
309 | rules_manager.add_content_rule(
310 | name='rule_two',
311 | pattern=r'(content)',
312 | whitelist_patterns=[
313 | '(whitelist_regex_with_capturing_group)',
314 | ],
315 | blacklist_patterns=[],
316 | )
317 |
318 | def test_add_file_path_rule_one(
319 | self,
320 | ):
321 | rules_manager = pyrepscan.RulesManager()
322 | rules_manager.add_file_path_rule(
323 | name='rule_one',
324 | pattern=r'(prod|dev|stage).+key',
325 | )
326 |
327 | self.assertIsNone(
328 | obj=rules_manager.scan_file(
329 | file_path='workdir/prod/some_file',
330 | content=None,
331 | ),
332 | )
333 | self.assertEqual(
334 | first=rules_manager.scan_file(
335 | file_path='workdir/prod/some_file.key',
336 | content=None,
337 | ),
338 | second=[
339 | {
340 | 'match_text': 'workdir/prod/some_file.key',
341 | 'rule_name': 'rule_one',
342 | },
343 | ],
344 | )
345 |
346 | rules_manager.add_file_path_rule(
347 | name='rule_two',
348 | pattern=r'prod.+key',
349 | )
350 |
351 | self.assertIsNone(
352 | obj=rules_manager.scan_file(
353 | file_path='workdir/prod/some_file',
354 | content=None,
355 | ),
356 | )
357 | self.assertEqual(
358 | first=rules_manager.scan_file(
359 | file_path='workdir/prod/some_file.key',
360 | content=None,
361 | ),
362 | second=[
363 | {
364 | 'match_text': 'workdir/prod/some_file.key',
365 | 'rule_name': 'rule_one',
366 | },
367 | {
368 | 'match_text': 'workdir/prod/some_file.key',
369 | 'rule_name': 'rule_two',
370 | },
371 | ],
372 | )
373 |
374 | def test_add_file_path_rule_exceptions(
375 | self,
376 | ):
377 | rules_manager = pyrepscan.RulesManager()
378 |
379 | with self.assertRaises(
380 | expected_exception=RuntimeError,
381 | ):
382 | rules_manager.add_file_path_rule(
383 | name='',
384 | pattern=r'regex',
385 | )
386 |
387 | with self.assertRaises(
388 | expected_exception=RuntimeError,
389 | ):
390 | rules_manager.add_file_path_rule(
391 | name='rule_one',
392 | pattern=r'',
393 | )
394 |
395 | def test_add_file_extension_to_skip_exceptions(
396 | self,
397 | ):
398 | rules_manager = pyrepscan.RulesManager()
399 |
400 | with self.assertRaises(
401 | expected_exception=RuntimeError,
402 | ):
403 | rules_manager.add_file_extension_to_skip(
404 | file_extension='',
405 | )
406 |
407 | def test_add_file_path_to_skip_exceptions(
408 | self,
409 | ):
410 | rules_manager = pyrepscan.RulesManager()
411 |
412 | with self.assertRaises(
413 | expected_exception=RuntimeError,
414 | ):
415 | rules_manager.add_file_path_to_skip(
416 | file_path='',
417 | )
418 |
419 | def test_scan_file_one(
420 | self,
421 | ):
422 | rules_manager = pyrepscan.RulesManager()
423 |
424 | self.assertIsNone(
425 | obj=rules_manager.scan_file(
426 | file_path='/path/to/file.txt',
427 | content=None,
428 | ),
429 | )
430 |
431 | def test_scan_file_two(
432 | self,
433 | ):
434 | rules_manager = pyrepscan.RulesManager()
435 |
436 | rules_manager.add_content_rule(
437 | name='rule_one',
438 | pattern=r'(some_text)',
439 | whitelist_patterns=[],
440 | blacklist_patterns=[],
441 | )
442 | self.assertIsNone(
443 | obj=rules_manager.scan_file(
444 | file_path='/path/to/file.txt',
445 | content=None,
446 | ),
447 | )
448 | self.assertIsNone(
449 | obj=rules_manager.scan_file(
450 | file_path='/path/to/file.txt',
451 | content='',
452 | ),
453 | )
454 | self.assertIsNone(
455 | obj=rules_manager.scan_file(
456 | file_path='/path/to/file.txt',
457 | content='other_text',
458 | ),
459 | )
460 | self.assertEqual(
461 | first=rules_manager.scan_file(
462 | file_path='/path/to/file.txt',
463 | content='some_text',
464 | ),
465 | second=[
466 | {
467 | 'rule_name': 'rule_one',
468 | 'match_text': 'some_text',
469 | },
470 | ],
471 | )
472 |
473 | rules_manager.add_content_rule(
474 | name='rule_two',
475 | pattern=r'(some)',
476 | whitelist_patterns=[],
477 | blacklist_patterns=[],
478 | )
479 | self.assertEqual(
480 | first=rules_manager.scan_file(
481 | file_path='/path/to/file.txt',
482 | content='some_text',
483 | ),
484 | second=[
485 | {
486 | 'rule_name': 'rule_one',
487 | 'match_text': 'some_text',
488 | },
489 | {
490 | 'rule_name': 'rule_two',
491 | 'match_text': 'some',
492 | },
493 | ],
494 | )
495 |
496 | def test_scan_file_three(
497 | self,
498 | ):
499 | rules_manager = pyrepscan.RulesManager()
500 |
501 | rules_manager.add_content_rule(
502 | name='rule_one',
503 | pattern=r'(some_.+)',
504 | whitelist_patterns=[],
505 | blacklist_patterns=[
506 | r'text',
507 | ],
508 | )
509 | self.assertIsNone(
510 | obj=rules_manager.scan_file(
511 | file_path='/path/to/file.txt',
512 | content='some_text',
513 | ),
514 | )
515 | self.assertEqual(
516 | first=rules_manager.scan_file(
517 | file_path='/path/to/file.txt',
518 | content='some_other',
519 | ),
520 | second=[
521 | {
522 | 'rule_name': 'rule_one',
523 | 'match_text': 'some_other',
524 | },
525 | ],
526 | )
527 |
528 | def test_scan_file_four(
529 | self,
530 | ):
531 | rules_manager = pyrepscan.RulesManager()
532 |
533 | rules_manager.add_content_rule(
534 | name='rule_one',
535 | pattern=r'(some_.+)',
536 | whitelist_patterns=[],
537 | blacklist_patterns=[
538 | r'text',
539 | r'other',
540 | ],
541 | )
542 | self.assertIsNone(
543 | obj=rules_manager.scan_file(
544 | file_path='/path/to/file.txt',
545 | content='some_text',
546 | ),
547 | )
548 | self.assertIsNone(
549 | obj=rules_manager.scan_file(
550 | file_path='/path/to/file.txt',
551 | content='some_other',
552 | ),
553 | )
554 | self.assertEqual(
555 | first=rules_manager.scan_file(
556 | file_path='/path/to/file.txt',
557 | content='some_diff',
558 | ),
559 | second=[
560 | {
561 | 'rule_name': 'rule_one',
562 | 'match_text': 'some_diff',
563 | },
564 | ],
565 | )
566 |
567 | def test_scan_file_five(
568 | self,
569 | ):
570 | rules_manager = pyrepscan.RulesManager()
571 |
572 | rules_manager.add_content_rule(
573 | name='rule_one',
574 | pattern=r'(some_.+)',
575 | whitelist_patterns=[
576 | 'diff',
577 | ],
578 | blacklist_patterns=[],
579 | )
580 | self.assertIsNone(
581 | obj=rules_manager.scan_file(
582 | file_path='/path/to/file.txt',
583 | content='some_text',
584 | ),
585 | )
586 | self.assertIsNone(
587 | obj=rules_manager.scan_file(
588 | file_path='/path/to/file.txt',
589 | content='some_other',
590 | ),
591 | )
592 | self.assertEqual(
593 | first=rules_manager.scan_file(
594 | file_path='/path/to/file.txt',
595 | content='some_diff',
596 | ),
597 | second=[
598 | {
599 | 'rule_name': 'rule_one',
600 | 'match_text': 'some_diff',
601 | },
602 | ],
603 | )
604 |
605 | def test_scan_file_six(
606 | self,
607 | ):
608 | rules_manager = pyrepscan.RulesManager()
609 |
610 | rules_manager.add_content_rule(
611 | name='rule_one',
612 | pattern=r'(some_.+)',
613 | whitelist_patterns=[
614 | 'diff',
615 | 'other',
616 | ],
617 | blacklist_patterns=[],
618 | )
619 | self.assertIsNone(
620 | obj=rules_manager.scan_file(
621 | file_path='/path/to/file.txt',
622 | content='some_text',
623 | ),
624 | )
625 | self.assertEqual(
626 | first=rules_manager.scan_file(
627 | file_path='/path/to/file.txt',
628 | content='some_other',
629 | ),
630 | second=[
631 | {
632 | 'rule_name': 'rule_one',
633 | 'match_text': 'some_other',
634 | },
635 | ],
636 | )
637 | self.assertEqual(
638 | first=rules_manager.scan_file(
639 | file_path='/path/to/file.txt',
640 | content='some_diff',
641 | ),
642 | second=[
643 | {
644 | 'rule_name': 'rule_one',
645 | 'match_text': 'some_diff',
646 | },
647 | ],
648 | )
649 |
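   | # File path rules match on the path itself, even when the content is empty; each matching rule reports the full path.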
650 | def test_scan_file_seven(
651 | self,
652 | ):
653 | rules_manager = pyrepscan.RulesManager()
654 |
655 | rules_manager.add_file_path_rule(
656 | name='rule_one',
657 | pattern=r'dev\.txt',
658 | )
659 | self.assertIsNone(
660 | obj=rules_manager.scan_file(
661 | file_path='/path/to/file.txt',
662 | content=None,
663 | ),
664 | )
665 | self.assertIsNone(
666 | obj=rules_manager.scan_file(
667 | file_path='/path/to/file.txt',
668 | content='',
669 | ),
670 | )
671 | self.assertIsNone(
672 | obj=rules_manager.scan_file(
673 | file_path='/path/to/file.txt',
674 | content='other_text',
675 | ),
676 | )
677 | self.assertEqual(
678 | first=rules_manager.scan_file(
679 | file_path='/path/to/dev.txt',
680 | content='',
681 | ),
682 | second=[
683 | {
684 | 'rule_name': 'rule_one',
685 | 'match_text': '/path/to/dev.txt',
686 | },
687 | ],
688 | )
689 |
690 | rules_manager.add_file_path_rule(
691 | name='rule_two',
692 | pattern=r'(\.txt)',
693 | )
694 | self.assertEqual(
695 | first=rules_manager.scan_file(
696 | file_path='/path/to/dev.txt',
697 | content='some_text',
698 | ),
699 | second=[
700 | {
701 | 'rule_name': 'rule_one',
702 | 'match_text': '/path/to/dev.txt',
703 | },
704 | {
705 | 'rule_name': 'rule_two',
706 | 'match_text': '/path/to/dev.txt',
707 | },
708 | ],
709 | )
710 |
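   | # check_pattern raises for malformed patterns or patterns without a capturing group, and otherwise returns the captured matches.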
711 | def test_check_pattern(
712 | self,
713 | ):
714 | rules_manager = pyrepscan.RulesManager()
715 |
716 | with self.assertRaises(
717 | expected_exception=RuntimeError,
718 | ):
719 | rules_manager.check_pattern(
720 | content='',
721 | pattern=r'(',
722 | )
723 |
724 | with self.assertRaises(
725 | expected_exception=RuntimeError,
726 | ):
727 | rules_manager.check_pattern(
728 | content='',
729 | pattern=r'no_capturing_group',
730 | )
731 |
732 | with self.assertRaises(
733 | expected_exception=RuntimeError,
734 | ):
735 | rules_manager.check_pattern(
736 | content='',
737 | pattern=r'(?:\:)',
738 | )
739 |
740 | self.assertEqual(
741 | first=rules_manager.check_pattern(
742 | content='some sentence',
743 | pattern=r'([^ ]+)',
744 | ),
745 | second=[
746 | 'some',
747 | 'sentence',
748 | ]
749 | )
750 |
--------------------------------------------------------------------------------