├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── Pipfile ├── README.md ├── generate_lambda_zip.py ├── lambdas ├── .dockerignore ├── Dockerfile ├── enrichment_plugins │ ├── __init__.py │ └── ensure_eventid.py ├── generate_partitions.py ├── normalization_plugins │ ├── __init__.py │ ├── event_shell.py │ ├── gsuite_login.py │ ├── ip_addresses.py │ ├── lowercase_keys.py │ └── timestamps.py ├── processor.py ├── requirements.txt ├── s3_to_firehose.py ├── tests │ ├── __init__.py │ ├── logging_config.yml │ ├── samples │ │ ├── sample_cloudfront_wordpress_probe.json │ │ ├── sample_cloudtrail_create_log_stream.json │ │ ├── sample_gsuite_login_event.json │ │ ├── sample_syslog_sudo.json │ │ └── sample_vpc_flow_log.json │ ├── test_core.py │ ├── test_plugin_gsuite_logins.py │ ├── test_plugin_ip_addresses.py │ ├── test_plugin_timestamps.py │ └── test_plugins.py └── utils │ ├── __init__.py │ ├── athena.py │ ├── dates.py │ ├── dict_helpers.py │ ├── dotdict.py │ ├── helpers.py │ └── plugins.py ├── main.tf ├── pytest.ini ├── terraform.tfvars └── variables.tf /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | liberapay: defendA 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode 132 | #piplock 133 | Pipfile.lock 134 | lambda.zip 135 | .DS_Store 136 | terraform.tfstate 137 | terraform.tfstate.backup 138 | .terraform/plugins/darwin_amd64/lock.json 139 | .terraform/plugins/darwin_amd64/terraform-provider-aws_v3.0.0_x5 140 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # safelist 2 | branches: 3 | only: 4 | - main 5 | 6 | os: 7 | - linux 8 | 9 | language: python 10 | python: 11 | - "3.8" 12 | 13 | env: 14 | - 15 | 16 | install: 17 | - pip install pipenv 18 | - pipenv install -d 19 | 20 | # command to run tests 21 | script: 22 | - pytest 23 | 24 | notifications: 25 | email: 26 | on_success: never 27 | on_failure: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 
128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. 
Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | pylint = "*" 8 | black = "*" 9 | pytest = "*" 10 | 11 | [packages] 12 | docker = "*" 13 | boto3 = "*" 14 | pyyaml = "*" 15 | pandas = "*" 16 | tzlocal = "*" 17 | netaddr = "*" 18 | pynsive = "*" 19 | 5400915 = {file = "https://github.com/noahmorrison/chevron/archive/master.zip"} 20 | pyathena = "*" 21 | 22 | [requires] 23 | python_version = "3.8" 24 | 25 | [pipenv] 26 | allow_prereleases = true 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # defendA Data Lake 2 | An AWS-native, serverless log management system to allow you to ingest unstructured JSON, normalize & enrich it and store it in Athena for queries and analysis. 3 | 4 | **Build Status:** 5 | - Master [![Build Status](https://travis-ci.com/0xdefenda/defenda-data-lake.svg?branch=master) ](https://travis-ci.com/0xdefenda/defenda-data-lake) 6 | 7 | ## Video intro 8 | Here's a brief video intro to the data lake. 9 | 10 | [![video intro](http://img.youtube.com/vi/eYQ0gjTMVhc/0.jpg)](http://www.youtube.com/watch?v=eYQ0gjTMVhc "defendA security data lake") 11 | 12 | ## Why? 13 | Centralized log/event management is a core element of an infosec program, yet most solutions are not cloud native, require unnecessary servers/clusters and force you to massage your events into a strict format. 14 | 15 | The reality is that infosec teams aren't able to dictate what format events come in which is usually arbitrary, nested JSON. 16 | 17 | This solution uses only serverless constructs to allow you to store unstructured JSON from any source in a predictable data structure that can be accessed using Athena's native SQL. 18 | 19 | ## Deployment: 20 | 21 | Deployment is via python/pipenv, terraform and a mini-docker environment to compile the lambdas. 22 | 23 | It uses us-west-2 as the default region, set a terraform.tfvars variable ( aws_region = "some-other-region ) if you'd like it elsewhere. 24 | 25 | 26 | First get the code and initiate pipenv (or [install it if you aren't converted yet](https://pipenv.pypa.io/en/latest/install/)): 27 | 28 | ```bash 29 | git clone . 30 | pipenv --python 3.8 31 | ``` 32 | 33 | Now build the lambdas: 34 | 35 | ```bash 36 | ./generate_lambda_zip.py 37 | ``` 38 | 39 | Init and run terraform 40 | ```bash 41 | terraform init 42 | terraform plan 43 | terraform apply 44 | ``` 45 | and you will end up with: 46 | 47 | - An Athena database: `defenda_datalake` 48 | - A table: `events` 49 | - An s3 bucket to serve as the data store for the athena data lake: `data-lake--output-bucket` 50 | - An s3 bucket to act as an input if you have things that can't talk directly to firehose: `data-lake--input-bucket` 51 | - A firehose delivery stream: `data_lake_s3_stream` 52 | - A lambda to operate on records bound for athena: `data_lake_lambda` 53 | - A lambda to generate partitions: `data_lake_generate_partitions` 54 | - All the iam permissions and glue to sync these together 55 | 56 | ## Event structure 57 | Athena does require *some* structure to allow for querying. 
To enable that and still allow unstructured JSON we use the following `event shell` 58 | 59 | - eventid (string) 60 | - a unique guid 61 | - utctimestamp (string) 62 | - timestamp for the event in UTC, ISO format 63 | - severity (string) 64 | - DEBUG, INFO, WARNING, etc 65 | - summary (string) 66 | - a human readable text description of the event 67 | - category (string) 68 | - what sort of event: authentication, etc. 69 | - source (string) 70 | - where the event came from (gsuite, sophos, cloudtrail, okta, etc) 71 | - tags (array) 72 | - a series of tags you'd like to add 73 | - plugins (array) 74 | - a record of what plugins operated on this event 75 | - details (string) 76 | - this `string` is the native JSON of the event. Stored as a string to allow for json_extract_scalar operations to query the JSON structure. 77 | - year (string) (Partitioned) 78 | - partition for athena 79 | - month (string) (Partitioned) 80 | - partition for athena 81 | - day (string) (Partitioned) 82 | - partition for athena 83 | - hour (string) (Partitioned) 84 | - partition for athena 85 | 86 | ### Sample query 87 | So what does it look like to use this data lake? Here's a sample query that would return all AWS console logins in a certain partition/timeframe: 88 | 89 | ```sql 90 | SELECT utctimestamp, 91 | summary, 92 | source, 93 | details 94 | 95 | FROM "defenda_data_lake"."events" 96 | where 97 | source='cloudtrail' AND json_extract_scalar(details,'$.eventname') = 'ConsoleLogin' 98 | AND ( 99 | (year='2020' 100 | AND month='06' 101 | AND day='19' 102 | AND hour='01') 103 | OR 104 | (year='2020' 105 | AND month='06' 106 | AND day='19' 107 | AND hour='00') 108 | ) 109 | limit 100 110 | ``` 111 | 112 | You can use the [json_extract_scalar](https://prestodb.io/docs/current/functions/json.html) function and [json path expressions](https://goessner.net/articles/JsonPath/index.html#e2) to get at any layer of the nested JSON stored in the 'details' field as part of your query. 113 | 114 | The date portion of the where clause allows us to hone in on a particular time period and allows us to limit the cost of the query by limiting the amount of data scanned by Athena. 115 | 116 | Queries can be any valid [Presto SQL](https://prestodb.io/docs/current/sql/select.html) including [functions](https://prestodb.io/docs/current/functions.html) 117 | 118 | 119 | Here's another, slightly more complex query taking advantage of the work the ip_addresses.py plugin does to gather all the ips it's seen into a list. We can use that to query for any events involving a suspect ip like so: 120 | 121 | ```sql 122 | SELECT 123 | utctimestamp, 124 | summary, 125 | source, 126 | details, 127 | tags 128 | FROM defenda_data_lake.events 129 | where 130 | source ='cloudtrail' 131 | AND json_array_contains(json_extract(details,'$._ipaddresses'),'7.1.14.12') 132 | AND year='2020' 133 | AND month='09' 134 | AND day='07' 135 | AND hour='18' 136 | LIMIT 100; 137 | ``` 138 | 139 | The plugin searches events for likely IP fields, verifies them, normalizes source/destination IPs and then appends them to a metadata list details._ipaddresses. We can query that json natively by extracting it from the details athena field and use the Presto function json_array_contains to narrow our query to the IP address in question. 140 | 141 | ### Python querying 142 | Thanks to the [pyathena library](https://pypi.org/project/PyAthena/) and [pandas](https://pandas.pydata.org/), querying and exploring data is easy! 
143 | 144 | Here's the same sample query looking for IP address events, but performed from a python environment. 145 | 146 | ```python 147 | from pyathena import connect 148 | from pyathena.util import as_pandas 149 | from pyathena.pandas_cursor import PandasCursor 150 | import pandas as pd 151 | 152 | cursor = connect(work_group='defenda_data_lake',region_name='us-west-2',cursor_class=PandasCursor).cursor() 153 | 154 | cursor.execute(""" 155 | SELECT 156 | utctimestamp, 157 | summary, 158 | source, 159 | details, 160 | tags 161 | FROM defenda_data_lake.events 162 | where 163 | source ='cloudtrail' 164 | AND json_array_contains(json_extract(details,'$._ipaddresses'),'7.1.14.12') 165 | AND year='2020' 166 | AND month='09' 167 | AND day='07' 168 | AND hour='18' 169 | LIMIT 100; 170 | """) 171 | df = as_pandas(cursor) 172 | df.head() 173 | 174 | ``` 175 | 176 | You simply create a cursor to handle your results, send it a query and your result is a pandas data frame. 177 | 178 | If you'd like your query results restored to a list of python dictionaries you can convert the JSON in the details field like so: 179 | 180 | ```python 181 | query_results=[] 182 | for message in df.to_dict('records'): 183 | message['details']=json.loads(message['details']) 184 | query_results.append(message) 185 | ``` 186 | 187 | ### Advantages 188 | 189 | #### Serverless! 190 | No servers to manage and this scales up as your event ingestion scales. You can store as much data as s3/athena can handle and due to the JSON handling, changes in data structures won't blow up your infrastructure. 191 | 192 | #### Security 193 | Operating via serverless, there is nothing to maintain, patch, etc. Python libraries will of course update over time. 194 | 195 | There is nothing exposed to the outside world, no extra costs for authentication, no extra licensing for secure transport, etc. 196 | 197 | #### Customizable 198 | A simple plugin system allows you to write your own custom event handlers to either normalize your data or enhance it as you see fit. Plugins are in python, usually a dozen lines of code and an be fine tuned to operate only on the events of interest. 199 | 200 | #### Integration 201 | For input that can't be hooked up to firehose, you can deposit raw JSON in the s3 input bucket and it will be send automatically through to firehose/athena. You can use this to hook up legacy event producers that may not be able to speak native firehose but can write files to s3. 202 | 203 | #### Cost 204 | This costs nothing to deploy. Costs will vary depending on your data ingestion, but can get started today without having to guesstimate event per second, data size, throughput, or other statistics you usually have to commit to in other log management platforms. 205 | 206 | Preliminary tests sending 500MB of data to the data lake resulted in the following costs: 207 | 208 | Test using s3 as the input (copying json files to s3): 209 | - s3: $0.51 210 | - firehose: $0.02 211 | - athena: $0.00 212 | 213 | Test using firehose only as the input (no files, direct to firehose): 214 | - s3: $0.02 215 | - firehose: $0.02 216 | - athena: $0.00 217 | 218 | 219 | ### Disadvantages 220 | 221 | #### Latency 222 | Depending on your rate of event ingestion, firehose will queue events for 60 seconds before flushing to s3. If you have enough flow, this usually isn't a problem but if your event flow is very low you may see a slight delay. 
223 | 224 | #### Query Cost potential 225 | 226 | Athena's pricing is based on $/query/data that as of this writing is $5 per terabyte. Each query is charged based on the amount of underlying data that was scanned to resolve the query and prorated accordingly. So if your query operated on a megabyte of data in a partition, your charge would be only for that megabyte. 227 | 228 | However it is a `per query` charge. So if you aren't careful with your queries and don't make use of partitions you can run up a bill. 229 | 230 | To help, data is automatically partitioned in hour chunks (year/month/day/hour structure in the s3 bucket). By simply adding some criteria to your where clause you can limit the amount of data you interact with and are charged for. Data is also automatically gzipped to also reduce the charges. 231 | 232 | 233 | ## Companion Projects 234 | 235 | Anything that sends json to firehost can be used as an input into the data lake. Here are some sample companion projects that do just that to send security events from some common data sources: 236 | 237 | - [gsuite log ingestion](https://github.com/jeffbryner/gsuite-activity-lambda) 238 | - [sophos log ingestion](https://github.com/jeffbryner/sophos-activity-lambda) 239 | - [meraki log ingestion](https://github.com/jeffbryner/meraki-activity-lambda) 240 | - [beats log ingestion](https://github.com/jeffbryner/firehose-es-input#browserbeat-example) 241 | 242 | ## Plugins 243 | Inspired by [MozDef's plugin system](https://github.com/mozilla/MozDef/tree/master/mq/plugins) via [pynsive](https://github.com/zinic/pynsive/), the plugins in the data lake use a similar concept of operations, but are ordered a bit differently. 244 | 245 | ### Plugin types 246 | Plugins can either normalize or enrich an event. Events are first run through normalization plugins, then through enrichment plugins. This makes it easier to target your plugin to the task at hand, and makes it easier to perform whatever operation you are envisoning. 247 | 248 | Plugins are python, and register themselves to receive events containing a field, a category or a tag. Plugins can signal they'd like to see all events by registering for '*'. 249 | 250 | If an event matches the registration, the event and it's metadata are sent to the plugin where the plugin can rearrange/rename fields (normalization), add information to the event (enrichment) or perform any operation you might envision with the event. 251 | 252 | A plugin can signal to drop the event by returning None for the message. The pipeline will not store the event, which can help weed out noise. 253 | 254 | ### Sample plugin 255 | Lets look at the sample Gsuite login plugin configured to operate on events from the [gsuite log ingestion](https://github.com/jeffbryner/gsuite-activity-lambda) project that polls Google for gsuite security events and sends them to firehose. 256 | 257 | ```python 258 | class message(object): 259 | 260 | def __init__(self): 261 | ''' 262 | handle gsuite login activity record 263 | ''' 264 | 265 | self.registration = ['kind'] 266 | self.priority = 20 267 | ``` 268 | 269 | The plugin registers to receive any even that has a field named 'kind'. The registration property is a list and can contain a list of fields that, if present, the plugin would like to receive. You could have a registration of ```['ipaddress','ip_address','srcip']``` for example to receive any event that contains any or all of those fields. 
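For illustration, a minimal plugin skeleton might look like the following. The field names, priority value and drop condition here are hypothetical examples, not one of the shipped plugins:

```python
class message(object):
    def __init__(self):
        """
        receive any event containing any of these (hypothetical) IP fields
        """
        self.registration = ["ipaddress", "ip_address", "srcip"]
        self.priority = 15

    def onMessage(self, message, metadata):
        # signal a drop by returning None for the message, as described above;
        # the pipeline will not store the event (hypothetical noise filter)
        if message.get("severity") == "DEBUG":
            return (None, metadata)
        # otherwise hand the (possibly modified) event back to the pipeline
        return (message, metadata)
```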
270 | 271 | Next, the plugin puts itself as priority 20, meaning any plugin with a lower number will receive the event first. This allows you to order your plugins in case that is important in the plugin pipeline logic. Plugins will be called in order of priority, 0 going first, higher numbers going later. 272 | 273 | Next the plugin contains the logic to use when encountering a matching event: 274 | 275 | ```python 276 | def onMessage(self, message, metadata): 277 | # for convenience, make a dot dict version of the message 278 | dot_message=DotDict(message) 279 | 280 | # double check that this is our target message 281 | if 'admin#reports#activity' not in dot_message.get('details.kind','')\ 282 | or 'id' not in message.get('details','') \ 283 | or 'etag' not in message.get('details',''): 284 | return(message, metadata) 285 | # 286 | ``` 287 | 288 | Your plugins can make use of the utils functions like DotDict, etc to operate on an event. It's best practice to first ensure this event fully matches what you expect and this plugin is double checking for certain fields in the structure and returning the message unchanged if there isn't a match. 289 | 290 | Normalization plugins usually cherry pick fields from the original event and surface them to standardized fields to make querying/correlating easier. For example this plugin sets some tags and brings out the IP address and timestamp: 291 | 292 | ```python 293 | message["source"]="gsuite" 294 | message["tags"].append("gsuite") 295 | 296 | # clean up ipaddress field 297 | if 'ipaddress' in message['details']: 298 | message['details']['sourceipaddress']=message['details']['ipaddress'] 299 | del message['details']['ipaddress'] 300 | 301 | # set the actual time 302 | if dot_message.get("details.id.time",None): 303 | message['utctimestamp']=toUTC(message['details']['id']['time']).isoformat() 304 | 305 | ``` 306 | 307 | it goes on to do the same for other common fields and most importantly sets a human readable summary: 308 | 309 | ```python 310 | # set summary 311 | message["summary"]=chevron.render("{{details.user}} {{details.events.0.name}} from IP {{details.sourceipaddress}}",message) 312 | ``` 313 | The [chevron library](https://github.com/noahmorrison/chevron) allows us to use mustache templates to access fields and fields within lists to pull out information from the event as needed. ```details.events.0.name``` in this case is looking for the first item in the details.events list and if that exists, it uses the ```name``` field in the text. Chevron is forgiving, you can reference fields that may not exist, or only exist in some cases. 314 | 315 | The utility libraries are purposefully crafted to allow you to get at the most stubborn data. In a gsuite event for example, the majority of the information is tucked away in key/value fields. Take this marker for suspicious logins as an example: 316 | 317 | ```json 318 | "events": [ 319 | { 320 | "type": "login", 321 | "name": "login_success", 322 | "parameters": [ 323 | { 324 | "name": "login_type", 325 | "value": "exchange" 326 | }, 327 | { 328 | "name": "login_challenge_method", 329 | "multiValue": [ 330 | "none" 331 | ] 332 | }, 333 | { 334 | "name": "is_suspicious", 335 | "boolValue": false 336 | } 337 | ] 338 | } 339 | ] 340 | ``` 341 | 342 | You can see there are several 'name' fields with a parameters list that make it difficult to programatically query. 343 | 344 | This plugin solves this via the use of the dict_match function like so: 345 | 346 | ```python 347 | #suspicious? 
348 | suspicious={"boolvalue":True,"name":"is_suspicious"} 349 | for e in dot_message.get("details.events",[]): 350 | for p in e.get("parameters",[]): 351 | if dict_match(suspicious,p): 352 | message["details"]["suspicious"]=True 353 | ``` 354 | 355 | The dict_match function takes a dictionary of keys and values and compares it to something. If the keys and values match, it returns true which in this case allows to mark an event as suspicious if the name='is_suspicious' and a field called 'boolvalue' is True. 356 | 357 | Lastly the plugin returns the event and metadata back to the pipeline to be sent on to another plugin, or to the final data lake: 358 | 359 | ```python 360 | return (message, metadata) 361 | ``` 362 | 363 | It's best to include tests for plugins, and the [test for the gsuite login plugin can be found here](https://github.com/0xdefendA/defenda-data-lake/blob/main/lambdas/tests/test_plugin_gsuite_logins.py) as an example. -------------------------------------------------------------------------------- /generate_lambda_zip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import docker 3 | from os import path 4 | import subprocess 5 | 6 | 7 | def refresh_requirements(): 8 | subprocess.Popen( 9 | "pipenv run pip freeze > lambdas/requirements.txt", 10 | shell=True, 11 | stdout=subprocess.PIPE, 12 | ).stdout.read() 13 | 14 | 15 | def build_lambda_image(): 16 | docker_client = docker.from_env() 17 | docker_client.images.build(path="lambdas/", tag="datalake-lambdas", quiet=False) 18 | 19 | 20 | def get_lambda_zip(): 21 | docker_client = docker.from_env() 22 | docker_client.containers.run( 23 | "datalake-lambdas", 24 | "cp /asset-output/lambda.zip /mnt/cdk-data-lake/lambdas", 25 | volumes={ 26 | path.abspath("."): { 27 | "bind": "/mnt/cdk-data-lake", 28 | "mode": "rw", 29 | } 30 | }, 31 | remove=True, 32 | ) 33 | 34 | 35 | if __name__ == "__main__": 36 | print("refreshing requirements.txt using pipenv") 37 | refresh_requirements() 38 | print("Building image with requirements.txt") 39 | build_lambda_image() 40 | print("Retrieving zip file for lambda") 41 | get_lambda_zip() 42 | -------------------------------------------------------------------------------- /lambdas/.dockerignore: -------------------------------------------------------------------------------- 1 | cdk* 2 | .vscode* 3 | .git 4 | *log 5 | __pycache__** -------------------------------------------------------------------------------- /lambdas/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lambci/lambda:build-python3.8 2 | 3 | ENV AWS_DEFAULT_REGION us-west-2 4 | RUN yum install -y rsync 5 | RUN mkdir /asset-input 6 | WORKDIR /asset-input 7 | ADD . . 8 | 9 | #RUN pip3 install -r requirements.txt 10 | RUN pip3 install -r requirements.txt -t /asset-output 11 | RUN rsync -r . /asset-output 12 | WORKDIR /asset-output 13 | RUN zip -9yr lambda.zip . 
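# Note: this image is built and run by generate_lambda_zip.py, which mounts the
# repo root at /mnt/cdk-data-lake and copies /asset-output/lambda.zip back into
# lambdas/ so the terraform deploy can pick it up.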
-------------------------------------------------------------------------------- /lambdas/enrichment_plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/enrichment_plugins/__init__.py -------------------------------------------------------------------------------- /lambdas/enrichment_plugins/ensure_eventid.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | class message(object): 4 | 5 | def __init__(self): 6 | ''' 7 | takes an incoming message 8 | and adds an event id to the message if missing 9 | ''' 10 | 11 | self.registration = ['*'] 12 | self.priority = 10 13 | 14 | def onMessage(self, message, metadata): 15 | 16 | if 'eventid' not in message: 17 | message['eventid']=str(uuid.uuid4()) 18 | 19 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/generate_partitions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import time 4 | import logging, logging.config 5 | from utils.dotdict import DotDict 6 | from utils.dates import get_date_parts 7 | import pyathena 8 | from pyathena import connect 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | def get_athena_query(config): 15 | ( 16 | hour, 17 | month, 18 | day, 19 | year, 20 | last_hour_hour, 21 | last_hour_month, 22 | last_hour_day, 23 | last_hour_year, 24 | ) = get_date_parts() 25 | query = f""" 26 | ALTER TABLE {config.athena_database}.{config.athena_table} 27 | ADD IF NOT EXISTS PARTITION 28 | (year='{year}', 29 | month='{month}', 30 | day='{day}', 31 | hour='{hour}' 32 | ) 33 | location 's3://{config.account}-defenda-data-lake-output-bucket/{year}/{month}/{day}/{hour}' 34 | """ 35 | return query 36 | 37 | 38 | def lambda_handler(event, context): 39 | config = DotDict({}) 40 | config.account = boto3.client("sts").get_caller_identity().get("Account") 41 | config.athena_workgroup = os.environ.get("ATHENA_WORKGROUP", "defenda_data_lake") 42 | config.athena_database = os.environ.get("ATHENA_DATABASE", "defenda_data_lake") 43 | config.athena_table = os.environ.get("ATHENA_TABLE", "events") 44 | 45 | # query status/wait for response 46 | 47 | athena_query = get_athena_query(config) 48 | logger.debug(athena_query) 49 | cursor = connect(work_group=config.athena_workgroup).cursor() 50 | cursor.execute(athena_query) 51 | logger.debug("Query finished: {}".format(cursor.state)) 52 | return 53 | -------------------------------------------------------------------------------- /lambdas/normalization_plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/normalization_plugins/__init__.py -------------------------------------------------------------------------------- /lambdas/normalization_plugins/event_shell.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, merge 2 | from utils.dates import utcnow 3 | 4 | 5 | class message(object): 6 | def __init__(self): 7 | """ 8 | takes an incoming message 9 | and ensures it matches our event shell structure 10 | """ 11 | 12 | self.registration = ["*"] 13 | self.priority = 2 14 | 15 | def 
onMessage(self, message, metadata): 16 | # our target shell 17 | event_shell = { 18 | "utctimestamp": utcnow().isoformat(), 19 | "severity": "INFO", 20 | "summary": "UNKNOWN", 21 | "category": "UNKNOWN", 22 | "source": "UNKNOWN", 23 | "tags": [], 24 | "plugins": [], 25 | "details": {}, 26 | } 27 | # maybe the shell elements are already there? 28 | event_set = set(enum_keys(event_shell)) 29 | message_set = set(enum_keys(message)) 30 | if not event_set.issubset(message_set): 31 | # we have work to do 32 | # merge the dicts letting any message values win 33 | # if the message lacks any keys, our shell values win 34 | message = merge(event_shell, message) 35 | 36 | # move any non shell keys to 'details' 37 | for item in message_set: 38 | # enum_keys traverses sub dicts, we only move the top level 39 | # so check if the key is note a core element 40 | # present in the top level and move it to details 41 | if item not in event_shell and item in message: 42 | message["details"][item] = message.get(item) 43 | del message[item] 44 | 45 | return (message, metadata) 46 | -------------------------------------------------------------------------------- /lambdas/normalization_plugins/gsuite_login.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import sub_dict, enum_keys, dict_match 2 | from utils.dotdict import DotDict 3 | from utils.dates import toUTC 4 | import chevron 5 | 6 | class message(object): 7 | 8 | def __init__(self): 9 | ''' 10 | handle gsuite login activity record 11 | ''' 12 | 13 | self.registration = ['kind'] 14 | self.priority = 20 15 | 16 | def onMessage(self, message, metadata): 17 | # for convenience, make a dot dict version of the message 18 | dot_message=DotDict(message) 19 | 20 | # double check that this is our target message 21 | if 'admin#reports#activity' not in dot_message.get('details.kind','')\ 22 | or 'id' not in message.get('details','') \ 23 | or 'etag' not in message.get('details',''): 24 | return(message, metadata) 25 | 26 | message["source"]="gsuite" 27 | message["tags"].append("gsuite") 28 | 29 | # clean up ipaddress field 30 | if 'ipaddress' in message['details']: 31 | message['details']['sourceipaddress']=message['details']['ipaddress'] 32 | del message['details']['ipaddress'] 33 | 34 | # set the actual time 35 | if dot_message.get("details.id.time",None): 36 | message['utctimestamp']=toUTC(message['details']['id']['time']).isoformat() 37 | 38 | # set the user_name 39 | if dot_message.get("details.actor.email",None): 40 | message["details"]["user"]=dot_message.get("details.actor.email","") 41 | 42 | # set summary 43 | message["summary"]=chevron.render("{{details.user}} {{details.events.0.name}} from IP {{details.sourceipaddress}}",message) 44 | 45 | 46 | # set category 47 | message['category']="authentication" 48 | 49 | #success/failure 50 | if 'fail' in message["summary"]: 51 | message["details"]["success"]=False 52 | if 'success' in message["summary"]: 53 | message["details"]["success"]=True 54 | 55 | #suspicious? 
56 | suspicious={"boolvalue":True,"name":"is_suspicious"} 57 | for e in dot_message.get("details.events",[]): 58 | for p in e.get("parameters",[]): 59 | if dict_match(suspicious,p): 60 | message["details"]["suspicious"]=True 61 | 62 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/ip_addresses.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, getValueByPath, find_keys 2 | from utils.dotdict import DotDict 3 | from utils.helpers import is_ip 4 | 5 | 6 | class message(object): 7 | def __init__(self): 8 | """ 9 | takes an incoming message 10 | discovers ip addresses and 11 | normalizes the field names (source/destination) 12 | """ 13 | 14 | self.registration = ["*"] 15 | self.priority = 20 16 | 17 | def onMessage(self, message, metadata): 18 | # help ourselves to a dot dict and list of keys 19 | message = DotDict(message) 20 | message_keys = list(enum_keys(message)) 21 | 22 | # all the ips we encounter along the way 23 | all_ips = [] 24 | 25 | # search for source ip address 26 | # likely places for a source IP 27 | likely_source_fields = [ 28 | "src", 29 | "srcaddr", 30 | "srcip", 31 | "src_ip", 32 | "source_ip", 33 | "sourceipaddress", 34 | "source_ip_address", 35 | "c-ip", 36 | "clientip", 37 | "remoteip", 38 | "remote_ip", 39 | "remoteaddr", 40 | "remote_host_ip_address", 41 | "ipaddress", 42 | "ip_address", 43 | "ipaddr", 44 | "id_orig_h", 45 | "x-forwarded-for", 46 | "http-x-forwarded-for", 47 | ] 48 | 49 | likely_destination_fields = [ 50 | "dst", 51 | "dstip", 52 | "dst_ip", 53 | "dstaddr", 54 | "dest", 55 | "destaddr", 56 | "dest_ip", 57 | "destination_ip", 58 | "destinationipaddress", 59 | "destination_ip_address", 60 | "id_resp_h", 61 | "serverip", 62 | ] 63 | # lets find a source 64 | # first match wins 65 | try: 66 | for field in likely_source_fields: 67 | if field in message_keys: 68 | # do we already have one? 69 | if not getValueByPath(message, "details.sourceipaddress"): 70 | # search the message for any instance of this field 71 | # a list since it could appear multiple times 72 | source_ips = list(find_keys(message, field)) 73 | for ip in source_ips: 74 | if "," in ip: 75 | # some fields like x-forwarded can include multiple IPs 76 | # get the first one 77 | ip = ip.split(",")[0].strip() 78 | if is_ip(ip): 79 | message.details.sourceipaddress = ip 80 | # first one wins 81 | # raise an error to break both for loops 82 | raise StopIteration 83 | except StopIteration: 84 | pass 85 | 86 | # harvest the result or existing source ip 87 | source_ip_address = getValueByPath(message, "details.sourceipaddress") 88 | if source_ip_address: 89 | if is_ip(source_ip_address): 90 | all_ips.append(source_ip_address) 91 | else: 92 | # hrm, there's an entry here that's not an ip 93 | # sometimes cloudtrail does this (config.amazonaws.com ) 94 | # and also sets a useragent field to the same 95 | if getValueByPath(message, "details.sourceipaddress") == getValueByPath( 96 | message, "details.useragent" 97 | ): 98 | del message.details.sourceipaddress 99 | 100 | # lets find a destination 101 | # first match wins 102 | try: 103 | for field in likely_destination_fields: 104 | if field in message_keys: 105 | # do we already have one? 
106 | if not getValueByPath(message, "details.destinationipaddress"): 107 | # search the message for any instance of this field 108 | # a list since it could appear multiple times 109 | destination_ips = list(find_keys(message, field)) 110 | for ip in destination_ips: 111 | if is_ip(ip): 112 | message.details.destinationipaddress = ip 113 | # first one wins 114 | # raise an error to break both for loops 115 | raise StopIteration 116 | except StopIteration: 117 | pass 118 | 119 | # harvest the result or existing destination ip 120 | destination_ip_address = getValueByPath(message, "details.destinationipaddress") 121 | if destination_ip_address and is_ip(destination_ip_address): 122 | all_ips.append(destination_ip_address) 123 | 124 | # save all the ips we found along the way 125 | # in details._ipaddresses as a list 126 | if all_ips: 127 | if not getValueByPath(message, "details._ipaddresses"): 128 | message.details._ipaddresses = all_ips 129 | else: 130 | if isinstance(message.details._ipaddresses, list): 131 | for ip in all_ips: 132 | if ip not in message.details._ipaddresses: 133 | message.details._ipaddresses.append(ip) 134 | 135 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/lowercase_keys.py: -------------------------------------------------------------------------------- 1 | class message(object): 2 | 3 | def __init__(self): 4 | ''' 5 | takes an incoming message 6 | and sets the keys to lowercase 7 | ''' 8 | 9 | self.registration = ['*'] 10 | self.priority = 1 11 | 12 | def onMessage(self, message, metadata): 13 | def lower_key(in_dict): 14 | if isinstance(in_dict,dict): 15 | out_dict = {} 16 | for key, item in in_dict.items(): 17 | out_dict[key.lower()] = lower_key(item) 18 | return out_dict 19 | elif isinstance(in_dict,list): 20 | return [lower_key(obj) for obj in in_dict] 21 | else: 22 | return in_dict 23 | 24 | message = lower_key(message) 25 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/timestamps.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, getValueByPath, find_keys 2 | from utils.dotdict import DotDict 3 | from utils.dates import toUTC, utcnow 4 | from datetime import datetime 5 | import logging 6 | 7 | logger = logging.getLogger() 8 | 9 | # likely timestamp fields 10 | likely_timestamp_fields = [ 11 | "timestamp", 12 | "@timestamp", 13 | "time", 14 | "eventtime", 15 | "start", 16 | ] 17 | 18 | 19 | class message(object): 20 | def __init__(self): 21 | """ 22 | takes an incoming message 23 | discovers timestamps 24 | normalizes the format and updates utctimestamp 25 | appends _utcprocessedtimestamp 26 | """ 27 | 28 | # register for all events 29 | # so we can add the processed timestamp metadata field 30 | self.registration = ["*"] 31 | self.priority = 20 32 | 33 | def onMessage(self, message, metadata): 34 | # help ourselves to a dot dict and list of keys 35 | message = DotDict(message) 36 | message_keys = list(enum_keys(message)) 37 | 38 | try: 39 | for field in likely_timestamp_fields: 40 | if field in message_keys: 41 | timestamps = list(find_keys(message, field)) 42 | if field == "time" and "date" in message_keys: 43 | # combine date and time for a timestamp 44 | dates = list(find_keys(message, "date")) 45 | if dates: 46 | # setup a new list for the zipped results 47 | date_timestamps = [] 48 | for i in 
zip(dates, timestamps): 49 | date_timestamps.append(f"{i[0]} {i[1]}") 50 | 51 | if date_timestamps: 52 | # replace the original list 53 | # with this list of date + time 54 | timestamps = date_timestamps 55 | 56 | for timestamp in timestamps: 57 | utctimestamp = "" 58 | try: 59 | utctimestamp = toUTC(timestamp) 60 | except Exception as e: 61 | logger.error( 62 | f"exception {e} while converting {timestamp} to utc" 63 | ) 64 | pass 65 | if isinstance(utctimestamp, datetime): 66 | message["utctimestamp"] = utctimestamp.isoformat() 67 | # first match wins 68 | raise StopIteration 69 | 70 | except StopIteration: 71 | pass 72 | 73 | # append processed timestamp as metadata 74 | message["details"]["_utcprocessedtimestamp"] = utcnow().isoformat() 75 | 76 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/processor.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import base64 4 | import json 5 | from json import JSONDecodeError 6 | from io import StringIO 7 | from utils.dotdict import DotDict 8 | from utils.plugins import send_event_to_plugins, register_plugins 9 | from utils.helpers import is_cloudtrail, generate_metadata, emit_json_block, chunks 10 | from utils.dict_helpers import merge 11 | import logging 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | def lambda_handler(event, context): 18 | output = [] 19 | metadata = generate_metadata(context) 20 | logger.debug(f"metadata is: {metadata}") 21 | normalization_plugins = register_plugins("normalization_plugins") 22 | enrichment_plugins = register_plugins("enrichment_plugins") 23 | 24 | if "records" in event: 25 | for record in event["records"]: 26 | output_record = {} 27 | logger.debug(f"found record in event: {record}") 28 | payload = base64.b64decode(record["data"]) 29 | 30 | payload_dict = None 31 | try: 32 | # load the json we have from either a .json file or a gunziped file 33 | payload_dict = json.loads(payload) 34 | except JSONDecodeError as e: 35 | # file isn't well formed json, see if we can interpret json from it 36 | logger.error(f"payload is not valid json decode error {e}") 37 | 38 | if payload_dict: 39 | # normalize it 40 | result_record, metadata = send_event_to_plugins( 41 | payload_dict, metadata, normalization_plugins 42 | ) 43 | # enrich it 44 | result_record, metadata = send_event_to_plugins( 45 | result_record, metadata, enrichment_plugins 46 | ) 47 | if result_record: 48 | # TODO, what to do with lambda info as metadata? Do we care? 
49 | # result_record = merge(result_record, metadata) 50 | logger.debug(f" resulting norm/enriched is: {result_record}") 51 | # json ending in new line so athena recognizes the records 52 | output_record = { 53 | "recordId": record["recordId"], 54 | "result": "Ok", 55 | "data": base64.b64encode( 56 | json.dumps(result_record).encode("utf-8") + b"\n" 57 | ).decode("utf-8"), 58 | } 59 | else: 60 | # result as None, means drop the record 61 | # TODO, what is the right result in firehose terms 62 | logger.error(f"record {record['recordId']} failed processing") 63 | output_record = { 64 | "recordId": record["recordId"], 65 | "result": "ProcessingFailed", 66 | "data": record["data"], 67 | } 68 | else: 69 | logger.error( 70 | f"record {record['recordId']} failed processing, no resulting dict" 71 | ) 72 | output_record = { 73 | "recordId": record["recordId"], 74 | "result": "ProcessingFailed", 75 | "data": record["data"], 76 | } 77 | 78 | output.append(output_record) 79 | 80 | logger.info("Processed {} records.".format(len(event["records"]))) 81 | 82 | return {"records": output} 83 | else: 84 | logger.info(f"no records found in {event} with context: {context}") 85 | 86 | -------------------------------------------------------------------------------- /lambdas/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | astroid==2.4.2 3 | attrs==20.1.0 4 | black==20.8b1 5 | boto3==1.14.48 6 | botocore==1.17.48 7 | certifi==2020.6.20 8 | chardet==3.0.4 9 | chevron @ https://github.com/noahmorrison/chevron/archive/master.zip 10 | click==7.1.2 11 | docker==4.3.1 12 | docutils==0.15.2 13 | future==0.18.2 14 | idna==2.10 15 | iniconfig==1.0.1 16 | isort==5.4.2 17 | jmespath==0.10.0 18 | lazy-object-proxy==1.4.3 19 | mccabe==0.6.1 20 | more-itertools==8.4.0 21 | mypy-extensions==0.4.3 22 | netaddr==0.8.0 23 | numpy==1.19.1 24 | packaging==20.4 25 | pandas==1.1.1 26 | pathspec==0.8.0 27 | pluggy==0.13.1 28 | py==1.9.0 29 | PyAthena==1.11.1 30 | pylint==2.6.0 31 | pynsive==0.2.7 32 | pyparsing==3.0.0a2 33 | pytest==6.0.1 34 | python-dateutil==2.8.1 35 | pytz==2020.1 36 | PyYAML==5.3.1 37 | regex==2020.7.14 38 | requests==2.24.0 39 | s3transfer==0.3.3 40 | six==1.15.0 41 | tenacity==6.2.0 42 | toml==0.10.1 43 | typed-ast==1.4.1 44 | typing-extensions==3.7.4.3 45 | tzlocal==2.1 46 | urllib3==1.25.10 47 | websocket-client==0.57.0 48 | wrapt==1.12.1 49 | zipp==3.1.0 50 | -------------------------------------------------------------------------------- /lambdas/s3_to_firehose.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | from time import sleep 7 | from io import BytesIO, TextIOWrapper, StringIO 8 | from utils.dotdict import DotDict 9 | from utils.helpers import is_cloudtrail, generate_metadata, emit_json_block, chunks 10 | from json import JSONDecodeError 11 | 12 | logger = logging.getLogger() 13 | logger.setLevel(logging.INFO) 14 | FIREHOSE_DELIVERY_STREAM = os.environ.get( 15 | "FIREHOSE_DELIVERY_STREAM", "defenda_data_lake_s3_stream" 16 | ) 17 | FIREHOSE_BATCH_SIZE = os.environ.get("FIREHOSE_BATCH_SIZE", 100) 18 | 19 | 20 | def send_to_firehose(records): 21 | f_hose = boto3.client("firehose") 22 | 23 | # records should be a list of dicts 24 | response = None 25 | if type(records) is list: 26 | # batch up the list below the limits of firehose 27 | for batch in chunks(records, FIREHOSE_BATCH_SIZE): 28 | response = 
f_hose.put_record_batch( 29 | DeliveryStreamName=FIREHOSE_DELIVERY_STREAM, 30 | Records=[ 31 | {"Data": bytes(str(json.dumps(record) + "\n").encode("UTF-8"))} 32 | for record in batch 33 | ], 34 | ) 35 | logger.debug("firehose response is: {}".format(response)) 36 | 37 | 38 | def lambda_handler(event, context): 39 | """ 40 | Called on a PUT to s3 41 | Make every attempt to read in json records 42 | from the s3 source 43 | """ 44 | metadata = generate_metadata(context) 45 | logger.debug("Event is: {}".format(event)) 46 | 47 | # make the event easier to traverse 48 | event = DotDict(event) 49 | 50 | # test harnesses 51 | if event == {"test": "true"}: 52 | return {"Hello": "from s3_to_firehose"} 53 | elif event == {"metadata": "name"}: 54 | return metadata 55 | elif "Records" in event: 56 | # should be triggered by s3 Put/Object created events 57 | s3 = boto3.client("s3") 58 | for record in event.Records: 59 | record = DotDict(record) 60 | s3_bucket = record.s3.bucket.name 61 | s3_key = record.s3.object.key 62 | # a new bucket will fire for folders *and* files, early exit if it's a folder 63 | if s3_key.endswith("/"): 64 | continue 65 | # assume the file is just good ol json 66 | source = "s3json" 67 | # if the file name is cloudtrail-ish 68 | if is_cloudtrail(s3_key): 69 | source = "cloudtrail" 70 | # up to 5 attempts to get the object ( in case s3 file commit on write is lagging) 71 | s3_response = None 72 | for x in range(1, 6): 73 | try: 74 | s3_response = s3.get_object(Bucket=s3_bucket, Key=s3_key) 75 | break 76 | except Exception as e: 77 | logger.error( 78 | f"Attempt {x}: {e} while attempting to get_object {s3_bucket} {s3_key}" 79 | ) 80 | sleep(1) 81 | continue 82 | if not s3_response: 83 | logger.error( 84 | f"5 attempts to retrieve {s3_bucket} {s3_key} failed, moving on" 85 | ) 86 | continue 87 | s3_data = "" 88 | # gunzip if zipped 89 | if s3_key[-3:] == ".gz": 90 | s3_raw_data = s3_response["Body"].read() 91 | with gzip.GzipFile(fileobj=BytesIO(s3_raw_data)) as gzip_stream: 92 | s3_data += "".join(TextIOWrapper(gzip_stream, encoding="utf-8")) 93 | else: 94 | s3_data = s3_response["Body"].read().decode("utf-8") 95 | 96 | # create our list of records to append out findings to 97 | s3_records = [] 98 | s3_dict = None 99 | try: 100 | # load the json we have from either a .json file or a gunziped file 101 | s3_dict = json.loads(s3_data) 102 | except JSONDecodeError: 103 | # file isn't well formed json, see if we can interpret json from it 104 | for block in emit_json_block(StringIO(s3_data)): 105 | if block: 106 | record = json.loads(block) 107 | record["source"] = source 108 | s3_records.append(record) 109 | # if this is a dict of a single 'Records' list, unroll the list into 110 | # it's sub records 111 | if s3_dict and "Records" in s3_dict: 112 | if type(s3_dict["Records"]) is list: 113 | for record in s3_dict["Records"]: 114 | record["source"] = source 115 | s3_records.append(record) 116 | # maybe it's just a list already? 
117 | elif s3_dict and type(s3_dict) is list: 118 | # a list of dicts 119 | for record in s3_dict: 120 | record["source"] = source 121 | s3_records.append(record) 122 | elif s3_dict and type(s3_dict) is dict: 123 | # a single dict, but lets add it to a list 124 | # for consistent handling 125 | s3_dict["source"] = source 126 | s3_records.append(s3_dict) 127 | 128 | logger.debug("pre-plugins s3_records is: {}".format(s3_records)) 129 | # send off to firehose for further processing 130 | if s3_records: 131 | send_to_firehose(s3_records) 132 | 133 | return 134 | -------------------------------------------------------------------------------- /lambdas/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/tests/__init__.py -------------------------------------------------------------------------------- /lambdas/tests/logging_config.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | simple: 4 | format: '%(asctime)s - %(module)s - %(levelname)s - %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S %Z' 6 | handlers: 7 | console: 8 | class: logging.StreamHandler 9 | level: INFO 10 | formatter: simple 11 | stream: ext://sys.stdout 12 | loggers: 13 | sampleLogger: 14 | level: DEBUG 15 | handlers: [console] 16 | propagate: no 17 | root: 18 | level: INFO 19 | handlers: [console] -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_cloudfront_wordpress_probe.json: -------------------------------------------------------------------------------- 1 | { 2 | "date": "2020-09-01", 3 | "time": "17:48:18", 4 | "x-edge-location": "MAA50-C1", 5 | "sc-bytes": 587, 6 | "c-ip": "139.59.66.23", 7 | "cs-method": "GET", 8 | "cs(Host)": "c4ixl1pp8t7hvm.cloudfront.net", 9 | "cs-uri-stem": "/wp-login.php", 10 | "sc-status": 301, 11 | "cs(Referer)": "-", 12 | "cs(User-Agent)": "Mozilla/5.0%20(X11;%20Ubuntu;%20Linux%20x86_64;%20rv:62.0)%20Gecko/20100101%20Firefox/62.0", 13 | "cs-uri-query": "-", 14 | "cs(Cookie)": "-", 15 | "x-edge-result-type": "Redirect", 16 | "x-edge-request-id": "JvTXFvqgmlUuUYTFWBShrvVZiMTWLZRsuMcaOx39DLmthLmoNaijww==", 17 | "x-host-header": "somewhere.com", 18 | "cs-protocol": "http", 19 | "cs-bytes": 184, 20 | "time-taken": 0.0, 21 | "x-forwarded-for": "-", 22 | "ssl-protocol": "-", 23 | "ssl-cipher": "-", 24 | "x-edge-response-result-type": "Redirect", 25 | "cs-protocol-version": "HTTP/1.1", 26 | "fle-status": "-", 27 | "fle-encrypted-fields": "-", 28 | "c-port": 44276, 29 | "time-to-first-byte": 0.0, 30 | "x-edge-detailed-result-type": "Redirect", 31 | "sc-content-type": "text/html", 32 | "sc-content-len": 183, 33 | "sc-range-start": "-", 34 | "sc-range-end": "-" 35 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_cloudtrail_create_log_stream.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": "cloudtrail", 3 | "tags": [], 4 | "details": { 5 | "apiversion": "20140328", 6 | "eventtype": "AwsApiCall", 7 | "recipientaccountid": "123456789012", 8 | "responseelements": null, 9 | "requestparameters": { 10 | "loggroupname": "/aws/lambda/some_lambda", 11 | "logstreamname": "2019/09/04/[$LATEST]1759dcd0266b4e28a55147e10c28e984" 12 | }, 13 | "eventid": "2163d086-baa4-4203-a267-97b3a872c651", 14 | "eventsource": 
"logs.amazonaws.com", 15 | "useragent": "awslambda-worker", 16 | "eventname": "CreateLogStream", 17 | "eventversion": "1.05", 18 | "sourceipaddress": "54.21.12.27", 19 | "requestid": "0fb46c32-fd7c-4121-8eb7-7fa10670bc4b", 20 | "useridentity": { 21 | "type": "AssumedRole", 22 | "principalid": "AROAIQ45SXVRIH72NM:some_lambda", 23 | "arn": "arn:aws:sts::123456789012:assumed-role/some_lambda-us-west-2-lambdaRole/some_lambda", 24 | "accountid": "123456789012", 25 | "accesskeyid": "AROAIQ45SXVRIH72NM", 26 | "sessioncontext": { 27 | "attributes": { 28 | "mfaauthenticated": "false", 29 | "creationdate": "2019-09-04T17:01:34Z" 30 | }, 31 | "sessionissuer": { 32 | "type": "Role", 33 | "principalid": "AROAIQ45SXVRIH72NM", 34 | "arn": "arn:aws:iam::123456789012:role/some_lambda-us-west-2-lambdaRole", 35 | "accountid": "123456789012", 36 | "username": "some_lambda-us-west-2-lambdaRole" 37 | } 38 | } 39 | }, 40 | "lambda_details": { 41 | "function_version": "$LATEST", 42 | "function_arn": "arn:aws:lambda:us-west-2:123456789012:function:some_lambda", 43 | "function_name": "some_lambda", 44 | "memory_size": "1024" 45 | }, 46 | "awsregion": "us-west-2", 47 | "eventtime": "2019-09-04T17:54:59Z" 48 | } 49 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_gsuite_login_event.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "admin#reports#activity", 3 | "id": { 4 | "time": "2020-03-02T17:54:33.253Z", 5 | "uniqueQualifier": "123456193837", 6 | "applicationName": "login", 7 | "customerId": "123456bbh" 8 | }, 9 | "etag": "\"12345684sebSczDxOtZ17CIssbQ/fcUkSWOHV-mPDcYGbkgHvS5ghwg\"", 10 | "actor": { 11 | "email": "someone@somewhere.com", 12 | "profileId": "123456359252796690369" 13 | }, 14 | "ipAddress": "123.456.253.226", 15 | "events": [ 16 | { 17 | "type": "login", 18 | "name": "login_success", 19 | "parameters": [ 20 | { 21 | "name": "login_type", 22 | "value": "exchange" 23 | }, 24 | { 25 | "name": "login_challenge_method", 26 | "multiValue": [ 27 | "none" 28 | ] 29 | }, 30 | { 31 | "name": "is_suspicious", 32 | "boolValue": false 33 | } 34 | ] 35 | } 36 | ] 37 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_syslog_sudo.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "monitoring", 3 | "severity": "INFO", 4 | "utctimestamp": "2014-04-17T06:10:54+00:00", 5 | "summary": " nagios : TTY=unknown ; PWD=/ ; USER=root ; COMMAND=/usr/lib64/nagios/plugins/custom/check_auditd.sh\n", 6 | "source": "syslog", 7 | "tags": [ 8 | "sample" 9 | ], 10 | "details": { 11 | "processid": "123", 12 | "program": "sudo", 13 | "hostname": "something.example.com", 14 | "timestamp": "Apr 17 06:10:54" 15 | } 16 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_vpc_flow_log.json: -------------------------------------------------------------------------------- 1 | { 2 | "account_id": "123456789010", 3 | "action": "ACCEPT", 4 | "bytes": 840, 5 | "dstaddr": "192.0.2.1", 6 | "dstport": 49152, 7 | "end": "2014-12-14T04:07:50", 8 | "interface_id": "eni-102010ab", 9 | "log_status": "OK", 10 | "packets": 10, 11 | "protocol": 6, 12 | "srcaddr": "198.51.100.1", 13 | "srcport": 443, 14 | "start": "2014-12-14T04:06:50", 15 | "version": 2 16 | } -------------------------------------------------------------------------------- 
/lambdas/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import BytesIO 3 | from subprocess import PIPE, Popen 4 | from pkg_resources import parse_version 5 | import pytest 6 | import yaml 7 | from datetime import timezone 8 | import datetime 9 | from utils.plugins import send_event_to_plugins, register_plugins 10 | from utils.helpers import is_cloudtrail, generate_metadata, short_uuid 11 | from utils.helpers import is_ip, isIPv4, isIPv6 12 | from utils.dict_helpers import ( 13 | merge, 14 | find_keys, 15 | enum_values, 16 | enum_keys, 17 | sub_dict, 18 | dict_match, 19 | getValueByPath, 20 | dictpath, 21 | ) 22 | from utils.dotdict import DotDict 23 | from utils.dates import toUTC, get_date_parts 24 | from pathlib import Path 25 | import logging, logging.config 26 | 27 | print("setting up logging") 28 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 29 | with open(logging_config_file_path, "r") as fd: 30 | logging_config = yaml.safe_load(fd) 31 | logging.config.dictConfig(logging_config) 32 | global logger 33 | logger = logging.getLogger() 34 | logger.info("logging established") 35 | 36 | 37 | class TestCore(object): 38 | def test_cloudtrail_file_identification(self): 39 | filename = "AWSLogs/722455710680/CloudTrail/us-west-2/2019/09/20/722455710680_CloudTrail_us-west-2_20190920T0000Z_2AKE4AyQfcPRcIoa.json.gz" 40 | assert is_cloudtrail(filename) == True 41 | filename = "not cloudtrailfile.json.gz" 42 | assert is_cloudtrail(filename) == False 43 | 44 | def test_lambda_metadata_generation(self): 45 | lambda_context = { 46 | "function_version": "$LATEST", 47 | "invoked_function_arn": "arn:aws:lambda:us-west-2:722455710680:function:processor-prod", 48 | "function_name": "processor-prod", 49 | "memory_limit_in_mb": "1024", 50 | } 51 | lambda_context = DotDict(lambda_context) 52 | result = generate_metadata(lambda_context) 53 | assert type(result.lambda_details) == type(lambda_context) 54 | assert "function_version" in result.lambda_details 55 | assert "function_arn" in result.lambda_details 56 | assert "function_name" in result.lambda_details 57 | assert "memory_size" in result.lambda_details 58 | 59 | def test_short_uuid(self): 60 | assert len(short_uuid()) == 8 61 | 62 | def test_to_utc(self): 63 | assert toUTC("Jan 1 12am 2020 UTC") == datetime.datetime( 64 | 2020, 1, 1, 0, 0, tzinfo=timezone.utc 65 | ) 66 | assert toUTC("Jan 1 12am 2020 UTC").isoformat() == "2020-01-01T00:00:00+00:00" 67 | 68 | def test_get_date_parts(self): 69 | parts = get_date_parts() 70 | assert len(parts) == 8 71 | 72 | def test_dictpath(self): 73 | assert list(dictpath("key.value")) == ["key", "value"] 74 | 75 | def test_get_value_by_path(self): 76 | assert getValueByPath({"key": "value"}, "key") == "value" 77 | assert getValueByPath({"key": {"key": "value"}}, "key.key") == "value" 78 | assert ( 79 | getValueByPath({"key": {"key": {"key": "value"}}}, "key.key.key") == "value" 80 | ) 81 | 82 | def test_ip_helpers(self): 83 | assert is_ip("127.0.0.1") 84 | assert is_ip("127.0.0.1/32") 85 | assert is_ip("127") == False 86 | assert is_ip("1") == False 87 | assert is_ip("1278.1.1.1.1") == False 88 | assert is_ip("fe80::") 89 | assert is_ip("fe80::/10") 90 | assert isIPv4("127.0.0.1") 91 | assert isIPv4("127.0.0.1/32") == False 92 | assert isIPv6("fe80::") 93 | assert isIPv6("::ffff:192.0.2.15") 94 | assert isIPv6(":ffff:192.0.2.15") == False 95 | 96 | def test_merge(self): 97 | dict1 = {"some_key": "some 
value"} 98 | dict2 = {"some_other_key": "some other value"} 99 | dict3 = merge(dict1, dict2) 100 | assert dict3 == {"some_key": "some value", "some_other_key": "some other value"} 101 | 102 | def test_find_keys(self): 103 | complex_dict1 = { 104 | "some_key": "some value", 105 | "sub_key": {"some_key": "some other value"}, 106 | } 107 | result = list(find_keys(complex_dict1, "some_key")) 108 | assert result == ["some value", "some other value"] 109 | 110 | def test_enum_values(self): 111 | complex_dict1 = { 112 | "some_key": "some value", 113 | "sub_key": {"some_key": "some other value"}, 114 | } 115 | result = list(enum_values(complex_dict1)) 116 | assert result == ["some value", "some other value"] 117 | 118 | def test_enum_keys(self): 119 | complex_dict1 = { 120 | "some_key": "some value", 121 | "sub_key": {"some_key": "some other value"}, 122 | } 123 | result = list(enum_keys(complex_dict1)) 124 | assert result == ["some_key", "sub_key", "some_key"] 125 | 126 | def test_sub_dict(self): 127 | complex_dict1 = { 128 | "some_key": "some value", 129 | "sub_key": {"some_key": "some other value"}, 130 | } 131 | result = sub_dict(complex_dict1, ["some_key"], "nothing") 132 | assert result == {"some_key": "some value"} 133 | result = sub_dict(complex_dict1, ["sub_key.some_key"], "nothing") 134 | assert result == {"sub_key.some_key": "nothing"} 135 | complex_dot_dict = DotDict(complex_dict1) 136 | result = sub_dict(complex_dot_dict, ["sub_key.some_key"], "nothing") 137 | assert result == {"sub_key.some_key": "some other value"} 138 | result = sub_dict(complex_dot_dict, ["some_key", "sub_key.some_key"]) 139 | assert result == { 140 | "some_key": "some value", 141 | "sub_key.some_key": "some other value", 142 | } 143 | 144 | def test_dict_match(self): 145 | complex_dict1 = { 146 | "some_key": "some value", 147 | "sub_key": {"some_key": "some other value"}, 148 | } 149 | assert dict_match({"some_key": "some value"}, complex_dict1) 150 | complex_dot_dict = DotDict(complex_dict1) 151 | assert dict_match({"sub_key.some_key": "some other value"}, complex_dot_dict) 152 | assert ( 153 | dict_match({"sub_key.some_key": "not some other value"}, complex_dot_dict) 154 | == False 155 | ) 156 | -------------------------------------------------------------------------------- /lambdas/tests/test_plugin_gsuite_logins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | 10 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 11 | with open(logging_config_file_path, "r") as fd: 12 | logging_config = yaml.safe_load(fd) 13 | logging.config.dictConfig(logging_config) 14 | global logger 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestPluginGsuiteLogins(object): 19 | def setup(self): 20 | from normalization_plugins.gsuite_login import message 21 | 22 | self.plugin = message() 23 | with open("./lambdas/tests/samples/sample_gsuite_login_event.json", "r") as f: 24 | self.inbound_event = json.loads(f.read()) 25 | # run the event through default plugins 26 | # to set the shell and lowercase all keys 27 | from normalization_plugins.event_shell import message as event_shell 28 | from normalization_plugins.lowercase_keys import message as lowercase_keys 29 | 30 | metadata = {"something": "else"} 31 | event = self.inbound_event 32 | event, metadata = 
event_shell().onMessage(event, metadata) 33 | event, metadata = lowercase_keys().onMessage(event, metadata) 34 | self.normalized_event = event 35 | 36 | def test_nochange(self): 37 | metadata = {"something": "else"} 38 | # use the native raw event 39 | event = self.inbound_event 40 | result, metadata = self.plugin.onMessage(event, metadata) 41 | # in = out - plugin didn't modify it 42 | # since it doesn't match the normalized format 43 | assert result == event 44 | 45 | def test_structure(self): 46 | metadata = {"something": "else"} 47 | # use the normalized event 48 | event = self.normalized_event 49 | result, metadata = self.plugin.onMessage(event, metadata) 50 | assert "utctimestamp" in result 51 | assert "severity" in result 52 | assert "summary" in result 53 | assert "category" in result 54 | assert "source" in result 55 | assert "tags" in result 56 | assert "plugins" in result 57 | assert "details" in result 58 | 59 | def test_values(self): 60 | metadata = {"something": "else"} 61 | # use the normalized event 62 | event = self.normalized_event 63 | result, metadata = self.plugin.onMessage(event, metadata) 64 | logger.debug(result) 65 | assert ( 66 | result["summary"] 67 | == "someone@somewhere.com login_success from IP 123.456.253.226" 68 | ) 69 | assert result["details"]["sourceipaddress"] == "123.456.253.226" 70 | assert result["category"] == "authentication" 71 | assert result["source"] == "gsuite" 72 | assert result["details"]["success"] == True -------------------------------------------------------------------------------- /lambdas/tests/test_plugin_ip_addresses.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | 10 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 11 | with open(logging_config_file_path, "r") as fd: 12 | logging_config = yaml.safe_load(fd) 13 | logging.config.dictConfig(logging_config) 14 | global logger 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestPluginIpAddresses(object): 19 | def setup(self): 20 | from normalization_plugins.ip_addresses import message 21 | 22 | self.plugin = message() 23 | self.inbound_events = [] 24 | self.normalized_events = [] 25 | with open( 26 | "./lambdas/tests/samples/sample_cloudtrail_create_log_stream.json", "r" 27 | ) as f: 28 | self.inbound_events.append(json.loads(f.read())) 29 | with open( 30 | "./lambdas/tests/samples/sample_cloudfront_wordpress_probe.json", "r" 31 | ) as f: 32 | self.inbound_events.append(json.loads(f.read())) 33 | with open("./lambdas/tests/samples/sample_vpc_flow_log.json", "r") as f: 34 | self.inbound_events.append(json.loads(f.read())) 35 | # run the event through default plugins 36 | # to set the shell and lowercase all keys 37 | from normalization_plugins.event_shell import message as event_shell 38 | from normalization_plugins.lowercase_keys import message as lowercase_keys 39 | 40 | metadata = {"something": "else"} 41 | for event in self.inbound_events: 42 | event, metadata = event_shell().onMessage(event, metadata) 43 | event, metadata = lowercase_keys().onMessage(event, metadata) 44 | self.normalized_events.append(event) 45 | 46 | def test_nochange(self): 47 | metadata = {"something": "else"} 48 | event = {} 49 | # use an event without an ip 50 | # to test if the plugin is benign when it should not act 51 | with 
open("./lambdas/tests/samples/sample_syslog_sudo.json", "r") as f: 52 | event = json.loads(f.read()) 53 | # make sure we have a valid, populated dict 54 | assert len(event.keys()) 55 | result, metadata = self.plugin.onMessage(event, metadata) 56 | # in = out - plugin didn't modify it 57 | # since it doesn't match the normalized format 58 | # and won't find an ip field under 'details' 59 | 60 | assert result == event 61 | 62 | def test_structure(self): 63 | metadata = {"something": "else"} 64 | # use the normalized event 65 | for event in self.normalized_events: 66 | result, metadata = self.plugin.onMessage(event, metadata) 67 | assert "utctimestamp" in result 68 | assert "severity" in result 69 | assert "summary" in result 70 | assert "category" in result 71 | assert "source" in result 72 | assert "tags" in result 73 | assert "plugins" in result 74 | assert "details" in result 75 | # we should have a source or destination for these events 76 | assert ( 77 | "sourceipaddress" in result["details"] 78 | or "destinationipaddress" in result["details"] 79 | ) 80 | 81 | def test_values(self): 82 | metadata = {"something": "else"} 83 | # use normalized events 84 | # we know the end result for 85 | event = self.normalized_events[0] 86 | result, metadata = self.plugin.onMessage(event, metadata) 87 | logger.debug(result) 88 | assert result["details"]["sourceipaddress"] == "54.21.12.27" 89 | assert "54.21.12.27" in result["details"]["_ipaddresses"] 90 | 91 | event = self.normalized_events[1] 92 | result, metadata = self.plugin.onMessage(event, metadata) 93 | logger.debug(result) 94 | assert result["details"]["sourceipaddress"] == "139.59.66.23" 95 | assert "139.59.66.23" in result["details"]["_ipaddresses"] 96 | 97 | event = self.normalized_events[2] 98 | result, metadata = self.plugin.onMessage(event, metadata) 99 | logger.debug(result) 100 | assert result["details"]["sourceipaddress"] == "198.51.100.1" 101 | assert result["details"]["destinationipaddress"] == "192.0.2.1" 102 | assert "192.0.2.1" in result["details"]["_ipaddresses"] 103 | assert "198.51.100.1" in result["details"]["_ipaddresses"] 104 | 105 | def test_invalid_ip_values(self): 106 | """ 107 | purposefully invalidate IP addresses in ip address fields 108 | and make sure the plugin doesn't accept them 109 | """ 110 | metadata = {"something": "else"} 111 | # use normalized events 112 | # we know the end result for 113 | event = self.normalized_events[0] 114 | event["details"]["sourceipaddress"] = "nada" 115 | result, metadata = self.plugin.onMessage(event, metadata) 116 | logger.debug(result) 117 | assert result["details"]["sourceipaddress"] == "nada" 118 | assert "_ipaddresses" not in result["details"] 119 | 120 | event = self.normalized_events[1] 121 | event["details"]["c-ip"] = "1" 122 | result, metadata = self.plugin.onMessage(event, metadata) 123 | logger.debug(result) 124 | assert result["details"]["c-ip"] == "1" 125 | assert result["details"].get("sourceipaddress", None) == None 126 | assert "_ipaddresses" not in result["details"] 127 | 128 | event = self.normalized_events[2] 129 | event["details"]["srcaddr"] = "1320.2555.2555.2555" 130 | result, metadata = self.plugin.onMessage(event, metadata) 131 | logger.debug(result) 132 | assert result["details"]["srcaddr"] == "1320.2555.2555.2555" 133 | assert result["details"].get("sourceipaddress", None) == None 134 | assert "192.0.2.1" in result["details"]["_ipaddresses"] 135 | -------------------------------------------------------------------------------- 
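The ip_addresses plugin exercised above can also be invoked directly. A minimal sketch under the same assumptions as the tests (run from the repository root with lambdas/ on the import path); the addresses are the documentation-range ones from the VPC flow sample:

    from normalization_plugins.ip_addresses import message as ip_addresses

    metadata = {"something": "else"}
    event = {
        "details": {
            "sourceipaddress": "198.51.100.1",
            "destinationipaddress": "192.0.2.1",
        }
    }
    event, metadata = ip_addresses().onMessage(event, metadata)
    # valid source/destination addresses are harvested into details._ipaddresses,
    # so both 198.51.100.1 and 192.0.2.1 should now appear in that list
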
/lambdas/tests/test_plugin_timestamps.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | import tzlocal 10 | import os 11 | 12 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 13 | with open(logging_config_file_path, "r") as fd: 14 | logging_config = yaml.safe_load(fd) 15 | logging.config.dictConfig(logging_config) 16 | global logger 17 | logger = logging.getLogger() 18 | 19 | os.environ["TZ"] = "UTC" 20 | logger.info(f"using timezone {tzlocal.get_localzone()}") 21 | 22 | 23 | class TestPluginTimestamps(object): 24 | def setup(self): 25 | from normalization_plugins.timestamps import message 26 | 27 | self.plugin = message() 28 | self.inbound_events = [] 29 | self.normalized_events = [] 30 | 31 | with open( 32 | "./lambdas/tests/samples/sample_cloudtrail_create_log_stream.json", "r" 33 | ) as f: 34 | self.inbound_events.append(json.loads(f.read())) 35 | with open( 36 | "./lambdas/tests/samples/sample_cloudfront_wordpress_probe.json", "r" 37 | ) as f: 38 | self.inbound_events.append(json.loads(f.read())) 39 | with open("./lambdas/tests/samples/sample_vpc_flow_log.json", "r") as f: 40 | self.inbound_events.append(json.loads(f.read())) 41 | # run the event through default plugins 42 | # to set the shell and lowercase all keys 43 | from normalization_plugins.event_shell import message as event_shell 44 | from normalization_plugins.lowercase_keys import message as lowercase_keys 45 | 46 | metadata = {"something": "else"} 47 | for event in self.inbound_events: 48 | event, metadata = event_shell().onMessage(event, metadata) 49 | event, metadata = lowercase_keys().onMessage(event, metadata) 50 | self.normalized_events.append(event) 51 | 52 | def test_nochange(self): 53 | 54 | metadata = {"something": "else"} 55 | event = {} 56 | # use an event without an ip 57 | # to test if the plugin is benign when it should not act 58 | with open("./lambdas/tests/samples/sample_syslog_sudo.json", "r") as f: 59 | event = json.loads(f.read()) 60 | # make sure we have a valid, populated dict 61 | assert len(event.keys()) 62 | # remove the timstamp in this event 63 | # that would trigger the plugin 64 | # to see if it passes the no change test 65 | del event["details"]["timestamp"] 66 | result, metadata = self.plugin.onMessage(event, metadata) 67 | # the plugin adds a metadata field 68 | # assert that it worked 69 | assert result["details"]["_utcprocessedtimestamp"] 70 | 71 | # next, remove it for the 72 | # in = out - plugin didn't modify it 73 | # test 74 | del result["details"]["_utcprocessedtimestamp"] 75 | 76 | assert result == event 77 | 78 | def test_structure(self): 79 | metadata = {"something": "else"} 80 | # use the normalized event 81 | for event in self.normalized_events: 82 | result, metadata = self.plugin.onMessage(event, metadata) 83 | assert "severity" in result 84 | assert "summary" in result 85 | assert "category" in result 86 | assert "source" in result 87 | assert "tags" in result 88 | assert "plugins" in result 89 | assert "details" in result 90 | # we should have these valid timestamps 91 | assert "utctimestamp" in result 92 | assert "_utcprocessedtimestamp" in result["details"] 93 | 94 | def test_values(self): 95 | metadata = {"something": "else"} 96 | # use normalized events 97 | # we know the end result for 98 | event = 
self.normalized_events[0] 99 | result, metadata = self.plugin.onMessage(event, metadata) 100 | logger.debug(result) 101 | assert result["utctimestamp"] == "2019-09-04T17:54:59+00:00" 102 | assert result["details"]["_utcprocessedtimestamp"] 103 | 104 | event = self.normalized_events[1] 105 | result, metadata = self.plugin.onMessage(event, metadata) 106 | assert result["utctimestamp"] == "2020-09-01T17:48:18+00:00" 107 | assert result["details"]["_utcprocessedtimestamp"] 108 | 109 | event = self.normalized_events[2] 110 | result, metadata = self.plugin.onMessage(event, metadata) 111 | logger.debug(result) 112 | assert result["utctimestamp"] == "2014-12-14T04:06:50+00:00" 113 | assert result["details"]["_utcprocessedtimestamp"] 114 | 115 | def test_invalid_date_values(self): 116 | """ 117 | purposefully invalidate dates in date fields 118 | and make sure the plugin doesn't accept them 119 | """ 120 | metadata = {"something": "else"} 121 | # use normalized events 122 | # we know the end result for 123 | event = self.normalized_events[0] 124 | event["details"]["eventtime"] = "nada" 125 | result, metadata = self.plugin.onMessage(event, metadata) 126 | logger.debug(result) 127 | assert result["details"]["eventtime"] == "nada" 128 | 129 | event = self.normalized_events[1] 130 | event["details"]["time"] = "nada" 131 | result, metadata = self.plugin.onMessage(event, metadata) 132 | logger.debug(result) 133 | assert result["details"]["time"] == "nada" 134 | 135 | event = self.normalized_events[2] 136 | event["details"]["start"] = "nada" 137 | result, metadata = self.plugin.onMessage(event, metadata) 138 | logger.debug(result) 139 | assert result["details"]["start"] == "nada" -------------------------------------------------------------------------------- /lambdas/tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | from utils.dotdict import DotDict 6 | from utils.dates import toUTC 7 | 8 | 9 | class TestLowerCaseKeys(object): 10 | def setup(self): 11 | from normalization_plugins.lowercase_keys import message 12 | 13 | self.plugin = message() 14 | 15 | def test_nochange(self): 16 | metadata = {"something": "else"} 17 | event = {"key1": "syslog", "tags": ["atag"]} 18 | result, metadata = self.plugin.onMessage(event, metadata) 19 | # in = out - plugin didn't modify it 20 | assert result == event 21 | 22 | def test_lower_a_key(self): 23 | metadata = {"something": "else"} 24 | event = {"KEY1": "syslog", "tags": ["atag"]} 25 | expected = {"key1": "syslog", "tags": ["atag"]} 26 | result, metadata = self.plugin.onMessage(event, metadata) 27 | # lower case the upper case key 28 | assert result == expected 29 | 30 | def test_lower_a_sub_key(self): 31 | metadata = {"something": "else"} 32 | event = {"KEY1": "syslog", "tags": ["atag"], "details": {"SUBKEY": "subvalue"}} 33 | expected = { 34 | "key1": "syslog", 35 | "tags": ["atag"], 36 | "details": {"subkey": "subvalue"}, 37 | } 38 | result, metadata = self.plugin.onMessage(event, metadata) 39 | # lower case the upper case keys wherever they are 40 | assert result == expected 41 | 42 | 43 | class TestEnsureEventID(object): 44 | def setup(self): 45 | from enrichment_plugins.ensure_eventid import message 46 | 47 | self.plugin = message() 48 | 49 | def test_ensure_event_id(self): 50 | metadata = {"something": "else"} 51 | event = {"key1": "syslog", "tags": ["atag"]} 52 | result, metadata = self.plugin.onMessage(event, metadata) 53 | assert 
"eventid" in result 54 | assert "eventid" in event 55 | assert type(uuid.UUID(event["eventid"])) == uuid.UUID 56 | 57 | 58 | class TestEventShell(object): 59 | def setup(self): 60 | from normalization_plugins.event_shell import message 61 | 62 | self.plugin = message() 63 | 64 | def test_ensure_base_event_shell(self): 65 | # given a really empty message 66 | # does it get the base shell? 67 | # does it move any non base items to 'details'? 68 | metadata = {"something": "else"} 69 | event = {"key1": "syslog", "tags": ["atag"]} 70 | result, metadata = self.plugin.onMessage(event, metadata) 71 | assert "severity" in result 72 | assert "tags" in result 73 | assert "atag" in result["tags"] 74 | assert "key1" in result["details"] 75 | 76 | def test_ensure_complex_event_shell(self): 77 | # given a complex message 78 | # does it get the base shell? 79 | # does it move any non base items to 'details'? 80 | metadata = {"something": "else"} 81 | event = { 82 | "key1": "syslog", 83 | "tags": ["atag"], 84 | "complexkey": {"subkey": "subvalue"}, 85 | } 86 | result, metadata = self.plugin.onMessage(event, metadata) 87 | assert "severity" in result 88 | assert "tags" in result 89 | assert "atag" in result["tags"] 90 | assert "key1" in result["details"] 91 | assert "complexkey" in result["details"] 92 | assert "subkey" in result["details"]["complexkey"] 93 | -------------------------------------------------------------------------------- /lambdas/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/utils/__init__.py -------------------------------------------------------------------------------- /lambdas/utils/athena.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import logging 4 | logger = logging.getLogger() 5 | 6 | def default_bucket(session): 7 | ''' 8 | Return the s3 bucket name for athena results 9 | to allow us to get the csv natively 10 | ''' 11 | account_id = session.client('sts').get_caller_identity().get('Account') 12 | return '{}-{}-query-results-{}'.format(account_id, 'aws-athena', session.region_name) 13 | 14 | def run_query(athena, query, database, s3_output): 15 | ''' 16 | Function for executing Athena queries and return the query ID 17 | ''' 18 | response = athena.start_query_execution( 19 | QueryString=query, 20 | QueryExecutionContext={ 21 | 'Database': database 22 | }, 23 | ResultConfiguration={ 24 | 'OutputLocation': 's3://{}'.format(s3_output), 25 | } 26 | ) 27 | logger.debug('Execution ID: ' + response['QueryExecutionId']) 28 | return response 29 | 30 | def dataframe_from_athena_s3(session,athena_response,bucket_name): 31 | ''' 32 | Retrieve the native athena csv results as a pandas dataframe 33 | for easy conversion and analysis 34 | ''' 35 | s3=session.resource('s3') 36 | key_name=athena_response['QueryExecutionId'] 37 | s3_response = s3.Bucket(bucket_name).Object(key= key_name + '.csv').get() 38 | 39 | return pd.read_csv(io.BytesIO(s3_response['Body'].read()), encoding='utf8') -------------------------------------------------------------------------------- /lambdas/utils/dates.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pytz 3 | import tzlocal 4 | from datetime import datetime, timedelta 5 | from dateutil.parser import parse 6 | import logging 7 | 8 | logger = logging.getLogger() 
9 | 10 | 11 | def get_date_parts(): 12 | now = datetime.utcnow() 13 | last_hour_now = now - timedelta(hours=1) 14 | 15 | now_hour = str(now.hour).rjust(2, "0") 16 | now_month = str(now.month).rjust(2, "0") 17 | now_day = str(now.day).rjust(2, "0") 18 | now_year = str(now.year) 19 | last_hour_hour = str(last_hour_now.hour).rjust(2, "0") 20 | last_hour_month = str(last_hour_now.month).rjust(2, "0") 21 | last_hour_day = str(last_hour_now.day).rjust(2, "0") 22 | last_hour_year = str(last_hour_now.year) 23 | 24 | return ( 25 | now_hour, 26 | now_month, 27 | now_day, 28 | now_year, 29 | last_hour_hour, 30 | last_hour_month, 31 | last_hour_day, 32 | last_hour_year, 33 | ) 34 | 35 | 36 | def toUTC(suspectedDate): 37 | """make a UTC date out of almost anything""" 38 | utc = pytz.UTC 39 | objDate = None 40 | # pick up any environment TZ changes 41 | tzlocal.reload_localzone() 42 | 43 | LOCAL_TIMEZONE = tzlocal.get_localzone() 44 | 45 | if type(suspectedDate) == datetime: 46 | objDate = suspectedDate 47 | elif type(suspectedDate) == float: 48 | if suspectedDate <= 0: 49 | objDate = datetime(1970, 1, 1) 50 | else: 51 | # This breaks in the year 2286 52 | EPOCH_MAGNITUDE = 9 53 | magnitude = int(math.log10(int(suspectedDate))) 54 | if magnitude > EPOCH_MAGNITUDE: 55 | suspectedDate = suspectedDate / 10 ** (magnitude - EPOCH_MAGNITUDE) 56 | objDate = datetime.fromtimestamp(suspectedDate, LOCAL_TIMEZONE) 57 | elif str(suspectedDate).isdigit(): 58 | if int(str(suspectedDate)) <= 0: 59 | objDate = datetime(1970, 1, 1) 60 | else: 61 | # epoch? but seconds/milliseconds/nanoseconds (lookin at you heka) 62 | epochDivisor = int(str(1) + "0" * (len(str(suspectedDate)) % 10)) 63 | objDate = datetime.fromtimestamp( 64 | float(suspectedDate / epochDivisor), LOCAL_TIMEZONE 65 | ) 66 | elif type(suspectedDate) is str: 67 | # try to parse float or negative number from string: 68 | objDate = None 69 | try: 70 | suspected_float = float(suspectedDate) 71 | if suspected_float <= 0: 72 | objDate = datetime(1970, 1, 1) 73 | except ValueError: 74 | pass 75 | if objDate is None: 76 | objDate = parse(suspectedDate, fuzzy=True) 77 | try: 78 | if objDate.tzinfo is None: 79 | objDate = LOCAL_TIMEZONE.localize(objDate) 80 | except AttributeError as e: 81 | raise ValueError( 82 | "Date %s which was converted to %s has no " 83 | "tzinfo attribute : %s" % (suspectedDate, objDate, e) 84 | ) 85 | 86 | objDate = utc.normalize(objDate) 87 | 88 | return objDate 89 | 90 | 91 | def utcnow(): 92 | """python is silly and returns naive datetime 93 | when datetime.utcnow() is called 94 | But if you call now with a UTC timezone 95 | it returns a non naive datetime 96 | """ 97 | return datetime.now(pytz.UTC) -------------------------------------------------------------------------------- /lambdas/utils/dict_helpers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from copy import deepcopy 3 | 4 | 5 | def merge(dict1, dict2): 6 | """ Return a new dictionary by merging two dictionaries recursively. 
""" 7 | 8 | result = deepcopy(dict1) 9 | 10 | for key, value in dict2.items(): 11 | if isinstance(value, collections.abc.Mapping): 12 | result[key] = merge(result.get(key, {}), value) 13 | else: 14 | result[key] = deepcopy(dict2[key]) 15 | 16 | return result 17 | 18 | 19 | def find_keys(node, kv): 20 | """Returns all the keys matching kv in a given node/dict""" 21 | 22 | if isinstance(node, list): 23 | for i in node: 24 | for x in find_keys(i, kv): 25 | yield x 26 | elif isinstance(node, dict): 27 | if kv in node: 28 | yield node[kv] 29 | for j in node.values(): 30 | for x in find_keys(j, kv): 31 | yield x 32 | 33 | 34 | def enum_values(node): 35 | """Returns all the values in a given dict/node""" 36 | 37 | if isinstance(node, list): 38 | for i in node: 39 | for x in enum_values(i): 40 | yield x 41 | elif isinstance(node, dict): 42 | for j in node.values(): 43 | for x in enum_values(j): 44 | yield x 45 | else: 46 | yield node 47 | 48 | 49 | def enum_keys(node): 50 | """Returns all the keys in a given dict/node""" 51 | 52 | if isinstance(node, list): 53 | for i in node: 54 | for x in enum_keys(i): 55 | yield x 56 | elif isinstance(node, dict): 57 | for j in node.keys(): 58 | yield j 59 | for x in enum_keys(node[j]): 60 | yield x 61 | 62 | 63 | def sub_dict(somedict, somekeys, default=None): 64 | """Return just the given keys from a dict""" 65 | 66 | return dict([(k, somedict.get(k, default)) for k in somekeys]) 67 | 68 | 69 | def dict_match(query_dict, target_dict): 70 | """Determine if the target_dict contains the keys/values in the query_dict""" 71 | 72 | query_keys = list(enum_keys(query_dict)) 73 | if sub_dict(target_dict, query_keys) == query_dict: 74 | return True 75 | else: 76 | return False 77 | 78 | 79 | def dictpath(path): 80 | """split a string representing a 81 | nested dictionary path key.subkey.subkey 82 | """ 83 | for i in path.split("."): 84 | yield "{0}".format(i) 85 | 86 | 87 | def getValueByPath(input_dict, path_string): 88 | """ 89 | Gets data/value from a dictionary using a dotted accessor-string 90 | http://stackoverflow.com/a/7534478 91 | path_string can be key.subkey.subkey.subkey 92 | """ 93 | return_data = input_dict 94 | for chunk in path_string.split("."): 95 | return_data = return_data.get(chunk, {}) 96 | return return_data -------------------------------------------------------------------------------- /lambdas/utils/dotdict.py: -------------------------------------------------------------------------------- 1 | class DotDict(dict): 2 | '''dict.item notation for dict()'s''' 3 | __getattr__ = dict.__getitem__ 4 | __setattr__ = dict.__setitem__ 5 | __delattr__ = dict.__delitem__ 6 | 7 | def __init__(self, dct={}): 8 | for key, value in dct.items(): 9 | if hasattr(value, 'keys'): 10 | value = DotDict(value) 11 | self[key] = value 12 | 13 | def get(self, key, default=None): 14 | """get to allow for dot string notation 15 | :param str key: Key in dot-notation (e.g. 'foo.lol'). 16 | :return: value. None if no value was found. 17 | """ 18 | try: 19 | return self.__lookup(self, key) 20 | except KeyError: 21 | return default 22 | 23 | def __lookup(self, dct, key): 24 | """Checks dct recursive to find the value for key. 25 | Is used by get() internally. 26 | :param dict dct: input dictionary 27 | :param str key: The key we are looking for. 28 | :return: The value. 29 | :raise KeyError: If the given key is not found 30 | """ 31 | if '.' 
in key: 32 | key, node = key.split('.', 1) 33 | return self.__lookup(dct[key], node) 34 | else: 35 | return dct[key] -------------------------------------------------------------------------------- /lambdas/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import re 3 | import collections 4 | import logging 5 | import netaddr 6 | from utils.dotdict import DotDict 7 | 8 | logger = logging.getLogger() 9 | 10 | CLOUDTRAIL_FILE_NAME_REGEX = re.compile( 11 | r"\d+_cloudtrail_.+.json.gz$", re.I 12 | ) 13 | 14 | def emit_json_block(stream): 15 | ''' take a stream of io.StringIO(blob) 16 | iterate it and emit json blocks as they are found 17 | ''' 18 | open_brackets = 0 19 | block = '' 20 | while True: 21 | c = stream.read(1) 22 | if not c: 23 | break 24 | 25 | if c == '{': 26 | open_brackets += 1 27 | elif c == '}': 28 | open_brackets -= 1 29 | block += c 30 | 31 | if open_brackets == 0: 32 | yield block.strip() 33 | block = '' 34 | 35 | def short_uuid(): 36 | return str(uuid.uuid4())[0:8] 37 | 38 | def is_cloudtrail(filename): 39 | match = CLOUDTRAIL_FILE_NAME_REGEX.search(filename) 40 | return bool(match) 41 | 42 | def is_ip(ip): 43 | ''' 44 | validate an ipv4/ipv6 or cidr mask 45 | valid_ipv4/6 won't recognize a cidr mask 46 | ''' 47 | try: 48 | # by default netaddr will validate single digits like '0' as 0.0.0.0/32 49 | # lets be a bit more precise and support cidr masks 50 | # by checking for format chars (. or :) 51 | # and using the IPNetwork constructor 52 | if ('.' in ip) or (':' in ip): 53 | netaddr.IPNetwork(ip) 54 | return True 55 | else: 56 | return False 57 | except Exception: 58 | return False 59 | 60 | def isIPv4(ip): 61 | try: 62 | return netaddr.valid_ipv4(ip,flags=1) 63 | except: 64 | return False 65 | 66 | def isIPv6(ip): 67 | try: 68 | return netaddr.valid_ipv6(ip,flags=1) 69 | except: 70 | return False 71 | 72 | def generate_metadata(context): 73 | metadata = { 74 | "lambda_details": { 75 | "function_version": context.function_version, 76 | "function_arn": context.invoked_function_arn, 77 | "function_name": context.function_name.lower(), 78 | "memory_size": context.memory_limit_in_mb, 79 | }, 80 | } 81 | 82 | return DotDict(metadata) 83 | 84 | def chunks(l, n): 85 | """Yield successive n-sized chunks from l.""" 86 | for i in range(0, len(l), n): 87 | yield l[i:i + n] 88 | 89 | def first_matching_index_value(iterable, condition = lambda x: True): 90 | """ 91 | Returns the first index,value tuple in the list that 92 | satisfies the `condition`. 93 | 94 | If the condition is not given, returns the first of the iterable. 
95 | condition is passed as: 96 | condition = lambda i: 97 | >>> first_matching_item( (1,2,3), condition=lambda x: x % 2 == 0) 98 | (1, 2) 99 | """ 100 | try: 101 | return next((index,value) for index,value in enumerate(iterable) if condition(value)) 102 | 103 | except StopIteration: 104 | return (None,None) -------------------------------------------------------------------------------- /lambdas/utils/plugins.py: -------------------------------------------------------------------------------- 1 | import pynsive 2 | import os 3 | from operator import itemgetter 4 | import json 5 | import logging 6 | from utils.dict_helpers import enum_keys 7 | 8 | logger = logging.getLogger() 9 | 10 | 11 | def event_criteria_values(an_event): 12 | """set up the list of event values to use when comparing plugins 13 | to this event to see if they should fire 14 | target values are the .keys() of the dict and the values of the 'category' and 'tags' fields 15 | where category is a key/value and tags is a list of values. 16 | """ 17 | criteria_values = [e for e in enum_keys(an_event)] 18 | if ( 19 | "tags" in criteria_values 20 | and isinstance(an_event.get("tags"), list) 21 | and len(an_event.get("tags", "")) > 0 22 | ): 23 | for tag in an_event["tags"]: 24 | criteria_values.append(tag) 25 | if "category" in criteria_values and isinstance(an_event.get("category"), str): 26 | criteria_values.append(an_event["category"]) 27 | 28 | return criteria_values 29 | 30 | 31 | def register_plugins(directory_name): 32 | """ 33 | take a directory name, scan it for python modules 34 | and register them (module,registration criteria, priority) 35 | """ 36 | pluginList = list() # tuple of module,registration dict,priority 37 | if os.path.exists(directory_name): 38 | modules = pynsive.list_modules(directory_name) 39 | for mname in modules: 40 | module = pynsive.import_module(mname) 41 | if not module: 42 | raise ImportError("Unable to load module {}".format(mname)) 43 | else: 44 | if "message" in dir(module): 45 | mclass = module.message() 46 | mreg = mclass.registration 47 | if "priority" in dir(mclass): 48 | mpriority = mclass.priority 49 | else: 50 | mpriority = 100 51 | if isinstance(mreg, list): 52 | logger.info( 53 | "[*] plugin {0} registered to receive messages with {1}".format( 54 | mname, mreg 55 | ) 56 | ) 57 | pluginList.append((mclass, mreg, mpriority)) 58 | return pluginList 59 | 60 | 61 | def send_event_to_plugins(anevent, metadata, pluginList): 62 | """compare the event to the plugin registrations. 
63 | plugins register with a list of keys or values 64 | or values they want to match on 65 | this function compares that registration list 66 | to the current event and sends the event to plugins 67 | in order 68 | """ 69 | if not isinstance(anevent, dict): 70 | raise TypeError("event is type {0}, should be a dict".format(type(anevent))) 71 | 72 | # expecting tuple of module, criteria, priority in pluginList 73 | # sort the plugin list by priority 74 | executed_plugins = [] 75 | for plugin in sorted(pluginList, key=itemgetter(2), reverse=False): 76 | # assume we don't run this event through the plugin 77 | send = False 78 | if isinstance(plugin[1], list): 79 | try: 80 | if "*" in plugin[1]: 81 | # plugin wants to see all events, early exit the check 82 | send = True 83 | else: 84 | # intersect the plugin field names 85 | # with the fields in the event 86 | # if they match, the plugin wants to see this event 87 | plugin_matching_keys = set([item.lower() for item in plugin[1]]) 88 | event_tokens = [e for e in event_criteria_values(anevent)] 89 | if plugin_matching_keys.intersection(event_tokens): 90 | send = True 91 | except TypeError: 92 | logger.error( 93 | "TypeError on set intersection for dict {0}".format(anevent) 94 | ) 95 | return (anevent, metadata) 96 | if send: 97 | (anevent, metadata) = plugin[0].onMessage(anevent, metadata) 98 | if anevent is None: 99 | # plug-in is signalling to drop this message 100 | # early exit 101 | return (anevent, metadata) 102 | plugin_name = plugin[0].__module__.replace("plugins.", "") 103 | executed_plugins.append(plugin_name) 104 | # Tag all events with what plugins ran on it 105 | if "plugins" in anevent: 106 | anevent["plugins"] = anevent["plugins"] + executed_plugins 107 | else: 108 | anevent["plugins"] = executed_plugins 109 | 110 | return (anevent, metadata) 111 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">=0.12.25" 3 | required_providers { 4 | aws = ">= 2.25.0" 5 | } 6 | } 7 | 8 | provider "aws" { 9 | region = "us-west-2" 10 | profile = "default" 11 | } 12 | data "aws_caller_identity" "current" {} 13 | data "aws_region" "current" {} 14 | 15 | output "account_id" { 16 | value = "${data.aws_caller_identity.current.account_id}" 17 | } 18 | 19 | output "datalake_arn" { 20 | value = aws_athena_database.defenda_datalake.id 21 | } 22 | 23 | resource "aws_s3_bucket" "data_lake_input_bucket" { 24 | bucket = "${data.aws_caller_identity.current.account_id}-defenda-data-lake-input-bucket" 25 | acl = "private" 26 | 27 | versioning { 28 | enabled = false 29 | } 30 | 31 | lifecycle_rule { 32 | enabled = true 33 | 34 | transition { 35 | days = 30 36 | storage_class = "STANDARD_IA" 37 | } 38 | 39 | expiration { 40 | days = 90 41 | } 42 | } 43 | } 44 | 45 | resource "aws_s3_bucket_public_access_block" "data_lake_input_bucket" { 46 | bucket = aws_s3_bucket.data_lake_input_bucket.id 47 | 48 | block_public_acls = true 49 | block_public_policy = true 50 | ignore_public_acls = true 51 | restrict_public_buckets = true 52 | } 53 | 54 | resource "aws_s3_bucket" "data_lake_output_bucket" { 55 | bucket = "${data.aws_caller_identity.current.account_id}-defenda-data-lake-output-bucket" 56 | acl = "private" 57 | 58 | versioning { 59 | enabled = false 60 | } 61 | 62 | lifecycle_rule { 63 | enabled = true 64 | 65 | transition { 66 | days = 90 67 | storage_class = "STANDARD_IA" 68 | } 69 | 70 | expiration { 71 
| days = 360 72 | } 73 | } 74 | } 75 | 76 | resource "aws_s3_bucket_public_access_block" "data_lake_output_bucket" { 77 | bucket = aws_s3_bucket.data_lake_output_bucket.id 78 | 79 | block_public_acls = true 80 | block_public_policy = true 81 | ignore_public_acls = true 82 | restrict_public_buckets = true 83 | } 84 | 85 | resource "aws_iam_role" "data-lake-firehose-role" { 86 | name = "defenda-data-lake-firehose-role" 87 | 88 | assume_role_policy = <