├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── Pipfile ├── README.md ├── generate_lambda_zip.py ├── lambdas ├── .dockerignore ├── Dockerfile ├── enrichment_plugins │ ├── __init__.py │ └── ensure_eventid.py ├── generate_partitions.py ├── normalization_plugins │ ├── __init__.py │ ├── event_shell.py │ ├── gsuite_login.py │ ├── ip_addresses.py │ ├── lowercase_keys.py │ └── timestamps.py ├── processor.py ├── requirements.txt ├── s3_to_firehose.py ├── tests │ ├── __init__.py │ ├── logging_config.yml │ ├── samples │ │ ├── sample_cloudfront_wordpress_probe.json │ │ ├── sample_cloudtrail_create_log_stream.json │ │ ├── sample_gsuite_login_event.json │ │ ├── sample_syslog_sudo.json │ │ └── sample_vpc_flow_log.json │ ├── test_core.py │ ├── test_plugin_gsuite_logins.py │ ├── test_plugin_ip_addresses.py │ ├── test_plugin_timestamps.py │ └── test_plugins.py └── utils │ ├── __init__.py │ ├── athena.py │ ├── dates.py │ ├── dict_helpers.py │ ├── dotdict.py │ ├── helpers.py │ └── plugins.py ├── main.tf ├── pytest.ini ├── terraform.tfvars └── variables.tf /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | liberapay: defendA 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode 132 | #piplock 133 | Pipfile.lock 134 | lambda.zip 135 | .DS_Store 136 | terraform.tfstate 137 | terraform.tfstate.backup 138 | .terraform/plugins/darwin_amd64/lock.json 139 | .terraform/plugins/darwin_amd64/terraform-provider-aws_v3.0.0_x5 140 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # safelist 2 | branches: 3 | only: 4 | - main 5 | 6 | os: 7 | - linux 8 | 9 | language: python 10 | python: 11 | - "3.8" 12 | 13 | env: 14 | - 15 | 16 | install: 17 | - pip install pipenv 18 | - pipenv install -d 19 | 20 | # command to run tests 21 | script: 22 | - pytest 23 | 24 | notifications: 25 | email: 26 | on_success: never 27 | on_failure: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. 
"Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 
128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. 
Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | pylint = "*" 8 | black = "*" 9 | pytest = "*" 10 | 11 | [packages] 12 | docker = "*" 13 | boto3 = "*" 14 | pyyaml = "*" 15 | pandas = "*" 16 | tzlocal = "*" 17 | netaddr = "*" 18 | pynsive = "*" 19 | 5400915 = {file = "https://github.com/noahmorrison/chevron/archive/master.zip"} 20 | pyathena = "*" 21 | 22 | [requires] 23 | python_version = "3.8" 24 | 25 | [pipenv] 26 | allow_prereleases = true 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # defendA Data Lake 2 | An AWS-native, serverless log management system to allow you to ingest unstructured JSON, normalize & enrich it and store it in Athena for queries and analysis. 3 | 4 | **Build Status:** 5 | - Master [![Build Status](https://travis-ci.com/0xdefenda/defenda-data-lake.svg?branch=master) ](https://travis-ci.com/0xdefenda/defenda-data-lake) 6 | 7 | ## Video intro 8 | Here's a brief video intro to the data lake. 9 | 10 | [![video intro](http://img.youtube.com/vi/eYQ0gjTMVhc/0.jpg)](http://www.youtube.com/watch?v=eYQ0gjTMVhc "defendA security data lake") 11 | 12 | ## Why? 13 | Centralized log/event management is a core element of an infosec program, yet most solutions are not cloud native, require unnecessary servers/clusters and force you to massage your events into a strict format. 14 | 15 | The reality is that infosec teams aren't able to dictate what format events come in which is usually arbitrary, nested JSON. 16 | 17 | This solution uses only serverless constructs to allow you to store unstructured JSON from any source in a predictable data structure that can be accessed using Athena's native SQL. 18 | 19 | ## Deployment: 20 | 21 | Deployment is via python/pipenv, terraform and a mini-docker environment to compile the lambdas. 22 | 23 | It uses us-west-2 as the default region, set a terraform.tfvars variable ( aws_region = "some-other-region ) if you'd like it elsewhere. 24 | 25 | 26 | First get the code and initiate pipenv (or [install it if you aren't converted yet](https://pipenv.pypa.io/en/latest/install/)): 27 | 28 | ```bash 29 | git clone . 30 | pipenv --python 3.8 31 | ``` 32 | 33 | Now build the lambdas: 34 | 35 | ```bash 36 | ./generate_lambda_zip.py 37 | ``` 38 | 39 | Init and run terraform 40 | ```bash 41 | terraform init 42 | terraform plan 43 | terraform apply 44 | ``` 45 | and you will end up with: 46 | 47 | - An Athena database: `defenda_datalake` 48 | - A table: `events` 49 | - An s3 bucket to serve as the data store for the athena data lake: `data-lake--output-bucket` 50 | - An s3 bucket to act as an input if you have things that can't talk directly to firehose: `data-lake--input-bucket` 51 | - A firehose delivery stream: `data_lake_s3_stream` 52 | - A lambda to operate on records bound for athena: `data_lake_lambda` 53 | - A lambda to generate partitions: `data_lake_generate_partitions` 54 | - All the iam permissions and glue to sync these together 55 | 56 | ## Event structure 57 | Athena does require *some* structure to allow for querying. 
To enable that and still allow unstructured JSON we use the following `event shell` 58 | 59 | - eventid (string) 60 | - a unique guid 61 | - utctimestamp (string) 62 | - timestamp for the event in UTC, ISO format 63 | - severity (string) 64 | - DEBUG, INFO, WARNING, etc 65 | - summary (string) 66 | - a human readable text description of the event 67 | - category (string) 68 | - what sort of event: authentication, etc. 69 | - source (string) 70 | - where the event came from (gsuite, sophos, cloudtrail, okta, etc) 71 | - tags (array) 72 | - a series of tags you'd like to add 73 | - plugins (array) 74 | - a record of what plugins operated on this event 75 | - details (string) 76 | - this `string` is the native JSON of the event. Stored as a string to allow for json_extract_scalar operations to query the JSON structure. 77 | - year (string) (Partitioned) 78 | - partition for athena 79 | - month (string) (Partitioned) 80 | - partition for athena 81 | - day (string) (Partitioned) 82 | - partition for athena 83 | - hour (string) (Partitioned) 84 | - partition for athena 85 | 86 | ### Sample query 87 | So what does it look like to use this data lake? Here's a sample query that would return all AWS console logins in a certain partition/timeframe: 88 | 89 | ```sql 90 | SELECT utctimestamp, 91 | summary, 92 | source, 93 | details 94 | 95 | FROM "defenda_data_lake"."events" 96 | where 97 | source='cloudtrail' AND json_extract_scalar(details,'$.eventname') = 'ConsoleLogin' 98 | AND ( 99 | (year='2020' 100 | AND month='06' 101 | AND day='19' 102 | AND hour='01') 103 | OR 104 | (year='2020' 105 | AND month='06' 106 | AND day='19' 107 | AND hour='00') 108 | ) 109 | limit 100 110 | ``` 111 | 112 | You can use the [json_extract_scalar](https://prestodb.io/docs/current/functions/json.html) function and [json path expressions](https://goessner.net/articles/JsonPath/index.html#e2) to get at any layer of the nested JSON stored in the 'details' field as part of your query. 113 | 114 | The date portion of the where clause allows us to hone in on a particular time period and allows us to limit the cost of the query by limiting the amount of data scanned by Athena. 115 | 116 | Queries can be any valid [Presto SQL](https://prestodb.io/docs/current/sql/select.html) including [functions](https://prestodb.io/docs/current/functions.html) 117 | 118 | 119 | Here's another, slightly more complex query taking advantage of the work the ip_addresses.py plugin does to gather all the ips it's seen into a list. We can use that to query for any events involving a suspect ip like so: 120 | 121 | ```sql 122 | SELECT 123 | utctimestamp, 124 | summary, 125 | source, 126 | details, 127 | tags 128 | FROM defenda_data_lake.events 129 | where 130 | source ='cloudtrail' 131 | AND json_array_contains(json_extract(details,'$._ipaddresses'),'7.1.14.12') 132 | AND year='2020' 133 | AND month='09' 134 | AND day='07' 135 | AND hour='18' 136 | LIMIT 100; 137 | ``` 138 | 139 | The plugin searches events for likely IP fields, verifies them, normalizes source/destination IPs and then appends them to a metadata list details._ipaddresses. We can query that json natively by extracting it from the details athena field and use the Presto function json_array_contains to narrow our query to the IP address in question. 140 | 141 | ### Python querying 142 | Thanks to the [pyathena library](https://pypi.org/project/PyAthena/) and [pandas](https://pandas.pydata.org/), querying and exploring data is easy! 
143 | 144 | Here's the same sample query looking for IP address events, but performed from a python environment. 145 | 146 | ```python 147 | from pyathena import connect 148 | from pyathena.util import as_pandas 149 | from pyathena.pandas_cursor import PandasCursor 150 | import pandas as pd 151 | 152 | cursor = connect(work_group='defenda_data_lake',region_name='us-west-2',cursor_class=PandasCursor).cursor() 153 | 154 | cursor.execute(""" 155 | SELECT 156 | utctimestamp, 157 | summary, 158 | source, 159 | details, 160 | tags 161 | FROM defenda_data_lake.events 162 | where 163 | source ='cloudtrail' 164 | AND json_array_contains(json_extract(details,'$._ipaddresses'),'7.1.14.12') 165 | AND year='2020' 166 | AND month='09' 167 | AND day='07' 168 | AND hour='18' 169 | LIMIT 100; 170 | """) 171 | df = as_pandas(cursor) 172 | df.head() 173 | 174 | ``` 175 | 176 | You simply create a cursor to handle your results, send it a query and your result is a pandas data frame. 177 | 178 | If you'd like your query results restored to a list of python dictionaries you can convert the JSON in the details field like so: 179 | 180 | ```python 181 | query_results=[] 182 | for message in df.to_dict('records'): 183 | message['details']=json.loads(message['details']) 184 | query_results.append(message) 185 | ``` 186 | 187 | ### Advantages 188 | 189 | #### Serverless! 190 | No servers to manage and this scales up as your event ingestion scales. You can store as much data as s3/athena can handle and due to the JSON handling, changes in data structures won't blow up your infrastructure. 191 | 192 | #### Security 193 | Operating via serverless, there is nothing to maintain, patch, etc. Python libraries will of course update over time. 194 | 195 | There is nothing exposed to the outside world, no extra costs for authentication, no extra licensing for secure transport, etc. 196 | 197 | #### Customizable 198 | A simple plugin system allows you to write your own custom event handlers to either normalize your data or enhance it as you see fit. Plugins are in python, usually a dozen lines of code and an be fine tuned to operate only on the events of interest. 199 | 200 | #### Integration 201 | For input that can't be hooked up to firehose, you can deposit raw JSON in the s3 input bucket and it will be send automatically through to firehose/athena. You can use this to hook up legacy event producers that may not be able to speak native firehose but can write files to s3. 202 | 203 | #### Cost 204 | This costs nothing to deploy. Costs will vary depending on your data ingestion, but can get started today without having to guesstimate event per second, data size, throughput, or other statistics you usually have to commit to in other log management platforms. 205 | 206 | Preliminary tests sending 500MB of data to the data lake resulted in the following costs: 207 | 208 | Test using s3 as the input (copying json files to s3): 209 | - s3: $0.51 210 | - firehose: $0.02 211 | - athena: $0.00 212 | 213 | Test using firehose only as the input (no files, direct to firehose): 214 | - s3: $0.02 215 | - firehose: $0.02 216 | - athena: $0.00 217 | 218 | 219 | ### Disadvantages 220 | 221 | #### Latency 222 | Depending on your rate of event ingestion, firehose will queue events for 60 seconds before flushing to s3. If you have enough flow, this usually isn't a problem but if your event flow is very low you may see a slight delay. 
223 | 224 | #### Query Cost potential 225 | 226 | Athena's pricing is based on $/query/data that as of this writing is $5 per terabyte. Each query is charged based on the amount of underlying data that was scanned to resolve the query and prorated accordingly. So if your query operated on a megabyte of data in a partition, your charge would be only for that megabyte. 227 | 228 | However it is a `per query` charge. So if you aren't careful with your queries and don't make use of partitions you can run up a bill. 229 | 230 | To help, data is automatically partitioned in hour chunks (year/month/day/hour structure in the s3 bucket). By simply adding some criteria to your where clause you can limit the amount of data you interact with and are charged for. Data is also automatically gzipped to also reduce the charges. 231 | 232 | 233 | ## Companion Projects 234 | 235 | Anything that sends json to firehost can be used as an input into the data lake. Here are some sample companion projects that do just that to send security events from some common data sources: 236 | 237 | - [gsuite log ingestion](https://github.com/jeffbryner/gsuite-activity-lambda) 238 | - [sophos log ingestion](https://github.com/jeffbryner/sophos-activity-lambda) 239 | - [meraki log ingestion](https://github.com/jeffbryner/meraki-activity-lambda) 240 | - [beats log ingestion](https://github.com/jeffbryner/firehose-es-input#browserbeat-example) 241 | 242 | ## Plugins 243 | Inspired by [MozDef's plugin system](https://github.com/mozilla/MozDef/tree/master/mq/plugins) via [pynsive](https://github.com/zinic/pynsive/), the plugins in the data lake use a similar concept of operations, but are ordered a bit differently. 244 | 245 | ### Plugin types 246 | Plugins can either normalize or enrich an event. Events are first run through normalization plugins, then through enrichment plugins. This makes it easier to target your plugin to the task at hand, and makes it easier to perform whatever operation you are envisoning. 247 | 248 | Plugins are python, and register themselves to receive events containing a field, a category or a tag. Plugins can signal they'd like to see all events by registering for '*'. 249 | 250 | If an event matches the registration, the event and it's metadata are sent to the plugin where the plugin can rearrange/rename fields (normalization), add information to the event (enrichment) or perform any operation you might envision with the event. 251 | 252 | A plugin can signal to drop the event by returning None for the message. The pipeline will not store the event, which can help weed out noise. 253 | 254 | ### Sample plugin 255 | Lets look at the sample Gsuite login plugin configured to operate on events from the [gsuite log ingestion](https://github.com/jeffbryner/gsuite-activity-lambda) project that polls Google for gsuite security events and sends them to firehose. 256 | 257 | ```python 258 | class message(object): 259 | 260 | def __init__(self): 261 | ''' 262 | handle gsuite login activity record 263 | ''' 264 | 265 | self.registration = ['kind'] 266 | self.priority = 20 267 | ``` 268 | 269 | The plugin registers to receive any even that has a field named 'kind'. The registration property is a list and can contain a list of fields that, if present, the plugin would like to receive. You could have a registration of ```['ipaddress','ip_address','srcip']``` for example to receive any event that contains any or all of those fields. 
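For illustration, a minimal plugin skeleton might look like the following. The field names, priority value and drop condition here are hypothetical examples, not one of the shipped plugins:

```python
class message(object):
    def __init__(self):
        """
        receive any event containing any of these (hypothetical) IP fields
        """
        self.registration = ["ipaddress", "ip_address", "srcip"]
        self.priority = 15

    def onMessage(self, message, metadata):
        # signal a drop by returning None for the message, as described above;
        # the pipeline will not store the event (hypothetical noise filter)
        if message.get("severity") == "DEBUG":
            return (None, metadata)
        # otherwise hand the (possibly modified) event back to the pipeline
        return (message, metadata)
```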
270 | 271 | Next, the plugin puts itself as priority 20, meaning any plugin with a lower number will receive the event first. This allows you to order your plugins in case that is important in the plugin pipeline logic. Plugins will be called in order of priority, 0 going first, higher numbers going later. 272 | 273 | Next the plugin contains the logic to use when encountering a matching event: 274 | 275 | ```python 276 | def onMessage(self, message, metadata): 277 | # for convenience, make a dot dict version of the message 278 | dot_message=DotDict(message) 279 | 280 | # double check that this is our target message 281 | if 'admin#reports#activity' not in dot_message.get('details.kind','')\ 282 | or 'id' not in message.get('details','') \ 283 | or 'etag' not in message.get('details',''): 284 | return(message, metadata) 285 | # 286 | ``` 287 | 288 | Your plugins can make use of the utils functions like DotDict, etc to operate on an event. It's best practice to first ensure this event fully matches what you expect and this plugin is double checking for certain fields in the structure and returning the message unchanged if there isn't a match. 289 | 290 | Normalization plugins usually cherry pick fields from the original event and surface them to standardized fields to make querying/correlating easier. For example this plugin sets some tags and brings out the IP address and timestamp: 291 | 292 | ```python 293 | message["source"]="gsuite" 294 | message["tags"].append("gsuite") 295 | 296 | # clean up ipaddress field 297 | if 'ipaddress' in message['details']: 298 | message['details']['sourceipaddress']=message['details']['ipaddress'] 299 | del message['details']['ipaddress'] 300 | 301 | # set the actual time 302 | if dot_message.get("details.id.time",None): 303 | message['utctimestamp']=toUTC(message['details']['id']['time']).isoformat() 304 | 305 | ``` 306 | 307 | it goes on to do the same for other common fields and most importantly sets a human readable summary: 308 | 309 | ```python 310 | # set summary 311 | message["summary"]=chevron.render("{{details.user}} {{details.events.0.name}} from IP {{details.sourceipaddress}}",message) 312 | ``` 313 | The [chevron library](https://github.com/noahmorrison/chevron) allows us to use mustache templates to access fields and fields within lists to pull out information from the event as needed. ```details.events.0.name``` in this case is looking for the first item in the details.events list and if that exists, it uses the ```name``` field in the text. Chevron is forgiving, you can reference fields that may not exist, or only exist in some cases. 314 | 315 | The utility libraries are purposefully crafted to allow you to get at the most stubborn data. In a gsuite event for example, the majority of the information is tucked away in key/value fields. Take this marker for suspicious logins as an example: 316 | 317 | ```json 318 | "events": [ 319 | { 320 | "type": "login", 321 | "name": "login_success", 322 | "parameters": [ 323 | { 324 | "name": "login_type", 325 | "value": "exchange" 326 | }, 327 | { 328 | "name": "login_challenge_method", 329 | "multiValue": [ 330 | "none" 331 | ] 332 | }, 333 | { 334 | "name": "is_suspicious", 335 | "boolValue": false 336 | } 337 | ] 338 | } 339 | ] 340 | ``` 341 | 342 | You can see there are several 'name' fields with a parameters list that make it difficult to programatically query. 343 | 344 | This plugin solves this via the use of the dict_match function like so: 345 | 346 | ```python 347 | #suspicious? 
348 | suspicious={"boolvalue":True,"name":"is_suspicious"} 349 | for e in dot_message.get("details.events",[]): 350 | for p in e.get("parameters",[]): 351 | if dict_match(suspicious,p): 352 | message["details"]["suspicious"]=True 353 | ``` 354 | 355 | The dict_match function takes a dictionary of keys and values and compares it to something. If the keys and values match, it returns true which in this case allows to mark an event as suspicious if the name='is_suspicious' and a field called 'boolvalue' is True. 356 | 357 | Lastly the plugin returns the event and metadata back to the pipeline to be sent on to another plugin, or to the final data lake: 358 | 359 | ```python 360 | return (message, metadata) 361 | ``` 362 | 363 | It's best to include tests for plugins, and the [test for the gsuite login plugin can be found here](https://github.com/0xdefendA/defenda-data-lake/blob/main/lambdas/tests/test_plugin_gsuite_logins.py) as an example. -------------------------------------------------------------------------------- /generate_lambda_zip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import docker 3 | from os import path 4 | import subprocess 5 | 6 | 7 | def refresh_requirements(): 8 | subprocess.Popen( 9 | "pipenv run pip freeze > lambdas/requirements.txt", 10 | shell=True, 11 | stdout=subprocess.PIPE, 12 | ).stdout.read() 13 | 14 | 15 | def build_lambda_image(): 16 | docker_client = docker.from_env() 17 | docker_client.images.build(path="lambdas/", tag="datalake-lambdas", quiet=False) 18 | 19 | 20 | def get_lambda_zip(): 21 | docker_client = docker.from_env() 22 | docker_client.containers.run( 23 | "datalake-lambdas", 24 | "cp /asset-output/lambda.zip /mnt/cdk-data-lake/lambdas", 25 | volumes={ 26 | path.abspath("."): { 27 | "bind": "/mnt/cdk-data-lake", 28 | "mode": "rw", 29 | } 30 | }, 31 | remove=True, 32 | ) 33 | 34 | 35 | if __name__ == "__main__": 36 | print("refreshing requirements.txt using pipenv") 37 | refresh_requirements() 38 | print("Building image with requirements.txt") 39 | build_lambda_image() 40 | print("Retrieving zip file for lambda") 41 | get_lambda_zip() 42 | -------------------------------------------------------------------------------- /lambdas/.dockerignore: -------------------------------------------------------------------------------- 1 | cdk* 2 | .vscode* 3 | .git 4 | *log 5 | __pycache__** -------------------------------------------------------------------------------- /lambdas/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM lambci/lambda:build-python3.8 2 | 3 | ENV AWS_DEFAULT_REGION us-west-2 4 | RUN yum install -y rsync 5 | RUN mkdir /asset-input 6 | WORKDIR /asset-input 7 | ADD . . 8 | 9 | #RUN pip3 install -r requirements.txt 10 | RUN pip3 install -r requirements.txt -t /asset-output 11 | RUN rsync -r . /asset-output 12 | WORKDIR /asset-output 13 | RUN zip -9yr lambda.zip . 
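# Note: this image is built and run by generate_lambda_zip.py, which mounts the
# repo root at /mnt/cdk-data-lake and copies /asset-output/lambda.zip back into
# lambdas/ so the terraform deploy can pick it up.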
-------------------------------------------------------------------------------- /lambdas/enrichment_plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/enrichment_plugins/__init__.py -------------------------------------------------------------------------------- /lambdas/enrichment_plugins/ensure_eventid.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | class message(object): 4 | 5 | def __init__(self): 6 | ''' 7 | takes an incoming message 8 | and adds an event id to the message if missing 9 | ''' 10 | 11 | self.registration = ['*'] 12 | self.priority = 10 13 | 14 | def onMessage(self, message, metadata): 15 | 16 | if 'eventid' not in message: 17 | message['eventid']=str(uuid.uuid4()) 18 | 19 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/generate_partitions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import boto3 3 | import time 4 | import logging, logging.config 5 | from utils.dotdict import DotDict 6 | from utils.dates import get_date_parts 7 | import pyathena 8 | from pyathena import connect 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | def get_athena_query(config): 15 | ( 16 | hour, 17 | month, 18 | day, 19 | year, 20 | last_hour_hour, 21 | last_hour_month, 22 | last_hour_day, 23 | last_hour_year, 24 | ) = get_date_parts() 25 | query = f""" 26 | ALTER TABLE {config.athena_database}.{config.athena_table} 27 | ADD IF NOT EXISTS PARTITION 28 | (year='{year}', 29 | month='{month}', 30 | day='{day}', 31 | hour='{hour}' 32 | ) 33 | location 's3://{config.account}-defenda-data-lake-output-bucket/{year}/{month}/{day}/{hour}' 34 | """ 35 | return query 36 | 37 | 38 | def lambda_handler(event, context): 39 | config = DotDict({}) 40 | config.account = boto3.client("sts").get_caller_identity().get("Account") 41 | config.athena_workgroup = os.environ.get("ATHENA_WORKGROUP", "defenda_data_lake") 42 | config.athena_database = os.environ.get("ATHENA_DATABASE", "defenda_data_lake") 43 | config.athena_table = os.environ.get("ATHENA_TABLE", "events") 44 | 45 | # query status/wait for response 46 | 47 | athena_query = get_athena_query(config) 48 | logger.debug(athena_query) 49 | cursor = connect(work_group=config.athena_workgroup).cursor() 50 | cursor.execute(athena_query) 51 | logger.debug("Query finished: {}".format(cursor.state)) 52 | return 53 | -------------------------------------------------------------------------------- /lambdas/normalization_plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/normalization_plugins/__init__.py -------------------------------------------------------------------------------- /lambdas/normalization_plugins/event_shell.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, merge 2 | from utils.dates import utcnow 3 | 4 | 5 | class message(object): 6 | def __init__(self): 7 | """ 8 | takes an incoming message 9 | and ensures it matches our event shell structure 10 | """ 11 | 12 | self.registration = ["*"] 13 | self.priority = 2 14 | 15 | def 
onMessage(self, message, metadata): 16 | # our target shell 17 | event_shell = { 18 | "utctimestamp": utcnow().isoformat(), 19 | "severity": "INFO", 20 | "summary": "UNKNOWN", 21 | "category": "UNKNOWN", 22 | "source": "UNKNOWN", 23 | "tags": [], 24 | "plugins": [], 25 | "details": {}, 26 | } 27 | # maybe the shell elements are already there? 28 | event_set = set(enum_keys(event_shell)) 29 | message_set = set(enum_keys(message)) 30 | if not event_set.issubset(message_set): 31 | # we have work to do 32 | # merge the dicts letting any message values win 33 | # if the message lacks any keys, our shell values win 34 | message = merge(event_shell, message) 35 | 36 | # move any non shell keys to 'details' 37 | for item in message_set: 38 | # enum_keys traverses sub dicts, we only move the top level 39 | # so check if the key is note a core element 40 | # present in the top level and move it to details 41 | if item not in event_shell and item in message: 42 | message["details"][item] = message.get(item) 43 | del message[item] 44 | 45 | return (message, metadata) 46 | -------------------------------------------------------------------------------- /lambdas/normalization_plugins/gsuite_login.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import sub_dict, enum_keys, dict_match 2 | from utils.dotdict import DotDict 3 | from utils.dates import toUTC 4 | import chevron 5 | 6 | class message(object): 7 | 8 | def __init__(self): 9 | ''' 10 | handle gsuite login activity record 11 | ''' 12 | 13 | self.registration = ['kind'] 14 | self.priority = 20 15 | 16 | def onMessage(self, message, metadata): 17 | # for convenience, make a dot dict version of the message 18 | dot_message=DotDict(message) 19 | 20 | # double check that this is our target message 21 | if 'admin#reports#activity' not in dot_message.get('details.kind','')\ 22 | or 'id' not in message.get('details','') \ 23 | or 'etag' not in message.get('details',''): 24 | return(message, metadata) 25 | 26 | message["source"]="gsuite" 27 | message["tags"].append("gsuite") 28 | 29 | # clean up ipaddress field 30 | if 'ipaddress' in message['details']: 31 | message['details']['sourceipaddress']=message['details']['ipaddress'] 32 | del message['details']['ipaddress'] 33 | 34 | # set the actual time 35 | if dot_message.get("details.id.time",None): 36 | message['utctimestamp']=toUTC(message['details']['id']['time']).isoformat() 37 | 38 | # set the user_name 39 | if dot_message.get("details.actor.email",None): 40 | message["details"]["user"]=dot_message.get("details.actor.email","") 41 | 42 | # set summary 43 | message["summary"]=chevron.render("{{details.user}} {{details.events.0.name}} from IP {{details.sourceipaddress}}",message) 44 | 45 | 46 | # set category 47 | message['category']="authentication" 48 | 49 | #success/failure 50 | if 'fail' in message["summary"]: 51 | message["details"]["success"]=False 52 | if 'success' in message["summary"]: 53 | message["details"]["success"]=True 54 | 55 | #suspicious? 
56 | suspicious={"boolvalue":True,"name":"is_suspicious"} 57 | for e in dot_message.get("details.events",[]): 58 | for p in e.get("parameters",[]): 59 | if dict_match(suspicious,p): 60 | message["details"]["suspicious"]=True 61 | 62 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/ip_addresses.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, getValueByPath, find_keys 2 | from utils.dotdict import DotDict 3 | from utils.helpers import is_ip 4 | 5 | 6 | class message(object): 7 | def __init__(self): 8 | """ 9 | takes an incoming message 10 | discovers ip addresses and 11 | normalizes the field names (source/destination) 12 | """ 13 | 14 | self.registration = ["*"] 15 | self.priority = 20 16 | 17 | def onMessage(self, message, metadata): 18 | # help ourselves to a dot dict and list of keys 19 | message = DotDict(message) 20 | message_keys = list(enum_keys(message)) 21 | 22 | # all the ips we encounter along the way 23 | all_ips = [] 24 | 25 | # search for source ip address 26 | # likely places for a source IP 27 | likely_source_fields = [ 28 | "src", 29 | "srcaddr", 30 | "srcip", 31 | "src_ip", 32 | "source_ip", 33 | "sourceipaddress", 34 | "source_ip_address", 35 | "c-ip", 36 | "clientip", 37 | "remoteip", 38 | "remote_ip", 39 | "remoteaddr", 40 | "remote_host_ip_address", 41 | "ipaddress", 42 | "ip_address", 43 | "ipaddr", 44 | "id_orig_h", 45 | "x-forwarded-for", 46 | "http-x-forwarded-for", 47 | ] 48 | 49 | likely_destination_fields = [ 50 | "dst", 51 | "dstip", 52 | "dst_ip", 53 | "dstaddr", 54 | "dest", 55 | "destaddr", 56 | "dest_ip", 57 | "destination_ip", 58 | "destinationipaddress", 59 | "destination_ip_address", 60 | "id_resp_h", 61 | "serverip", 62 | ] 63 | # lets find a source 64 | # first match wins 65 | try: 66 | for field in likely_source_fields: 67 | if field in message_keys: 68 | # do we already have one? 69 | if not getValueByPath(message, "details.sourceipaddress"): 70 | # search the message for any instance of this field 71 | # a list since it could appear multiple times 72 | source_ips = list(find_keys(message, field)) 73 | for ip in source_ips: 74 | if "," in ip: 75 | # some fields like x-forwarded can include multiple IPs 76 | # get the first one 77 | ip = ip.split(",")[0].strip() 78 | if is_ip(ip): 79 | message.details.sourceipaddress = ip 80 | # first one wins 81 | # raise an error to break both for loops 82 | raise StopIteration 83 | except StopIteration: 84 | pass 85 | 86 | # harvest the result or existing source ip 87 | source_ip_address = getValueByPath(message, "details.sourceipaddress") 88 | if source_ip_address: 89 | if is_ip(source_ip_address): 90 | all_ips.append(source_ip_address) 91 | else: 92 | # hrm, there's an entry here that's not an ip 93 | # sometimes cloudtrail does this (config.amazonaws.com ) 94 | # and also sets a useragent field to the same 95 | if getValueByPath(message, "details.sourceipaddress") == getValueByPath( 96 | message, "details.useragent" 97 | ): 98 | del message.details.sourceipaddress 99 | 100 | # lets find a destination 101 | # first match wins 102 | try: 103 | for field in likely_destination_fields: 104 | if field in message_keys: 105 | # do we already have one? 
106 | if not getValueByPath(message, "details.destinationipaddress"): 107 | # search the message for any instance of this field 108 | # a list since it could appear multiple times 109 | destination_ips = list(find_keys(message, field)) 110 | for ip in destination_ips: 111 | if is_ip(ip): 112 | message.details.destinationipaddress = ip 113 | # first one wins 114 | # raise an error to break both for loops 115 | raise StopIteration 116 | except StopIteration: 117 | pass 118 | 119 | # harvest the result or existing destination ip 120 | destination_ip_address = getValueByPath(message, "details.destinationipaddress") 121 | if destination_ip_address and is_ip(destination_ip_address): 122 | all_ips.append(destination_ip_address) 123 | 124 | # save all the ips we found along the way 125 | # in details._ipaddresses as a list 126 | if all_ips: 127 | if not getValueByPath(message, "details._ipaddresses"): 128 | message.details._ipaddresses = all_ips 129 | else: 130 | if isinstance(message.details._ipaddresses, list): 131 | for ip in all_ips: 132 | if ip not in message.details._ipaddresses: 133 | message.details._ipaddresses.append(ip) 134 | 135 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/lowercase_keys.py: -------------------------------------------------------------------------------- 1 | class message(object): 2 | 3 | def __init__(self): 4 | ''' 5 | takes an incoming message 6 | and sets the keys to lowercase 7 | ''' 8 | 9 | self.registration = ['*'] 10 | self.priority = 1 11 | 12 | def onMessage(self, message, metadata): 13 | def lower_key(in_dict): 14 | if isinstance(in_dict,dict): 15 | out_dict = {} 16 | for key, item in in_dict.items(): 17 | out_dict[key.lower()] = lower_key(item) 18 | return out_dict 19 | elif isinstance(in_dict,list): 20 | return [lower_key(obj) for obj in in_dict] 21 | else: 22 | return in_dict 23 | 24 | message = lower_key(message) 25 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/normalization_plugins/timestamps.py: -------------------------------------------------------------------------------- 1 | from utils.dict_helpers import enum_keys, getValueByPath, find_keys 2 | from utils.dotdict import DotDict 3 | from utils.dates import toUTC, utcnow 4 | from datetime import datetime 5 | import logging 6 | 7 | logger = logging.getLogger() 8 | 9 | # likely timestamp fields 10 | likely_timestamp_fields = [ 11 | "timestamp", 12 | "@timestamp", 13 | "time", 14 | "eventtime", 15 | "start", 16 | ] 17 | 18 | 19 | class message(object): 20 | def __init__(self): 21 | """ 22 | takes an incoming message 23 | discovers timestamps 24 | normalizes the format and updates utctimestamp 25 | appends _utcprocessedtimestamp 26 | """ 27 | 28 | # register for all events 29 | # so we can add the processed timestamp metadata field 30 | self.registration = ["*"] 31 | self.priority = 20 32 | 33 | def onMessage(self, message, metadata): 34 | # help ourselves to a dot dict and list of keys 35 | message = DotDict(message) 36 | message_keys = list(enum_keys(message)) 37 | 38 | try: 39 | for field in likely_timestamp_fields: 40 | if field in message_keys: 41 | timestamps = list(find_keys(message, field)) 42 | if field == "time" and "date" in message_keys: 43 | # combine date and time for a timestamp 44 | dates = list(find_keys(message, "date")) 45 | if dates: 46 | # setup a new list for the zipped results 47 | date_timestamps = [] 48 | for i in 
zip(dates, timestamps): 49 | date_timestamps.append(f"{i[0]} {i[1]}") 50 | 51 | if date_timestamps: 52 | # replace the original list 53 | # with this list of date + time 54 | timestamps = date_timestamps 55 | 56 | for timestamp in timestamps: 57 | utctimestamp = "" 58 | try: 59 | utctimestamp = toUTC(timestamp) 60 | except Exception as e: 61 | logger.error( 62 | f"exception {e} while converting {timestamp} to utc" 63 | ) 64 | pass 65 | if isinstance(utctimestamp, datetime): 66 | message["utctimestamp"] = utctimestamp.isoformat() 67 | # first match wins 68 | raise StopIteration 69 | 70 | except StopIteration: 71 | pass 72 | 73 | # append processed timestamp as metadata 74 | message["details"]["_utcprocessedtimestamp"] = utcnow().isoformat() 75 | 76 | return (message, metadata) -------------------------------------------------------------------------------- /lambdas/processor.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import base64 4 | import json 5 | from json import JSONDecodeError 6 | from io import StringIO 7 | from utils.dotdict import DotDict 8 | from utils.plugins import send_event_to_plugins, register_plugins 9 | from utils.helpers import is_cloudtrail, generate_metadata, emit_json_block, chunks 10 | from utils.dict_helpers import merge 11 | import logging 12 | 13 | logger = logging.getLogger() 14 | logger.setLevel(logging.INFO) 15 | 16 | 17 | def lambda_handler(event, context): 18 | output = [] 19 | metadata = generate_metadata(context) 20 | logger.debug(f"metadata is: {metadata}") 21 | normalization_plugins = register_plugins("normalization_plugins") 22 | enrichment_plugins = register_plugins("enrichment_plugins") 23 | 24 | if "records" in event: 25 | for record in event["records"]: 26 | output_record = {} 27 | logger.debug(f"found record in event: {record}") 28 | payload = base64.b64decode(record["data"]) 29 | 30 | payload_dict = None 31 | try: 32 | # load the json we have from either a .json file or a gunziped file 33 | payload_dict = json.loads(payload) 34 | except JSONDecodeError as e: 35 | # file isn't well formed json, see if we can interpret json from it 36 | logger.error(f"payload is not valid json decode error {e}") 37 | 38 | if payload_dict: 39 | # normalize it 40 | result_record, metadata = send_event_to_plugins( 41 | payload_dict, metadata, normalization_plugins 42 | ) 43 | # enrich it 44 | result_record, metadata = send_event_to_plugins( 45 | result_record, metadata, enrichment_plugins 46 | ) 47 | if result_record: 48 | # TODO, what to do with lambda info as metadata? Do we care? 
49 | # result_record = merge(result_record, metadata) 50 | logger.debug(f" resulting norm/enriched is: {result_record}") 51 | # json ending in new line so athena recognizes the records 52 | output_record = { 53 | "recordId": record["recordId"], 54 | "result": "Ok", 55 | "data": base64.b64encode( 56 | json.dumps(result_record).encode("utf-8") + b"\n" 57 | ).decode("utf-8"), 58 | } 59 | else: 60 | # result as None, means drop the record 61 | # TODO, what is the right result in firehose terms 62 | logger.error(f"record {record['recordId']} failed processing") 63 | output_record = { 64 | "recordId": record["recordId"], 65 | "result": "ProcessingFailed", 66 | "data": record["data"], 67 | } 68 | else: 69 | logger.error( 70 | f"record {record['recordId']} failed processing, no resulting dict" 71 | ) 72 | output_record = { 73 | "recordId": record["recordId"], 74 | "result": "ProcessingFailed", 75 | "data": record["data"], 76 | } 77 | 78 | output.append(output_record) 79 | 80 | logger.info("Processed {} records.".format(len(event["records"]))) 81 | 82 | return {"records": output} 83 | else: 84 | logger.info(f"no records found in {event} with context: {context}") 85 | 86 | -------------------------------------------------------------------------------- /lambdas/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | astroid==2.4.2 3 | attrs==20.1.0 4 | black==20.8b1 5 | boto3==1.14.48 6 | botocore==1.17.48 7 | certifi==2020.6.20 8 | chardet==3.0.4 9 | chevron @ https://github.com/noahmorrison/chevron/archive/master.zip 10 | click==7.1.2 11 | docker==4.3.1 12 | docutils==0.15.2 13 | future==0.18.2 14 | idna==2.10 15 | iniconfig==1.0.1 16 | isort==5.4.2 17 | jmespath==0.10.0 18 | lazy-object-proxy==1.4.3 19 | mccabe==0.6.1 20 | more-itertools==8.4.0 21 | mypy-extensions==0.4.3 22 | netaddr==0.8.0 23 | numpy==1.19.1 24 | packaging==20.4 25 | pandas==1.1.1 26 | pathspec==0.8.0 27 | pluggy==0.13.1 28 | py==1.9.0 29 | PyAthena==1.11.1 30 | pylint==2.6.0 31 | pynsive==0.2.7 32 | pyparsing==3.0.0a2 33 | pytest==6.0.1 34 | python-dateutil==2.8.1 35 | pytz==2020.1 36 | PyYAML==5.3.1 37 | regex==2020.7.14 38 | requests==2.24.0 39 | s3transfer==0.3.3 40 | six==1.15.0 41 | tenacity==6.2.0 42 | toml==0.10.1 43 | typed-ast==1.4.1 44 | typing-extensions==3.7.4.3 45 | tzlocal==2.1 46 | urllib3==1.25.10 47 | websocket-client==0.57.0 48 | wrapt==1.12.1 49 | zipp==3.1.0 50 | -------------------------------------------------------------------------------- /lambdas/s3_to_firehose.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import gzip 3 | import json 4 | import logging 5 | import os 6 | from time import sleep 7 | from io import BytesIO, TextIOWrapper, StringIO 8 | from utils.dotdict import DotDict 9 | from utils.helpers import is_cloudtrail, generate_metadata, emit_json_block, chunks 10 | from json import JSONDecodeError 11 | 12 | logger = logging.getLogger() 13 | logger.setLevel(logging.INFO) 14 | FIREHOSE_DELIVERY_STREAM = os.environ.get( 15 | "FIREHOSE_DELIVERY_STREAM", "defenda_data_lake_s3_stream" 16 | ) 17 | FIREHOSE_BATCH_SIZE = os.environ.get("FIREHOSE_BATCH_SIZE", 100) 18 | 19 | 20 | def send_to_firehose(records): 21 | f_hose = boto3.client("firehose") 22 | 23 | # records should be a list of dicts 24 | response = None 25 | if type(records) is list: 26 | # batch up the list below the limits of firehose 27 | for batch in chunks(records, FIREHOSE_BATCH_SIZE): 28 | response = 
f_hose.put_record_batch( 29 | DeliveryStreamName=FIREHOSE_DELIVERY_STREAM, 30 | Records=[ 31 | {"Data": bytes(str(json.dumps(record) + "\n").encode("UTF-8"))} 32 | for record in batch 33 | ], 34 | ) 35 | logger.debug("firehose response is: {}".format(response)) 36 | 37 | 38 | def lambda_handler(event, context): 39 | """ 40 | Called on a PUT to s3 41 | Make every attempt to read in json records 42 | from the s3 source 43 | """ 44 | metadata = generate_metadata(context) 45 | logger.debug("Event is: {}".format(event)) 46 | 47 | # make the event easier to traverse 48 | event = DotDict(event) 49 | 50 | # test harnesses 51 | if event == {"test": "true"}: 52 | return {"Hello": "from s3_to_firehose"} 53 | elif event == {"metadata": "name"}: 54 | return metadata 55 | elif "Records" in event: 56 | # should be triggered by s3 Put/Object created events 57 | s3 = boto3.client("s3") 58 | for record in event.Records: 59 | record = DotDict(record) 60 | s3_bucket = record.s3.bucket.name 61 | s3_key = record.s3.object.key 62 | # a new bucket will fire for folders *and* files, early exit if it's a folder 63 | if s3_key.endswith("/"): 64 | continue 65 | # assume the file is just good ol json 66 | source = "s3json" 67 | # if the file name is cloudtrail-ish 68 | if is_cloudtrail(s3_key): 69 | source = "cloudtrail" 70 | # up to 5 attempts to get the object ( in case s3 file commit on write is lagging) 71 | s3_response = None 72 | for x in range(1, 6): 73 | try: 74 | s3_response = s3.get_object(Bucket=s3_bucket, Key=s3_key) 75 | break 76 | except Exception as e: 77 | logger.error( 78 | f"Attempt {x}: {e} while attempting to get_object {s3_bucket} {s3_key}" 79 | ) 80 | sleep(1) 81 | continue 82 | if not s3_response: 83 | logger.error( 84 | f"5 attempts to retrieve {s3_bucket} {s3_key} failed, moving on" 85 | ) 86 | continue 87 | s3_data = "" 88 | # gunzip if zipped 89 | if s3_key[-3:] == ".gz": 90 | s3_raw_data = s3_response["Body"].read() 91 | with gzip.GzipFile(fileobj=BytesIO(s3_raw_data)) as gzip_stream: 92 | s3_data += "".join(TextIOWrapper(gzip_stream, encoding="utf-8")) 93 | else: 94 | s3_data = s3_response["Body"].read().decode("utf-8") 95 | 96 | # create our list of records to append out findings to 97 | s3_records = [] 98 | s3_dict = None 99 | try: 100 | # load the json we have from either a .json file or a gunziped file 101 | s3_dict = json.loads(s3_data) 102 | except JSONDecodeError: 103 | # file isn't well formed json, see if we can interpret json from it 104 | for block in emit_json_block(StringIO(s3_data)): 105 | if block: 106 | record = json.loads(block) 107 | record["source"] = source 108 | s3_records.append(record) 109 | # if this is a dict of a single 'Records' list, unroll the list into 110 | # it's sub records 111 | if s3_dict and "Records" in s3_dict: 112 | if type(s3_dict["Records"]) is list: 113 | for record in s3_dict["Records"]: 114 | record["source"] = source 115 | s3_records.append(record) 116 | # maybe it's just a list already? 
117 | elif s3_dict and type(s3_dict) is list: 118 | # a list of dicts 119 | for record in s3_dict: 120 | record["source"] = source 121 | s3_records.append(record) 122 | elif s3_dict and type(s3_dict) is dict: 123 | # a single dict, but lets add it to a list 124 | # for consistent handling 125 | s3_dict["source"] = source 126 | s3_records.append(s3_dict) 127 | 128 | logger.debug("pre-plugins s3_records is: {}".format(s3_records)) 129 | # send off to firehose for further processing 130 | if s3_records: 131 | send_to_firehose(s3_records) 132 | 133 | return 134 | -------------------------------------------------------------------------------- /lambdas/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/tests/__init__.py -------------------------------------------------------------------------------- /lambdas/tests/logging_config.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | formatters: 3 | simple: 4 | format: '%(asctime)s - %(module)s - %(levelname)s - %(message)s' 5 | datefmt: '%Y-%m-%d %H:%M:%S %Z' 6 | handlers: 7 | console: 8 | class: logging.StreamHandler 9 | level: INFO 10 | formatter: simple 11 | stream: ext://sys.stdout 12 | loggers: 13 | sampleLogger: 14 | level: DEBUG 15 | handlers: [console] 16 | propagate: no 17 | root: 18 | level: INFO 19 | handlers: [console] -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_cloudfront_wordpress_probe.json: -------------------------------------------------------------------------------- 1 | { 2 | "date": "2020-09-01", 3 | "time": "17:48:18", 4 | "x-edge-location": "MAA50-C1", 5 | "sc-bytes": 587, 6 | "c-ip": "139.59.66.23", 7 | "cs-method": "GET", 8 | "cs(Host)": "c4ixl1pp8t7hvm.cloudfront.net", 9 | "cs-uri-stem": "/wp-login.php", 10 | "sc-status": 301, 11 | "cs(Referer)": "-", 12 | "cs(User-Agent)": "Mozilla/5.0%20(X11;%20Ubuntu;%20Linux%20x86_64;%20rv:62.0)%20Gecko/20100101%20Firefox/62.0", 13 | "cs-uri-query": "-", 14 | "cs(Cookie)": "-", 15 | "x-edge-result-type": "Redirect", 16 | "x-edge-request-id": "JvTXFvqgmlUuUYTFWBShrvVZiMTWLZRsuMcaOx39DLmthLmoNaijww==", 17 | "x-host-header": "somewhere.com", 18 | "cs-protocol": "http", 19 | "cs-bytes": 184, 20 | "time-taken": 0.0, 21 | "x-forwarded-for": "-", 22 | "ssl-protocol": "-", 23 | "ssl-cipher": "-", 24 | "x-edge-response-result-type": "Redirect", 25 | "cs-protocol-version": "HTTP/1.1", 26 | "fle-status": "-", 27 | "fle-encrypted-fields": "-", 28 | "c-port": 44276, 29 | "time-to-first-byte": 0.0, 30 | "x-edge-detailed-result-type": "Redirect", 31 | "sc-content-type": "text/html", 32 | "sc-content-len": 183, 33 | "sc-range-start": "-", 34 | "sc-range-end": "-" 35 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_cloudtrail_create_log_stream.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": "cloudtrail", 3 | "tags": [], 4 | "details": { 5 | "apiversion": "20140328", 6 | "eventtype": "AwsApiCall", 7 | "recipientaccountid": "123456789012", 8 | "responseelements": null, 9 | "requestparameters": { 10 | "loggroupname": "/aws/lambda/some_lambda", 11 | "logstreamname": "2019/09/04/[$LATEST]1759dcd0266b4e28a55147e10c28e984" 12 | }, 13 | "eventid": "2163d086-baa4-4203-a267-97b3a872c651", 14 | "eventsource": 
"logs.amazonaws.com", 15 | "useragent": "awslambda-worker", 16 | "eventname": "CreateLogStream", 17 | "eventversion": "1.05", 18 | "sourceipaddress": "54.21.12.27", 19 | "requestid": "0fb46c32-fd7c-4121-8eb7-7fa10670bc4b", 20 | "useridentity": { 21 | "type": "AssumedRole", 22 | "principalid": "AROAIQ45SXVRIH72NM:some_lambda", 23 | "arn": "arn:aws:sts::123456789012:assumed-role/some_lambda-us-west-2-lambdaRole/some_lambda", 24 | "accountid": "123456789012", 25 | "accesskeyid": "AROAIQ45SXVRIH72NM", 26 | "sessioncontext": { 27 | "attributes": { 28 | "mfaauthenticated": "false", 29 | "creationdate": "2019-09-04T17:01:34Z" 30 | }, 31 | "sessionissuer": { 32 | "type": "Role", 33 | "principalid": "AROAIQ45SXVRIH72NM", 34 | "arn": "arn:aws:iam::123456789012:role/some_lambda-us-west-2-lambdaRole", 35 | "accountid": "123456789012", 36 | "username": "some_lambda-us-west-2-lambdaRole" 37 | } 38 | } 39 | }, 40 | "lambda_details": { 41 | "function_version": "$LATEST", 42 | "function_arn": "arn:aws:lambda:us-west-2:123456789012:function:some_lambda", 43 | "function_name": "some_lambda", 44 | "memory_size": "1024" 45 | }, 46 | "awsregion": "us-west-2", 47 | "eventtime": "2019-09-04T17:54:59Z" 48 | } 49 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_gsuite_login_event.json: -------------------------------------------------------------------------------- 1 | { 2 | "kind": "admin#reports#activity", 3 | "id": { 4 | "time": "2020-03-02T17:54:33.253Z", 5 | "uniqueQualifier": "123456193837", 6 | "applicationName": "login", 7 | "customerId": "123456bbh" 8 | }, 9 | "etag": "\"12345684sebSczDxOtZ17CIssbQ/fcUkSWOHV-mPDcYGbkgHvS5ghwg\"", 10 | "actor": { 11 | "email": "someone@somewhere.com", 12 | "profileId": "123456359252796690369" 13 | }, 14 | "ipAddress": "123.456.253.226", 15 | "events": [ 16 | { 17 | "type": "login", 18 | "name": "login_success", 19 | "parameters": [ 20 | { 21 | "name": "login_type", 22 | "value": "exchange" 23 | }, 24 | { 25 | "name": "login_challenge_method", 26 | "multiValue": [ 27 | "none" 28 | ] 29 | }, 30 | { 31 | "name": "is_suspicious", 32 | "boolValue": false 33 | } 34 | ] 35 | } 36 | ] 37 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_syslog_sudo.json: -------------------------------------------------------------------------------- 1 | { 2 | "category": "monitoring", 3 | "severity": "INFO", 4 | "utctimestamp": "2014-04-17T06:10:54+00:00", 5 | "summary": " nagios : TTY=unknown ; PWD=/ ; USER=root ; COMMAND=/usr/lib64/nagios/plugins/custom/check_auditd.sh\n", 6 | "source": "syslog", 7 | "tags": [ 8 | "sample" 9 | ], 10 | "details": { 11 | "processid": "123", 12 | "program": "sudo", 13 | "hostname": "something.example.com", 14 | "timestamp": "Apr 17 06:10:54" 15 | } 16 | } -------------------------------------------------------------------------------- /lambdas/tests/samples/sample_vpc_flow_log.json: -------------------------------------------------------------------------------- 1 | { 2 | "account_id": "123456789010", 3 | "action": "ACCEPT", 4 | "bytes": 840, 5 | "dstaddr": "192.0.2.1", 6 | "dstport": 49152, 7 | "end": "2014-12-14T04:07:50", 8 | "interface_id": "eni-102010ab", 9 | "log_status": "OK", 10 | "packets": 10, 11 | "protocol": 6, 12 | "srcaddr": "198.51.100.1", 13 | "srcport": 443, 14 | "start": "2014-12-14T04:06:50", 15 | "version": 2 16 | } -------------------------------------------------------------------------------- 
/lambdas/tests/test_core.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from io import BytesIO 3 | from subprocess import PIPE, Popen 4 | from pkg_resources import parse_version 5 | import pytest 6 | import yaml 7 | from datetime import timezone 8 | import datetime 9 | from utils.plugins import send_event_to_plugins, register_plugins 10 | from utils.helpers import is_cloudtrail, generate_metadata, short_uuid 11 | from utils.helpers import is_ip, isIPv4, isIPv6 12 | from utils.dict_helpers import ( 13 | merge, 14 | find_keys, 15 | enum_values, 16 | enum_keys, 17 | sub_dict, 18 | dict_match, 19 | getValueByPath, 20 | dictpath, 21 | ) 22 | from utils.dotdict import DotDict 23 | from utils.dates import toUTC, get_date_parts 24 | from pathlib import Path 25 | import logging, logging.config 26 | 27 | print("setting up logging") 28 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 29 | with open(logging_config_file_path, "r") as fd: 30 | logging_config = yaml.safe_load(fd) 31 | logging.config.dictConfig(logging_config) 32 | global logger 33 | logger = logging.getLogger() 34 | logger.info("logging established") 35 | 36 | 37 | class TestCore(object): 38 | def test_cloudtrail_file_identification(self): 39 | filename = "AWSLogs/722455710680/CloudTrail/us-west-2/2019/09/20/722455710680_CloudTrail_us-west-2_20190920T0000Z_2AKE4AyQfcPRcIoa.json.gz" 40 | assert is_cloudtrail(filename) == True 41 | filename = "not cloudtrailfile.json.gz" 42 | assert is_cloudtrail(filename) == False 43 | 44 | def test_lambda_metadata_generation(self): 45 | lambda_context = { 46 | "function_version": "$LATEST", 47 | "invoked_function_arn": "arn:aws:lambda:us-west-2:722455710680:function:processor-prod", 48 | "function_name": "processor-prod", 49 | "memory_limit_in_mb": "1024", 50 | } 51 | lambda_context = DotDict(lambda_context) 52 | result = generate_metadata(lambda_context) 53 | assert type(result.lambda_details) == type(lambda_context) 54 | assert "function_version" in result.lambda_details 55 | assert "function_arn" in result.lambda_details 56 | assert "function_name" in result.lambda_details 57 | assert "memory_size" in result.lambda_details 58 | 59 | def test_short_uuid(self): 60 | assert len(short_uuid()) == 8 61 | 62 | def test_to_utc(self): 63 | assert toUTC("Jan 1 12am 2020 UTC") == datetime.datetime( 64 | 2020, 1, 1, 0, 0, tzinfo=timezone.utc 65 | ) 66 | assert toUTC("Jan 1 12am 2020 UTC").isoformat() == "2020-01-01T00:00:00+00:00" 67 | 68 | def test_get_date_parts(self): 69 | parts = get_date_parts() 70 | assert len(parts) == 8 71 | 72 | def test_dictpath(self): 73 | assert list(dictpath("key.value")) == ["key", "value"] 74 | 75 | def test_get_value_by_path(self): 76 | assert getValueByPath({"key": "value"}, "key") == "value" 77 | assert getValueByPath({"key": {"key": "value"}}, "key.key") == "value" 78 | assert ( 79 | getValueByPath({"key": {"key": {"key": "value"}}}, "key.key.key") == "value" 80 | ) 81 | 82 | def test_ip_helpers(self): 83 | assert is_ip("127.0.0.1") 84 | assert is_ip("127.0.0.1/32") 85 | assert is_ip("127") == False 86 | assert is_ip("1") == False 87 | assert is_ip("1278.1.1.1.1") == False 88 | assert is_ip("fe80::") 89 | assert is_ip("fe80::/10") 90 | assert isIPv4("127.0.0.1") 91 | assert isIPv4("127.0.0.1/32") == False 92 | assert isIPv6("fe80::") 93 | assert isIPv6("::ffff:192.0.2.15") 94 | assert isIPv6(":ffff:192.0.2.15") == False 95 | 96 | def test_merge(self): 97 | dict1 = {"some_key": "some 
value"} 98 | dict2 = {"some_other_key": "some other value"} 99 | dict3 = merge(dict1, dict2) 100 | assert dict3 == {"some_key": "some value", "some_other_key": "some other value"} 101 | 102 | def test_find_keys(self): 103 | complex_dict1 = { 104 | "some_key": "some value", 105 | "sub_key": {"some_key": "some other value"}, 106 | } 107 | result = list(find_keys(complex_dict1, "some_key")) 108 | assert result == ["some value", "some other value"] 109 | 110 | def test_enum_values(self): 111 | complex_dict1 = { 112 | "some_key": "some value", 113 | "sub_key": {"some_key": "some other value"}, 114 | } 115 | result = list(enum_values(complex_dict1)) 116 | assert result == ["some value", "some other value"] 117 | 118 | def test_enum_keys(self): 119 | complex_dict1 = { 120 | "some_key": "some value", 121 | "sub_key": {"some_key": "some other value"}, 122 | } 123 | result = list(enum_keys(complex_dict1)) 124 | assert result == ["some_key", "sub_key", "some_key"] 125 | 126 | def test_sub_dict(self): 127 | complex_dict1 = { 128 | "some_key": "some value", 129 | "sub_key": {"some_key": "some other value"}, 130 | } 131 | result = sub_dict(complex_dict1, ["some_key"], "nothing") 132 | assert result == {"some_key": "some value"} 133 | result = sub_dict(complex_dict1, ["sub_key.some_key"], "nothing") 134 | assert result == {"sub_key.some_key": "nothing"} 135 | complex_dot_dict = DotDict(complex_dict1) 136 | result = sub_dict(complex_dot_dict, ["sub_key.some_key"], "nothing") 137 | assert result == {"sub_key.some_key": "some other value"} 138 | result = sub_dict(complex_dot_dict, ["some_key", "sub_key.some_key"]) 139 | assert result == { 140 | "some_key": "some value", 141 | "sub_key.some_key": "some other value", 142 | } 143 | 144 | def test_dict_match(self): 145 | complex_dict1 = { 146 | "some_key": "some value", 147 | "sub_key": {"some_key": "some other value"}, 148 | } 149 | assert dict_match({"some_key": "some value"}, complex_dict1) 150 | complex_dot_dict = DotDict(complex_dict1) 151 | assert dict_match({"sub_key.some_key": "some other value"}, complex_dot_dict) 152 | assert ( 153 | dict_match({"sub_key.some_key": "not some other value"}, complex_dot_dict) 154 | == False 155 | ) 156 | -------------------------------------------------------------------------------- /lambdas/tests/test_plugin_gsuite_logins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | 10 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 11 | with open(logging_config_file_path, "r") as fd: 12 | logging_config = yaml.safe_load(fd) 13 | logging.config.dictConfig(logging_config) 14 | global logger 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestPluginGsuiteLogins(object): 19 | def setup(self): 20 | from normalization_plugins.gsuite_login import message 21 | 22 | self.plugin = message() 23 | with open("./lambdas/tests/samples/sample_gsuite_login_event.json", "r") as f: 24 | self.inbound_event = json.loads(f.read()) 25 | # run the event through default plugins 26 | # to set the shell and lowercase all keys 27 | from normalization_plugins.event_shell import message as event_shell 28 | from normalization_plugins.lowercase_keys import message as lowercase_keys 29 | 30 | metadata = {"something": "else"} 31 | event = self.inbound_event 32 | event, metadata = 
event_shell().onMessage(event, metadata) 33 | event, metadata = lowercase_keys().onMessage(event, metadata) 34 | self.normalized_event = event 35 | 36 | def test_nochange(self): 37 | metadata = {"something": "else"} 38 | # use the native raw event 39 | event = self.inbound_event 40 | result, metadata = self.plugin.onMessage(event, metadata) 41 | # in = out - plugin didn't modify it 42 | # since it doesn't match the normalized format 43 | assert result == event 44 | 45 | def test_structure(self): 46 | metadata = {"something": "else"} 47 | # use the normalized event 48 | event = self.normalized_event 49 | result, metadata = self.plugin.onMessage(event, metadata) 50 | assert "utctimestamp" in result 51 | assert "severity" in result 52 | assert "summary" in result 53 | assert "category" in result 54 | assert "source" in result 55 | assert "tags" in result 56 | assert "plugins" in result 57 | assert "details" in result 58 | 59 | def test_values(self): 60 | metadata = {"something": "else"} 61 | # use the normalized event 62 | event = self.normalized_event 63 | result, metadata = self.plugin.onMessage(event, metadata) 64 | logger.debug(result) 65 | assert ( 66 | result["summary"] 67 | == "someone@somewhere.com login_success from IP 123.456.253.226" 68 | ) 69 | assert result["details"]["sourceipaddress"] == "123.456.253.226" 70 | assert result["category"] == "authentication" 71 | assert result["source"] == "gsuite" 72 | assert result["details"]["success"] == True -------------------------------------------------------------------------------- /lambdas/tests/test_plugin_ip_addresses.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | 10 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 11 | with open(logging_config_file_path, "r") as fd: 12 | logging_config = yaml.safe_load(fd) 13 | logging.config.dictConfig(logging_config) 14 | global logger 15 | logger = logging.getLogger() 16 | 17 | 18 | class TestPluginIpAddresses(object): 19 | def setup(self): 20 | from normalization_plugins.ip_addresses import message 21 | 22 | self.plugin = message() 23 | self.inbound_events = [] 24 | self.normalized_events = [] 25 | with open( 26 | "./lambdas/tests/samples/sample_cloudtrail_create_log_stream.json", "r" 27 | ) as f: 28 | self.inbound_events.append(json.loads(f.read())) 29 | with open( 30 | "./lambdas/tests/samples/sample_cloudfront_wordpress_probe.json", "r" 31 | ) as f: 32 | self.inbound_events.append(json.loads(f.read())) 33 | with open("./lambdas/tests/samples/sample_vpc_flow_log.json", "r") as f: 34 | self.inbound_events.append(json.loads(f.read())) 35 | # run the event through default plugins 36 | # to set the shell and lowercase all keys 37 | from normalization_plugins.event_shell import message as event_shell 38 | from normalization_plugins.lowercase_keys import message as lowercase_keys 39 | 40 | metadata = {"something": "else"} 41 | for event in self.inbound_events: 42 | event, metadata = event_shell().onMessage(event, metadata) 43 | event, metadata = lowercase_keys().onMessage(event, metadata) 44 | self.normalized_events.append(event) 45 | 46 | def test_nochange(self): 47 | metadata = {"something": "else"} 48 | event = {} 49 | # use an event without an ip 50 | # to test if the plugin is benign when it should not act 51 | with 
open("./lambdas/tests/samples/sample_syslog_sudo.json", "r") as f: 52 | event = json.loads(f.read()) 53 | # make sure we have a valid, populated dict 54 | assert len(event.keys()) 55 | result, metadata = self.plugin.onMessage(event, metadata) 56 | # in = out - plugin didn't modify it 57 | # since it doesn't match the normalized format 58 | # and won't find an ip field under 'details' 59 | 60 | assert result == event 61 | 62 | def test_structure(self): 63 | metadata = {"something": "else"} 64 | # use the normalized event 65 | for event in self.normalized_events: 66 | result, metadata = self.plugin.onMessage(event, metadata) 67 | assert "utctimestamp" in result 68 | assert "severity" in result 69 | assert "summary" in result 70 | assert "category" in result 71 | assert "source" in result 72 | assert "tags" in result 73 | assert "plugins" in result 74 | assert "details" in result 75 | # we should have a source or destination for these events 76 | assert ( 77 | "sourceipaddress" in result["details"] 78 | or "destinationipaddress" in result["details"] 79 | ) 80 | 81 | def test_values(self): 82 | metadata = {"something": "else"} 83 | # use normalized events 84 | # we know the end result for 85 | event = self.normalized_events[0] 86 | result, metadata = self.plugin.onMessage(event, metadata) 87 | logger.debug(result) 88 | assert result["details"]["sourceipaddress"] == "54.21.12.27" 89 | assert "54.21.12.27" in result["details"]["_ipaddresses"] 90 | 91 | event = self.normalized_events[1] 92 | result, metadata = self.plugin.onMessage(event, metadata) 93 | logger.debug(result) 94 | assert result["details"]["sourceipaddress"] == "139.59.66.23" 95 | assert "139.59.66.23" in result["details"]["_ipaddresses"] 96 | 97 | event = self.normalized_events[2] 98 | result, metadata = self.plugin.onMessage(event, metadata) 99 | logger.debug(result) 100 | assert result["details"]["sourceipaddress"] == "198.51.100.1" 101 | assert result["details"]["destinationipaddress"] == "192.0.2.1" 102 | assert "192.0.2.1" in result["details"]["_ipaddresses"] 103 | assert "198.51.100.1" in result["details"]["_ipaddresses"] 104 | 105 | def test_invalid_ip_values(self): 106 | """ 107 | purposefully invalidate IP addresses in ip address fields 108 | and make sure the plugin doesn't accept them 109 | """ 110 | metadata = {"something": "else"} 111 | # use normalized events 112 | # we know the end result for 113 | event = self.normalized_events[0] 114 | event["details"]["sourceipaddress"] = "nada" 115 | result, metadata = self.plugin.onMessage(event, metadata) 116 | logger.debug(result) 117 | assert result["details"]["sourceipaddress"] == "nada" 118 | assert "_ipaddresses" not in result["details"] 119 | 120 | event = self.normalized_events[1] 121 | event["details"]["c-ip"] = "1" 122 | result, metadata = self.plugin.onMessage(event, metadata) 123 | logger.debug(result) 124 | assert result["details"]["c-ip"] == "1" 125 | assert result["details"].get("sourceipaddress", None) == None 126 | assert "_ipaddresses" not in result["details"] 127 | 128 | event = self.normalized_events[2] 129 | event["details"]["srcaddr"] = "1320.2555.2555.2555" 130 | result, metadata = self.plugin.onMessage(event, metadata) 131 | logger.debug(result) 132 | assert result["details"]["srcaddr"] == "1320.2555.2555.2555" 133 | assert result["details"].get("sourceipaddress", None) == None 134 | assert "192.0.2.1" in result["details"]["_ipaddresses"] 135 | -------------------------------------------------------------------------------- 
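The ip_addresses plugin exercised above can also be invoked directly. A minimal sketch under the same assumptions as the tests (run from the repository root with lambdas/ on the import path); the addresses are the documentation-range ones from the VPC flow sample:

    from normalization_plugins.ip_addresses import message as ip_addresses

    metadata = {"something": "else"}
    event = {
        "details": {
            "sourceipaddress": "198.51.100.1",
            "destinationipaddress": "192.0.2.1",
        }
    }
    event, metadata = ip_addresses().onMessage(event, metadata)
    # valid source/destination addresses are harvested into details._ipaddresses,
    # so both 198.51.100.1 and 192.0.2.1 should now appear in that list
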
/lambdas/tests/test_plugin_timestamps.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | import logging, logging.config 6 | from pathlib import Path 7 | from utils.dotdict import DotDict 8 | from utils.dates import toUTC 9 | import tzlocal 10 | import os 11 | 12 | logging_config_file_path = Path(__file__).parent.joinpath("logging_config.yml") 13 | with open(logging_config_file_path, "r") as fd: 14 | logging_config = yaml.safe_load(fd) 15 | logging.config.dictConfig(logging_config) 16 | global logger 17 | logger = logging.getLogger() 18 | 19 | os.environ["TZ"] = "UTC" 20 | logger.info(f"using timezone {tzlocal.get_localzone()}") 21 | 22 | 23 | class TestPluginTimestamps(object): 24 | def setup(self): 25 | from normalization_plugins.timestamps import message 26 | 27 | self.plugin = message() 28 | self.inbound_events = [] 29 | self.normalized_events = [] 30 | 31 | with open( 32 | "./lambdas/tests/samples/sample_cloudtrail_create_log_stream.json", "r" 33 | ) as f: 34 | self.inbound_events.append(json.loads(f.read())) 35 | with open( 36 | "./lambdas/tests/samples/sample_cloudfront_wordpress_probe.json", "r" 37 | ) as f: 38 | self.inbound_events.append(json.loads(f.read())) 39 | with open("./lambdas/tests/samples/sample_vpc_flow_log.json", "r") as f: 40 | self.inbound_events.append(json.loads(f.read())) 41 | # run the event through default plugins 42 | # to set the shell and lowercase all keys 43 | from normalization_plugins.event_shell import message as event_shell 44 | from normalization_plugins.lowercase_keys import message as lowercase_keys 45 | 46 | metadata = {"something": "else"} 47 | for event in self.inbound_events: 48 | event, metadata = event_shell().onMessage(event, metadata) 49 | event, metadata = lowercase_keys().onMessage(event, metadata) 50 | self.normalized_events.append(event) 51 | 52 | def test_nochange(self): 53 | 54 | metadata = {"something": "else"} 55 | event = {} 56 | # use an event without an ip 57 | # to test if the plugin is benign when it should not act 58 | with open("./lambdas/tests/samples/sample_syslog_sudo.json", "r") as f: 59 | event = json.loads(f.read()) 60 | # make sure we have a valid, populated dict 61 | assert len(event.keys()) 62 | # remove the timstamp in this event 63 | # that would trigger the plugin 64 | # to see if it passes the no change test 65 | del event["details"]["timestamp"] 66 | result, metadata = self.plugin.onMessage(event, metadata) 67 | # the plugin adds a metadata field 68 | # assert that it worked 69 | assert result["details"]["_utcprocessedtimestamp"] 70 | 71 | # next, remove it for the 72 | # in = out - plugin didn't modify it 73 | # test 74 | del result["details"]["_utcprocessedtimestamp"] 75 | 76 | assert result == event 77 | 78 | def test_structure(self): 79 | metadata = {"something": "else"} 80 | # use the normalized event 81 | for event in self.normalized_events: 82 | result, metadata = self.plugin.onMessage(event, metadata) 83 | assert "severity" in result 84 | assert "summary" in result 85 | assert "category" in result 86 | assert "source" in result 87 | assert "tags" in result 88 | assert "plugins" in result 89 | assert "details" in result 90 | # we should have these valid timestamps 91 | assert "utctimestamp" in result 92 | assert "_utcprocessedtimestamp" in result["details"] 93 | 94 | def test_values(self): 95 | metadata = {"something": "else"} 96 | # use normalized events 97 | # we know the end result for 98 | event = 
self.normalized_events[0] 99 | result, metadata = self.plugin.onMessage(event, metadata) 100 | logger.debug(result) 101 | assert result["utctimestamp"] == "2019-09-04T17:54:59+00:00" 102 | assert result["details"]["_utcprocessedtimestamp"] 103 | 104 | event = self.normalized_events[1] 105 | result, metadata = self.plugin.onMessage(event, metadata) 106 | assert result["utctimestamp"] == "2020-09-01T17:48:18+00:00" 107 | assert result["details"]["_utcprocessedtimestamp"] 108 | 109 | event = self.normalized_events[2] 110 | result, metadata = self.plugin.onMessage(event, metadata) 111 | logger.debug(result) 112 | assert result["utctimestamp"] == "2014-12-14T04:06:50+00:00" 113 | assert result["details"]["_utcprocessedtimestamp"] 114 | 115 | def test_invalid_date_values(self): 116 | """ 117 | purposefully invalidate dates in date fields 118 | and make sure the plugin doesn't accept them 119 | """ 120 | metadata = {"something": "else"} 121 | # use normalized events 122 | # we know the end result for 123 | event = self.normalized_events[0] 124 | event["details"]["eventtime"] = "nada" 125 | result, metadata = self.plugin.onMessage(event, metadata) 126 | logger.debug(result) 127 | assert result["details"]["eventtime"] == "nada" 128 | 129 | event = self.normalized_events[1] 130 | event["details"]["time"] = "nada" 131 | result, metadata = self.plugin.onMessage(event, metadata) 132 | logger.debug(result) 133 | assert result["details"]["time"] == "nada" 134 | 135 | event = self.normalized_events[2] 136 | event["details"]["start"] = "nada" 137 | result, metadata = self.plugin.onMessage(event, metadata) 138 | logger.debug(result) 139 | assert result["details"]["start"] == "nada" -------------------------------------------------------------------------------- /lambdas/tests/test_plugins.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import yaml 3 | import json 4 | import uuid 5 | from utils.dotdict import DotDict 6 | from utils.dates import toUTC 7 | 8 | 9 | class TestLowerCaseKeys(object): 10 | def setup(self): 11 | from normalization_plugins.lowercase_keys import message 12 | 13 | self.plugin = message() 14 | 15 | def test_nochange(self): 16 | metadata = {"something": "else"} 17 | event = {"key1": "syslog", "tags": ["atag"]} 18 | result, metadata = self.plugin.onMessage(event, metadata) 19 | # in = out - plugin didn't modify it 20 | assert result == event 21 | 22 | def test_lower_a_key(self): 23 | metadata = {"something": "else"} 24 | event = {"KEY1": "syslog", "tags": ["atag"]} 25 | expected = {"key1": "syslog", "tags": ["atag"]} 26 | result, metadata = self.plugin.onMessage(event, metadata) 27 | # lower case the upper case key 28 | assert result == expected 29 | 30 | def test_lower_a_sub_key(self): 31 | metadata = {"something": "else"} 32 | event = {"KEY1": "syslog", "tags": ["atag"], "details": {"SUBKEY": "subvalue"}} 33 | expected = { 34 | "key1": "syslog", 35 | "tags": ["atag"], 36 | "details": {"subkey": "subvalue"}, 37 | } 38 | result, metadata = self.plugin.onMessage(event, metadata) 39 | # lower case the upper case keys wherever they are 40 | assert result == expected 41 | 42 | 43 | class TestEnsureEventID(object): 44 | def setup(self): 45 | from enrichment_plugins.ensure_eventid import message 46 | 47 | self.plugin = message() 48 | 49 | def test_ensure_event_id(self): 50 | metadata = {"something": "else"} 51 | event = {"key1": "syslog", "tags": ["atag"]} 52 | result, metadata = self.plugin.onMessage(event, metadata) 53 | assert 
"eventid" in result 54 | assert "eventid" in event 55 | assert type(uuid.UUID(event["eventid"])) == uuid.UUID 56 | 57 | 58 | class TestEventShell(object): 59 | def setup(self): 60 | from normalization_plugins.event_shell import message 61 | 62 | self.plugin = message() 63 | 64 | def test_ensure_base_event_shell(self): 65 | # given a really empty message 66 | # does it get the base shell? 67 | # does it move any non base items to 'details'? 68 | metadata = {"something": "else"} 69 | event = {"key1": "syslog", "tags": ["atag"]} 70 | result, metadata = self.plugin.onMessage(event, metadata) 71 | assert "severity" in result 72 | assert "tags" in result 73 | assert "atag" in result["tags"] 74 | assert "key1" in result["details"] 75 | 76 | def test_ensure_complex_event_shell(self): 77 | # given a complex message 78 | # does it get the base shell? 79 | # does it move any non base items to 'details'? 80 | metadata = {"something": "else"} 81 | event = { 82 | "key1": "syslog", 83 | "tags": ["atag"], 84 | "complexkey": {"subkey": "subvalue"}, 85 | } 86 | result, metadata = self.plugin.onMessage(event, metadata) 87 | assert "severity" in result 88 | assert "tags" in result 89 | assert "atag" in result["tags"] 90 | assert "key1" in result["details"] 91 | assert "complexkey" in result["details"] 92 | assert "subkey" in result["details"]["complexkey"] 93 | -------------------------------------------------------------------------------- /lambdas/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xdefendA/defenda-data-lake/79e27c6d5c540e9d9c2b743990a43ab44606fdf6/lambdas/utils/__init__.py -------------------------------------------------------------------------------- /lambdas/utils/athena.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import logging 4 | logger = logging.getLogger() 5 | 6 | def default_bucket(session): 7 | ''' 8 | Return the s3 bucket name for athena results 9 | to allow us to get the csv natively 10 | ''' 11 | account_id = session.client('sts').get_caller_identity().get('Account') 12 | return '{}-{}-query-results-{}'.format(account_id, 'aws-athena', session.region_name) 13 | 14 | def run_query(athena, query, database, s3_output): 15 | ''' 16 | Function for executing Athena queries and return the query ID 17 | ''' 18 | response = athena.start_query_execution( 19 | QueryString=query, 20 | QueryExecutionContext={ 21 | 'Database': database 22 | }, 23 | ResultConfiguration={ 24 | 'OutputLocation': 's3://{}'.format(s3_output), 25 | } 26 | ) 27 | logger.debug('Execution ID: ' + response['QueryExecutionId']) 28 | return response 29 | 30 | def dataframe_from_athena_s3(session,athena_response,bucket_name): 31 | ''' 32 | Retrieve the native athena csv results as a pandas dataframe 33 | for easy conversion and analysis 34 | ''' 35 | s3=session.resource('s3') 36 | key_name=athena_response['QueryExecutionId'] 37 | s3_response = s3.Bucket(bucket_name).Object(key= key_name + '.csv').get() 38 | 39 | return pd.read_csv(io.BytesIO(s3_response['Body'].read()), encoding='utf8') -------------------------------------------------------------------------------- /lambdas/utils/dates.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pytz 3 | import tzlocal 4 | from datetime import datetime, timedelta 5 | from dateutil.parser import parse 6 | import logging 7 | 8 | logger = logging.getLogger() 
9 | 10 | 11 | def get_date_parts(): 12 | now = datetime.utcnow() 13 | last_hour_now = now - timedelta(hours=1) 14 | 15 | now_hour = str(now.hour).rjust(2, "0") 16 | now_month = str(now.month).rjust(2, "0") 17 | now_day = str(now.day).rjust(2, "0") 18 | now_year = str(now.year) 19 | last_hour_hour = str(last_hour_now.hour).rjust(2, "0") 20 | last_hour_month = str(last_hour_now.month).rjust(2, "0") 21 | last_hour_day = str(last_hour_now.day).rjust(2, "0") 22 | last_hour_year = str(last_hour_now.year) 23 | 24 | return ( 25 | now_hour, 26 | now_month, 27 | now_day, 28 | now_year, 29 | last_hour_hour, 30 | last_hour_month, 31 | last_hour_day, 32 | last_hour_year, 33 | ) 34 | 35 | 36 | def toUTC(suspectedDate): 37 | """make a UTC date out of almost anything""" 38 | utc = pytz.UTC 39 | objDate = None 40 | # pick up any environment TZ changes 41 | tzlocal.reload_localzone() 42 | 43 | LOCAL_TIMEZONE = tzlocal.get_localzone() 44 | 45 | if type(suspectedDate) == datetime: 46 | objDate = suspectedDate 47 | elif type(suspectedDate) == float: 48 | if suspectedDate <= 0: 49 | objDate = datetime(1970, 1, 1) 50 | else: 51 | # This breaks in the year 2286 52 | EPOCH_MAGNITUDE = 9 53 | magnitude = int(math.log10(int(suspectedDate))) 54 | if magnitude > EPOCH_MAGNITUDE: 55 | suspectedDate = suspectedDate / 10 ** (magnitude - EPOCH_MAGNITUDE) 56 | objDate = datetime.fromtimestamp(suspectedDate, LOCAL_TIMEZONE) 57 | elif str(suspectedDate).isdigit(): 58 | if int(str(suspectedDate)) <= 0: 59 | objDate = datetime(1970, 1, 1) 60 | else: 61 | # epoch? but seconds/milliseconds/nanoseconds (lookin at you heka) 62 | epochDivisor = int(str(1) + "0" * (len(str(suspectedDate)) % 10)) 63 | objDate = datetime.fromtimestamp( 64 | float(suspectedDate / epochDivisor), LOCAL_TIMEZONE 65 | ) 66 | elif type(suspectedDate) is str: 67 | # try to parse float or negative number from string: 68 | objDate = None 69 | try: 70 | suspected_float = float(suspectedDate) 71 | if suspected_float <= 0: 72 | objDate = datetime(1970, 1, 1) 73 | except ValueError: 74 | pass 75 | if objDate is None: 76 | objDate = parse(suspectedDate, fuzzy=True) 77 | try: 78 | if objDate.tzinfo is None: 79 | objDate = LOCAL_TIMEZONE.localize(objDate) 80 | except AttributeError as e: 81 | raise ValueError( 82 | "Date %s which was converted to %s has no " 83 | "tzinfo attribute : %s" % (suspectedDate, objDate, e) 84 | ) 85 | 86 | objDate = utc.normalize(objDate) 87 | 88 | return objDate 89 | 90 | 91 | def utcnow(): 92 | """python is silly and returns naive datetime 93 | when datetime.utcnow() is called 94 | But if you call now with a UTC timezone 95 | it returns a non naive datetime 96 | """ 97 | return datetime.now(pytz.UTC) -------------------------------------------------------------------------------- /lambdas/utils/dict_helpers.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from copy import deepcopy 3 | 4 | 5 | def merge(dict1, dict2): 6 | """ Return a new dictionary by merging two dictionaries recursively. 
""" 7 | 8 | result = deepcopy(dict1) 9 | 10 | for key, value in dict2.items(): 11 | if isinstance(value, collections.abc.Mapping): 12 | result[key] = merge(result.get(key, {}), value) 13 | else: 14 | result[key] = deepcopy(dict2[key]) 15 | 16 | return result 17 | 18 | 19 | def find_keys(node, kv): 20 | """Returns all the keys matching kv in a given node/dict""" 21 | 22 | if isinstance(node, list): 23 | for i in node: 24 | for x in find_keys(i, kv): 25 | yield x 26 | elif isinstance(node, dict): 27 | if kv in node: 28 | yield node[kv] 29 | for j in node.values(): 30 | for x in find_keys(j, kv): 31 | yield x 32 | 33 | 34 | def enum_values(node): 35 | """Returns all the values in a given dict/node""" 36 | 37 | if isinstance(node, list): 38 | for i in node: 39 | for x in enum_values(i): 40 | yield x 41 | elif isinstance(node, dict): 42 | for j in node.values(): 43 | for x in enum_values(j): 44 | yield x 45 | else: 46 | yield node 47 | 48 | 49 | def enum_keys(node): 50 | """Returns all the keys in a given dict/node""" 51 | 52 | if isinstance(node, list): 53 | for i in node: 54 | for x in enum_keys(i): 55 | yield x 56 | elif isinstance(node, dict): 57 | for j in node.keys(): 58 | yield j 59 | for x in enum_keys(node[j]): 60 | yield x 61 | 62 | 63 | def sub_dict(somedict, somekeys, default=None): 64 | """Return just the given keys from a dict""" 65 | 66 | return dict([(k, somedict.get(k, default)) for k in somekeys]) 67 | 68 | 69 | def dict_match(query_dict, target_dict): 70 | """Determine if the target_dict contains the keys/values in the query_dict""" 71 | 72 | query_keys = list(enum_keys(query_dict)) 73 | if sub_dict(target_dict, query_keys) == query_dict: 74 | return True 75 | else: 76 | return False 77 | 78 | 79 | def dictpath(path): 80 | """split a string representing a 81 | nested dictionary path key.subkey.subkey 82 | """ 83 | for i in path.split("."): 84 | yield "{0}".format(i) 85 | 86 | 87 | def getValueByPath(input_dict, path_string): 88 | """ 89 | Gets data/value from a dictionary using a dotted accessor-string 90 | http://stackoverflow.com/a/7534478 91 | path_string can be key.subkey.subkey.subkey 92 | """ 93 | return_data = input_dict 94 | for chunk in path_string.split("."): 95 | return_data = return_data.get(chunk, {}) 96 | return return_data -------------------------------------------------------------------------------- /lambdas/utils/dotdict.py: -------------------------------------------------------------------------------- 1 | class DotDict(dict): 2 | '''dict.item notation for dict()'s''' 3 | __getattr__ = dict.__getitem__ 4 | __setattr__ = dict.__setitem__ 5 | __delattr__ = dict.__delitem__ 6 | 7 | def __init__(self, dct={}): 8 | for key, value in dct.items(): 9 | if hasattr(value, 'keys'): 10 | value = DotDict(value) 11 | self[key] = value 12 | 13 | def get(self, key, default=None): 14 | """get to allow for dot string notation 15 | :param str key: Key in dot-notation (e.g. 'foo.lol'). 16 | :return: value. None if no value was found. 17 | """ 18 | try: 19 | return self.__lookup(self, key) 20 | except KeyError: 21 | return default 22 | 23 | def __lookup(self, dct, key): 24 | """Checks dct recursive to find the value for key. 25 | Is used by get() internally. 26 | :param dict dct: input dictionary 27 | :param str key: The key we are looking for. 28 | :return: The value. 29 | :raise KeyError: If the given key is not found 30 | """ 31 | if '.' 
in key: 32 | key, node = key.split('.', 1) 33 | return self.__lookup(dct[key], node) 34 | else: 35 | return dct[key] -------------------------------------------------------------------------------- /lambdas/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import re 3 | import collections 4 | import logging 5 | import netaddr 6 | from utils.dotdict import DotDict 7 | 8 | logger = logging.getLogger() 9 | 10 | CLOUDTRAIL_FILE_NAME_REGEX = re.compile( 11 | r"\d+_cloudtrail_.+.json.gz$", re.I 12 | ) 13 | 14 | def emit_json_block(stream): 15 | ''' take a stream of io.StringIO(blob) 16 | iterate it and emit json blocks as they are found 17 | ''' 18 | open_brackets = 0 19 | block = '' 20 | while True: 21 | c = stream.read(1) 22 | if not c: 23 | break 24 | 25 | if c == '{': 26 | open_brackets += 1 27 | elif c == '}': 28 | open_brackets -= 1 29 | block += c 30 | 31 | if open_brackets == 0: 32 | yield block.strip() 33 | block = '' 34 | 35 | def short_uuid(): 36 | return str(uuid.uuid4())[0:8] 37 | 38 | def is_cloudtrail(filename): 39 | match = CLOUDTRAIL_FILE_NAME_REGEX.search(filename) 40 | return bool(match) 41 | 42 | def is_ip(ip): 43 | ''' 44 | validate an ipv4/ipv6 or cidr mask 45 | valid_ipv4/6 won't recognize a cidr mask 46 | ''' 47 | try: 48 | # by default netaddr will validate single digits like '0' as 0.0.0.0/32 49 | # lets be a bit more precise and support cidr masks 50 | # by checking for format chars (. or :) 51 | # and using the IPNetwork constructor 52 | if ('.' in ip) or (':' in ip): 53 | netaddr.IPNetwork(ip) 54 | return True 55 | else: 56 | return False 57 | except Exception: 58 | return False 59 | 60 | def isIPv4(ip): 61 | try: 62 | return netaddr.valid_ipv4(ip,flags=1) 63 | except: 64 | return False 65 | 66 | def isIPv6(ip): 67 | try: 68 | return netaddr.valid_ipv6(ip,flags=1) 69 | except: 70 | return False 71 | 72 | def generate_metadata(context): 73 | metadata = { 74 | "lambda_details": { 75 | "function_version": context.function_version, 76 | "function_arn": context.invoked_function_arn, 77 | "function_name": context.function_name.lower(), 78 | "memory_size": context.memory_limit_in_mb, 79 | }, 80 | } 81 | 82 | return DotDict(metadata) 83 | 84 | def chunks(l, n): 85 | """Yield successive n-sized chunks from l.""" 86 | for i in range(0, len(l), n): 87 | yield l[i:i + n] 88 | 89 | def first_matching_index_value(iterable, condition = lambda x: True): 90 | """ 91 | Returns the first index,value tuple in the list that 92 | satisfies the `condition`. 93 | 94 | If the condition is not given, returns the first of the iterable. 
95 | condition is passed as: 96 | condition = lambda i: 97 | >>> first_matching_item( (1,2,3), condition=lambda x: x % 2 == 0) 98 | (1, 2) 99 | """ 100 | try: 101 | return next((index,value) for index,value in enumerate(iterable) if condition(value)) 102 | 103 | except StopIteration: 104 | return (None,None) -------------------------------------------------------------------------------- /lambdas/utils/plugins.py: -------------------------------------------------------------------------------- 1 | import pynsive 2 | import os 3 | from operator import itemgetter 4 | import json 5 | import logging 6 | from utils.dict_helpers import enum_keys 7 | 8 | logger = logging.getLogger() 9 | 10 | 11 | def event_criteria_values(an_event): 12 | """set up the list of event values to use when comparing plugins 13 | to this event to see if they should fire 14 | target values are the .keys() of the dict and the values of the 'category' and 'tags' fields 15 | where category is a key/value and tags is a list of values. 16 | """ 17 | criteria_values = [e for e in enum_keys(an_event)] 18 | if ( 19 | "tags" in criteria_values 20 | and isinstance(an_event.get("tags"), list) 21 | and len(an_event.get("tags", "")) > 0 22 | ): 23 | for tag in an_event["tags"]: 24 | criteria_values.append(tag) 25 | if "category" in criteria_values and isinstance(an_event.get("category"), str): 26 | criteria_values.append(an_event["category"]) 27 | 28 | return criteria_values 29 | 30 | 31 | def register_plugins(directory_name): 32 | """ 33 | take a directory name, scan it for python modules 34 | and register them (module,registration criteria, priority) 35 | """ 36 | pluginList = list() # tuple of module,registration dict,priority 37 | if os.path.exists(directory_name): 38 | modules = pynsive.list_modules(directory_name) 39 | for mname in modules: 40 | module = pynsive.import_module(mname) 41 | if not module: 42 | raise ImportError("Unable to load module {}".format(mname)) 43 | else: 44 | if "message" in dir(module): 45 | mclass = module.message() 46 | mreg = mclass.registration 47 | if "priority" in dir(mclass): 48 | mpriority = mclass.priority 49 | else: 50 | mpriority = 100 51 | if isinstance(mreg, list): 52 | logger.info( 53 | "[*] plugin {0} registered to receive messages with {1}".format( 54 | mname, mreg 55 | ) 56 | ) 57 | pluginList.append((mclass, mreg, mpriority)) 58 | return pluginList 59 | 60 | 61 | def send_event_to_plugins(anevent, metadata, pluginList): 62 | """compare the event to the plugin registrations. 
63 | plugins register with a list of keys or values 64 | or values they want to match on 65 | this function compares that registration list 66 | to the current event and sends the event to plugins 67 | in order 68 | """ 69 | if not isinstance(anevent, dict): 70 | raise TypeError("event is type {0}, should be a dict".format(type(anevent))) 71 | 72 | # expecting tuple of module, criteria, priority in pluginList 73 | # sort the plugin list by priority 74 | executed_plugins = [] 75 | for plugin in sorted(pluginList, key=itemgetter(2), reverse=False): 76 | # assume we don't run this event through the plugin 77 | send = False 78 | if isinstance(plugin[1], list): 79 | try: 80 | if "*" in plugin[1]: 81 | # plugin wants to see all events, early exit the check 82 | send = True 83 | else: 84 | # intersect the plugin field names 85 | # with the fields in the event 86 | # if they match, the plugin wants to see this event 87 | plugin_matching_keys = set([item.lower() for item in plugin[1]]) 88 | event_tokens = [e for e in event_criteria_values(anevent)] 89 | if plugin_matching_keys.intersection(event_tokens): 90 | send = True 91 | except TypeError: 92 | logger.error( 93 | "TypeError on set intersection for dict {0}".format(anevent) 94 | ) 95 | return (anevent, metadata) 96 | if send: 97 | (anevent, metadata) = plugin[0].onMessage(anevent, metadata) 98 | if anevent is None: 99 | # plug-in is signalling to drop this message 100 | # early exit 101 | return (anevent, metadata) 102 | plugin_name = plugin[0].__module__.replace("plugins.", "") 103 | executed_plugins.append(plugin_name) 104 | # Tag all events with what plugins ran on it 105 | if "plugins" in anevent: 106 | anevent["plugins"] = anevent["plugins"] + executed_plugins 107 | else: 108 | anevent["plugins"] = executed_plugins 109 | 110 | return (anevent, metadata) 111 | -------------------------------------------------------------------------------- /main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">=0.12.25" 3 | required_providers { 4 | aws = ">= 2.25.0" 5 | } 6 | } 7 | 8 | provider "aws" { 9 | region = "us-west-2" 10 | profile = "default" 11 | } 12 | data "aws_caller_identity" "current" {} 13 | data "aws_region" "current" {} 14 | 15 | output "account_id" { 16 | value = "${data.aws_caller_identity.current.account_id}" 17 | } 18 | 19 | output "datalake_arn" { 20 | value = aws_athena_database.defenda_datalake.id 21 | } 22 | 23 | resource "aws_s3_bucket" "data_lake_input_bucket" { 24 | bucket = "${data.aws_caller_identity.current.account_id}-defenda-data-lake-input-bucket" 25 | acl = "private" 26 | 27 | versioning { 28 | enabled = false 29 | } 30 | 31 | lifecycle_rule { 32 | enabled = true 33 | 34 | transition { 35 | days = 30 36 | storage_class = "STANDARD_IA" 37 | } 38 | 39 | expiration { 40 | days = 90 41 | } 42 | } 43 | } 44 | 45 | resource "aws_s3_bucket_public_access_block" "data_lake_input_bucket" { 46 | bucket = aws_s3_bucket.data_lake_input_bucket.id 47 | 48 | block_public_acls = true 49 | block_public_policy = true 50 | ignore_public_acls = true 51 | restrict_public_buckets = true 52 | } 53 | 54 | resource "aws_s3_bucket" "data_lake_output_bucket" { 55 | bucket = "${data.aws_caller_identity.current.account_id}-defenda-data-lake-output-bucket" 56 | acl = "private" 57 | 58 | versioning { 59 | enabled = false 60 | } 61 | 62 | lifecycle_rule { 63 | enabled = true 64 | 65 | transition { 66 | days = 90 67 | storage_class = "STANDARD_IA" 68 | } 69 | 70 | expiration { 71 
| days = 360 72 | } 73 | } 74 | } 75 | 76 | resource "aws_s3_bucket_public_access_block" "data_lake_output_bucket" { 77 | bucket = aws_s3_bucket.data_lake_output_bucket.id 78 | 79 | block_public_acls = true 80 | block_public_policy = true 81 | ignore_public_acls = true 82 | restrict_public_buckets = true 83 | } 84 | 85 | resource "aws_iam_role" "data-lake-firehose-role" { 86 | name = "defenda-data-lake-firehose-role" 87 | 88 | assume_role_policy = <