├── .github ├── CODEOWNERS ├── dependabot.yml ├── wordlist.txt └── workflows │ ├── bandit.yml │ ├── codeql-analysis.yml │ ├── linting.yml │ ├── pylint.yml │ └── spelling.yml ├── .gitignore ├── .pylintrc ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── Falcon Data Replicator Sample ├── data_replicator_config.py └── data_replicator_sample_consumer.py ├── LICENSE ├── README.md ├── SECURITY.md ├── falcon_data_replicator.ini ├── falcon_data_replicator.py ├── fdr └── fdrconnector.py ├── ocsf ├── __init__.py └── ocsf.py ├── requirements.txt └── standalone └── falcon_data_replicator.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. Unless a later match takes precedence, 3 | # @global-owner1 and @global-owner2 will be requested for 4 | # review when someone opens a pull request. 5 | * @jshcodes 6 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: weekly 12 | open-pull-requests-limit: 10 13 | - package-ecosystem: github-actions 14 | directory: "/" 15 | schedule: 16 | interval: monthly 17 | open-pull-requests-limit: 10 18 | -------------------------------------------------------------------------------- /.github/wordlist.txt: -------------------------------------------------------------------------------- 1 | CrowdStrike 2 | html 3 | http 4 | https 5 | www 6 | faq 7 | SQS 8 | ini 9 | py 10 | autogenerated 11 | boto 12 | botocore 13 | dateutil 14 | jmespath 15 | urllib 16 | config 17 | codebase 18 | socio 19 | sexualized 20 | CodeQL 21 | Snyk 22 | fastparquet 23 | filelock 24 | json 25 | numpy 26 | pyyaml 27 | txt 28 | OCSF 29 | -------------------------------------------------------------------------------- /.github/workflows/bandit.yml: -------------------------------------------------------------------------------- 1 | name: Bandit 2 | on: 3 | push: 4 | paths: 5 | - '**.py' 6 | branches: 7 | - main 8 | - 'ver_*' 9 | pull_request: 10 | paths: 11 | - '**.py' 12 | branches: 13 | - main 14 | - 'ver_*' 15 | 16 | jobs: 17 | analyze: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.9'] 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install bandit 32 | pip install -r requirements.txt 33 | - name: Analyze stand-alone with bandit 34 | run: | 35 | bandit -r . 36 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 
3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '40 6 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v3 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v3 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v3 68 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Flake8 2 | on: 3 | push: 4 | paths: 5 | - '**.py' 6 | branches: 7 | - main 8 | - 'ver_*' 9 | pull_request: 10 | paths: 11 | - '**.py' 12 | branches: 13 | - main 14 | - 'ver_*' 15 | 16 | jobs: 17 | analyze: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.9'] 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install flake8 32 | pip install -r requirements.txt 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 standalone --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --max-line-length=127 --statistics 39 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Python Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - '**.py' 9 | pull_request: 10 | branches: 11 | - main 12 | paths: 13 | - '**.py' 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | python-version: ['3.9'] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install pylint 31 | - name: Install package dependencies 32 | run: | 33 | python -m pip install -r requirements.txt 34 | - name: Lint main with pylint 35 | run: | 36 | pylint *.py 37 | - name: Lint ocsf with pylint 38 | run: | 39 | pylint ocsf 40 | - name: Lint fdr with pylint 41 | run: | 42 | pylint fdr 43 | -------------------------------------------------------------------------------- /.github/workflows/spelling.yml: -------------------------------------------------------------------------------- 1 | name: Spell Check 2 | on: 3 | pull_request: 4 | paths: 5 | - '**.md' 6 | push: 7 | paths: 8 | - '**.md' 9 | jobs: 10 | spelling: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Check Spelling 15 | uses: SFLScientific/spellcheck-github-actions@master 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | ext/ 132 | downloaded/ 133 | ocsf/mappings 134 | 135 | .idea 136 | .DS_Store 137 | 138 | # files for local testing 139 | utils/ 140 | falcon_data_replicator_local.ini -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MAIN] 2 | 3 | # Analyse import fallback blocks. This can be used to support both Python 2 and 4 | # 3 compatible code, which means that the block might have code that exists 5 | # only in one or another interpreter, leading to false positives when analysed. 6 | analyse-fallback-blocks=no 7 | 8 | # Load and enable all available extensions. Use --list-extensions to see a list 9 | # all available extensions. 10 | #enable-all-extensions= 11 | 12 | # In error mode, checkers without error messages are disabled and for others, 13 | # only the ERROR messages are displayed, and no reports are done by default. 14 | #errors-only= 15 | 16 | # Always return a 0 (non-error) status code, even if lint errors are found. 17 | # This is primarily useful in continuous integration scripts. 18 | #exit-zero= 19 | 20 | # A comma-separated list of package or module names from where C extensions may 21 | # be loaded. Extensions are loading into the active Python interpreter and may 22 | # run arbitrary code. 23 | extension-pkg-allow-list= 24 | 25 | # A comma-separated list of package or module names from where C extensions may 26 | # be loaded. Extensions are loading into the active Python interpreter and may 27 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 28 | # for backward compatibility.) 29 | extension-pkg-whitelist= 30 | 31 | # Return non-zero exit code if any of these messages/categories are detected, 32 | # even if score is above --fail-under value. Syntax same as enable. Messages 33 | # specified are enabled, while categories only check already-enabled messages. 34 | fail-on= 35 | 36 | # Specify a score threshold to be exceeded before program exits with error. 37 | fail-under=10 38 | 39 | # Interpret the stdin as a python script, whose filename needs to be passed as 40 | # the module_or_package argument. 41 | #from-stdin= 42 | 43 | # Files or directories to be skipped. They should be base names, not paths. 44 | ignore=CVS 45 | 46 | # Add files or directories matching the regex patterns to the ignore-list. The 47 | # regex matches against paths and can be in Posix or Windows format. 48 | ignore-paths= 49 | 50 | # Files or directories matching the regex patterns are skipped. The regex 51 | # matches against base names, not paths. 
The default value ignores Emacs file 52 | # locks 53 | ignore-patterns=^\.# 54 | 55 | # List of module names for which member attributes should not be checked 56 | # (useful for modules/projects where namespaces are manipulated during runtime 57 | # and thus existing member attributes cannot be deduced by static analysis). It 58 | # supports qualified module names, as well as Unix pattern matching. 59 | ignored-modules= 60 | 61 | # Python code to execute, usually for sys.path manipulation such as 62 | # pygtk.require(). 63 | #init-hook= 64 | 65 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 66 | # number of processors available to use. 67 | jobs=1 68 | 69 | # Control the amount of potential inferred values when inferring a single 70 | # object. This can help the performance when dealing with large functions or 71 | # complex, nested conditions. 72 | limit-inference-results=100 73 | 74 | # List of plugins (as comma separated values of python module names) to load, 75 | # usually to register additional checkers. 76 | load-plugins= 77 | 78 | # Pickle collected data for later comparisons. 79 | persistent=yes 80 | 81 | # Minimum Python version to use for version dependent checks. Will default to 82 | # the version used to run pylint. 83 | py-version=3.9 84 | 85 | # Discover python modules and packages in the file system subtree. 86 | recursive=no 87 | 88 | # When enabled, pylint would attempt to guess common misconfiguration and emit 89 | # user-friendly hints instead of false-positive error messages. 90 | suggestion-mode=yes 91 | 92 | # Allow loading of arbitrary C extensions. Extensions are imported into the 93 | # active Python interpreter and may run arbitrary code. 94 | unsafe-load-any-extension=no 95 | 96 | # In verbose mode, extra non-checker-related info will be displayed. 97 | #verbose= 98 | 99 | 100 | [REPORTS] 101 | 102 | # Python expression which should return a score less than or equal to 10. You 103 | # have access to the variables 'fatal', 'error', 'warning', 'refactor', 104 | # 'convention', and 'info' which contain the number of messages in each 105 | # category, as well as 'statement' which is the total number of statements 106 | # analyzed. This score is used by the global evaluation report (RP0004). 107 | evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) 108 | 109 | # Template used to display messages. This is a python new-style format string 110 | # used to format the message information. See doc for all details. 111 | msg-template= 112 | 113 | # Set the output format. Available formats are text, parseable, colorized, json 114 | # and msvs (visual studio). You can also give a reporter class, e.g. 115 | # mypackage.mymodule.MyReporterClass. 116 | #output-format= 117 | 118 | # Tells whether to display a full report or only the messages. 119 | reports=no 120 | 121 | # Activate the evaluation score. 122 | score=yes 123 | 124 | 125 | [MESSAGES CONTROL] 126 | 127 | # Only show warnings with the listed confidence levels. Leave empty to show 128 | # all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, 129 | # UNDEFINED. 130 | confidence=HIGH, 131 | CONTROL_FLOW, 132 | INFERENCE, 133 | INFERENCE_FAILURE, 134 | UNDEFINED 135 | 136 | # Disable the message, report, category or checker with the given id(s). 
You 137 | # can either give multiple identifiers separated by comma (,) or put this 138 | # option multiple times (only on the command line, not in the configuration 139 | # file where it should appear only once). You can also use "--disable=all" to 140 | # disable everything first and then re-enable specific checks. For example, if 141 | # you want to run only the similarities checker, you can use "--disable=all 142 | # --enable=similarities". If you want to run only the classes checker, but have 143 | # no Warning level messages displayed, use "--disable=all --enable=classes 144 | # --disable=W". 145 | disable=raw-checker-failed, 146 | bad-inline-option, 147 | locally-disabled, 148 | file-ignored, 149 | suppressed-message, 150 | useless-suppression, 151 | deprecated-pragma, 152 | use-symbolic-message-instead 153 | 154 | # Enable the message, report, category or checker with the given id(s). You can 155 | # either give multiple identifier separated by comma (,) or put this option 156 | # multiple time (only on the command line, not in the configuration file where 157 | # it should appear only once). See also the "--disable" option for examples. 158 | enable=c-extension-no-member 159 | 160 | 161 | [LOGGING] 162 | 163 | # The type of string formatting that logging methods do. `old` means using % 164 | # formatting, `new` is for `{}` formatting. 165 | logging-format-style=old 166 | 167 | # Logging modules to check that the string format arguments are in logging 168 | # function parameter format. 169 | logging-modules=logging 170 | 171 | 172 | [SPELLING] 173 | 174 | # Limits count of emitted suggestions for spelling mistakes. 175 | max-spelling-suggestions=4 176 | 177 | # Spelling dictionary name. Available dictionaries: none. To make it work, 178 | # install the 'python-enchant' package. 179 | spelling-dict= 180 | 181 | # List of comma separated words that should be considered directives if they 182 | # appear at the beginning of a comment and should not be checked. 183 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: 184 | 185 | # List of comma separated words that should not be checked. 186 | spelling-ignore-words= 187 | 188 | # A path to a file that contains the private dictionary; one word per line. 189 | spelling-private-dict-file= 190 | 191 | # Tells whether to store unknown words to the private dictionary (see the 192 | # --spelling-private-dict-file option) instead of raising a message. 193 | spelling-store-unknown-words=no 194 | 195 | 196 | [MISCELLANEOUS] 197 | 198 | # List of note tags to take in consideration, separated by a comma. 199 | notes=FIXME, 200 | XXX, 201 | TODO 202 | 203 | # Regular expression of note tags to take in consideration. 204 | notes-rgx= 205 | 206 | 207 | [TYPECHECK] 208 | 209 | # List of decorators that produce context managers, such as 210 | # contextlib.contextmanager. Add to this list to register other decorators that 211 | # produce valid context managers. 212 | contextmanager-decorators=contextlib.contextmanager 213 | 214 | # List of members which are set dynamically and missed by pylint inference 215 | # system, and so shouldn't trigger E1101 when accessed. Python regular 216 | # expressions are accepted. 217 | generated-members= 218 | 219 | # Tells whether to warn about missing members when the owner of the attribute 220 | # is inferred to be None. 
221 | ignore-none=yes 222 | 223 | # This flag controls whether pylint should warn about no-member and similar 224 | # checks whenever an opaque object is returned when inferring. The inference 225 | # can return multiple potential results while evaluating a Python object, but 226 | # some branches might not be evaluated, which results in partial inference. In 227 | # that case, it might be useful to still emit no-member and other checks for 228 | # the rest of the inferred objects. 229 | ignore-on-opaque-inference=yes 230 | 231 | # List of symbolic message names to ignore for Mixin members. 232 | ignored-checks-for-mixins=no-member, 233 | not-async-context-manager, 234 | not-context-manager, 235 | attribute-defined-outside-init 236 | 237 | # List of class names for which member attributes should not be checked (useful 238 | # for classes with dynamically set attributes). This supports the use of 239 | # qualified names. 240 | ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace 241 | 242 | # Show a hint with possible names when a member name was not found. The aspect 243 | # of finding the hint is based on edit distance. 244 | missing-member-hint=yes 245 | 246 | # The minimum edit distance a name should have in order to be considered a 247 | # similar match for a missing member name. 248 | missing-member-hint-distance=1 249 | 250 | # The total number of similar names that should be taken in consideration when 251 | # showing a hint for a missing member. 252 | missing-member-max-choices=1 253 | 254 | # Regex pattern to define which classes are considered mixins. 255 | mixin-class-rgx=.*[Mm]ixin 256 | 257 | # List of decorators that change the signature of a decorated function. 258 | signature-mutators= 259 | 260 | 261 | [CLASSES] 262 | 263 | # Warn about protected attribute access inside special methods 264 | check-protected-access-in-special-methods=no 265 | 266 | # List of method names used to declare (i.e. assign) instance attributes. 267 | defining-attr-methods=__init__, 268 | __new__, 269 | setUp, 270 | __post_init__ 271 | 272 | # List of member names, which should be excluded from the protected access 273 | # warning. 274 | exclude-protected=_asdict, 275 | _fields, 276 | _replace, 277 | _source, 278 | _make 279 | 280 | # List of valid names for the first argument in a class method. 281 | valid-classmethod-first-arg=cls 282 | 283 | # List of valid names for the first argument in a metaclass class method. 284 | valid-metaclass-classmethod-first-arg=cls 285 | 286 | 287 | [VARIABLES] 288 | 289 | # List of additional names supposed to be defined in builtins. Remember that 290 | # you should avoid defining new builtins when possible. 291 | additional-builtins= 292 | 293 | # Tells whether unused global variables should be treated as a violation. 294 | allow-global-unused-variables=yes 295 | 296 | # List of names allowed to shadow builtins 297 | allowed-redefined-builtins= 298 | 299 | # List of strings which can identify a callback function by name. A callback 300 | # name must start or end with one of those strings. 301 | callbacks=cb_, 302 | _cb 303 | 304 | # A regular expression matching the name of dummy variables (i.e. expected to 305 | # not be used). 306 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 307 | 308 | # Argument names that match this expression will be ignored. Default to name 309 | # with leading underscore. 
310 | ignored-argument-names=_.*|^ignored_|^unused_ 311 | 312 | # Tells whether we should check for unused import in __init__ files. 313 | init-import=no 314 | 315 | # List of qualified module names which can have objects that can redefine 316 | # builtins. 317 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 318 | 319 | 320 | [FORMAT] 321 | 322 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 323 | expected-line-ending-format= 324 | 325 | # Regexp for a line that is allowed to be longer than the limit. 326 | ignore-long-lines=^\s*(# )??$ 327 | 328 | # Number of spaces of indent required inside a hanging or continued line. 329 | indent-after-paren=4 330 | 331 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 332 | # tab). 333 | indent-string=' ' 334 | 335 | # Maximum number of characters on a single line. 336 | max-line-length=127 337 | 338 | # Maximum number of lines in a module. 339 | max-module-lines=1000 340 | 341 | # Allow the body of a class to be on the same line as the declaration if body 342 | # contains single statement. 343 | single-line-class-stmt=no 344 | 345 | # Allow the body of an if to be on the same line as the test if there is no 346 | # else. 347 | single-line-if-stmt=no 348 | 349 | 350 | [IMPORTS] 351 | 352 | # List of modules that can be imported at any level, not just the top level 353 | # one. 354 | allow-any-import-level= 355 | 356 | # Allow wildcard imports from modules that define __all__. 357 | allow-wildcard-with-all=no 358 | 359 | # Deprecated modules which should not be used, separated by a comma. 360 | deprecated-modules= 361 | 362 | # Output a graph (.gv or any supported image format) of external dependencies 363 | # to the given file (report RP0402 must not be disabled). 364 | ext-import-graph= 365 | 366 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 367 | # external) dependencies to the given file (report RP0402 must not be 368 | # disabled). 369 | import-graph= 370 | 371 | # Output a graph (.gv or any supported image format) of internal dependencies 372 | # to the given file (report RP0402 must not be disabled). 373 | int-import-graph= 374 | 375 | # Force import order to recognize a module as part of the standard 376 | # compatibility libraries. 377 | known-standard-library= 378 | 379 | # Force import order to recognize a module as part of a third party library. 380 | known-third-party=enchant 381 | 382 | # Couples of modules and preferred modules, separated by a comma. 383 | preferred-modules= 384 | 385 | 386 | [EXCEPTIONS] 387 | 388 | # Exceptions that will emit a warning when caught. 389 | overgeneral-exceptions=BaseException, 390 | Exception 391 | 392 | 393 | [REFACTORING] 394 | 395 | # Maximum number of nested blocks for function / method body 396 | max-nested-blocks=6 397 | 398 | # Complete name of functions that never returns. When checking for 399 | # inconsistent-return-statements if a never returning function is called then 400 | # it will be considered as an explicit return statement and no message will be 401 | # printed. 
402 | never-returning-functions=sys.exit,argparse.parse_error 403 | 404 | 405 | [SIMILARITIES] 406 | 407 | # Comments are removed from the similarity computation 408 | ignore-comments=yes 409 | 410 | # Docstrings are removed from the similarity computation 411 | ignore-docstrings=yes 412 | 413 | # Imports are removed from the similarity computation 414 | ignore-imports=yes 415 | 416 | # Signatures are removed from the similarity computation 417 | ignore-signatures=yes 418 | 419 | # Minimum lines number of a similarity. 420 | min-similarity-lines=4 421 | 422 | 423 | [DESIGN] 424 | 425 | # List of regular expressions of class ancestor names to ignore when counting 426 | # public methods (see R0903) 427 | exclude-too-few-public-methods= 428 | 429 | # List of qualified class names to ignore when counting class parents (see 430 | # R0901) 431 | ignored-parents= 432 | 433 | # Maximum number of arguments for function / method. 434 | max-args=5 435 | 436 | # Maximum number of attributes for a class (see R0902). 437 | max-attributes=7 438 | 439 | # Maximum number of boolean expressions in an if statement (see R0916). 440 | max-bool-expr=5 441 | 442 | # Maximum number of branch for function / method body. 443 | max-branches=15 444 | 445 | # Maximum number of locals for function / method body. 446 | max-locals=35 447 | 448 | # Maximum number of parents for a class (see R0901). 449 | max-parents=7 450 | 451 | # Maximum number of public methods for a class (see R0904). 452 | max-public-methods=20 453 | 454 | # Maximum number of return / yield for function / method body. 455 | max-returns=6 456 | 457 | # Maximum number of statements in function / method body. 458 | max-statements=50 459 | 460 | # Minimum number of public methods for a class (see R0903). 461 | min-public-methods=2 462 | 463 | 464 | [STRING] 465 | 466 | # This flag controls whether inconsistent-quotes generates a warning when the 467 | # character used as a quote delimiter is used inconsistently within a module. 468 | check-quote-consistency=no 469 | 470 | # This flag controls whether the implicit-str-concat should generate a warning 471 | # on implicit string concatenation in sequences defined over several lines. 472 | check-str-concat-over-line-jumps=no 473 | 474 | 475 | [BASIC] 476 | 477 | # Naming style matching correct argument names. 478 | argument-naming-style=snake_case 479 | 480 | # Regular expression matching correct argument names. Overrides argument- 481 | # naming-style. If left empty, argument names will be checked with the set 482 | # naming style. 483 | #argument-rgx= 484 | 485 | # Naming style matching correct attribute names. 486 | attr-naming-style=snake_case 487 | 488 | # Regular expression matching correct attribute names. Overrides attr-naming- 489 | # style. If left empty, attribute names will be checked with the set naming 490 | # style. 491 | #attr-rgx= 492 | 493 | # Bad variable names which should always be refused, separated by a comma. 494 | bad-names=foo, 495 | bar, 496 | baz, 497 | toto, 498 | tutu, 499 | tata 500 | 501 | # Bad variable names regexes, separated by a comma. If names match any regex, 502 | # they will always be refused 503 | bad-names-rgxs= 504 | 505 | # Naming style matching correct class attribute names. 506 | class-attribute-naming-style=any 507 | 508 | # Regular expression matching correct class attribute names. Overrides class- 509 | # attribute-naming-style. If left empty, class attribute names will be checked 510 | # with the set naming style. 
511 | #class-attribute-rgx= 512 | 513 | # Naming style matching correct class constant names. 514 | class-const-naming-style=UPPER_CASE 515 | 516 | # Regular expression matching correct class constant names. Overrides class- 517 | # const-naming-style. If left empty, class constant names will be checked with 518 | # the set naming style. 519 | #class-const-rgx= 520 | 521 | # Naming style matching correct class names. 522 | class-naming-style=PascalCase 523 | 524 | # Regular expression matching correct class names. Overrides class-naming- 525 | # style. If left empty, class names will be checked with the set naming style. 526 | #class-rgx= 527 | 528 | # Naming style matching correct constant names. 529 | const-naming-style=UPPER_CASE 530 | 531 | # Regular expression matching correct constant names. Overrides const-naming- 532 | # style. If left empty, constant names will be checked with the set naming 533 | # style. 534 | #const-rgx= 535 | 536 | # Minimum line length for functions/classes that require docstrings, shorter 537 | # ones are exempt. 538 | docstring-min-length=-1 539 | 540 | # Naming style matching correct function names. 541 | function-naming-style=snake_case 542 | 543 | # Regular expression matching correct function names. Overrides function- 544 | # naming-style. If left empty, function names will be checked with the set 545 | # naming style. 546 | #function-rgx= 547 | 548 | # Good variable names which should always be accepted, separated by a comma. 549 | good-names=i, 550 | j, 551 | k, 552 | ex, 553 | Run, 554 | _ 555 | 556 | # Good variable names regexes, separated by a comma. If names match any regex, 557 | # they will always be accepted 558 | good-names-rgxs= 559 | 560 | # Include a hint for the correct naming format with invalid-name. 561 | include-naming-hint=no 562 | 563 | # Naming style matching correct inline iteration names. 564 | inlinevar-naming-style=any 565 | 566 | # Regular expression matching correct inline iteration names. Overrides 567 | # inlinevar-naming-style. If left empty, inline iteration names will be checked 568 | # with the set naming style. 569 | #inlinevar-rgx= 570 | 571 | # Naming style matching correct method names. 572 | method-naming-style=snake_case 573 | 574 | # Regular expression matching correct method names. Overrides method-naming- 575 | # style. If left empty, method names will be checked with the set naming style. 576 | #method-rgx= 577 | 578 | # Naming style matching correct module names. 579 | module-naming-style=snake_case 580 | 581 | # Regular expression matching correct module names. Overrides module-naming- 582 | # style. If left empty, module names will be checked with the set naming style. 583 | #module-rgx= 584 | 585 | # Colon-delimited sets of names that determine each other's naming style when 586 | # the name regexes allow several styles. 587 | name-group= 588 | 589 | # Regular expression which should only match function or class names that do 590 | # not require a docstring. 591 | no-docstring-rgx=^_ 592 | 593 | # List of decorators that produce properties, such as abc.abstractproperty. Add 594 | # to this list to register other decorators that produce valid properties. 595 | # These decorators are taken in consideration only for invalid-name. 596 | property-classes=abc.abstractproperty 597 | 598 | # Regular expression matching correct type variable names. If left empty, type 599 | # variable names will be checked with the set naming style. 600 | #typevar-rgx= 601 | 602 | # Naming style matching correct variable names. 
603 | variable-naming-style=snake_case 604 | 605 | # Regular expression matching correct variable names. Overrides variable- 606 | # naming-style. If left empty, variable names will be checked with the set 607 | # naming style. 608 | #variable-rgx= 609 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Falcon Data Replicator Community Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CrowdStrike. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 
68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/python:3-slim-buster 2 | 3 | RUN : \ 4 | && apt-get update \ 5 | && DEBIAN_FRONTEND=noninteractive apt-get upgrade --no-install-recommends --assume-yes \ 6 | && rm -rf /var/lib/apt/lists/* 7 | 8 | RUN useradd --create-home --home-dir /fdr fdruser 9 | USER fdruser 10 | WORKDIR /fdr 11 | 12 | COPY requirements.txt . 13 | RUN pip install -r ./requirements.txt 14 | 15 | COPY . . 
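# Run the replicator as a module from the /fdr working directory. The repository's
# falcon_data_replicator.ini is copied in above; mount your own configuration over it
# when running the container. Example invocation (the image tag and host path below are
# illustrative only; adjust them for your environment):
#   docker build -t fdr-connector .
#   docker run -v /path/to/falcon_data_replicator.ini:/fdr/falcon_data_replicator.ini fdr-connector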
16 | 
17 | ENTRYPOINT [ "python3", "-m", "falcon_data_replicator" ]
--------------------------------------------------------------------------------
/Falcon Data Replicator Sample/data_replicator_config.py:
--------------------------------------------------------------------------------
1 | # AWS security credentials
2 | AWS_KEY = ""
3 | 
4 | AWS_SECRET = ""
5 | 
6 | # URL of the SQS queue.
7 | QUEUE_URL = ""
8 | 
9 | # Root directory to download files from S3 to.
10 | OUTPUT_PATH = ""
11 | 
12 | # Time in seconds before a message is added back to the SQS queue if not deleted. Ensure this is large enough for you
13 | # to safely finish processing any downloaded files.
14 | VISIBILITY_TIMEOUT = 300
15 | 
16 | # Name of the AWS region.
17 | REGION_NAME = ""
--------------------------------------------------------------------------------
/Falcon Data Replicator Sample/data_replicator_sample_consumer.py:
--------------------------------------------------------------------------------
1 | import data_replicator_config
2 | import json
3 | import os
4 | import time
5 | 
6 | try:
7 |     import boto3
8 | except ImportError as err:
9 |     print(err)
10 |     raise SystemExit(
11 |         'boto3 is required to run data_replicator_sample_consumer. Please "pip install boto3"!'
12 |     ) from err
13 | 
14 | ###################################################################################################
15 | # NOTE: See Falcon Data Replicator instructions for details on how to use this sample consumer. #
16 | ###################################################################################################
17 | 
18 | AWS_KEY = data_replicator_config.AWS_KEY
19 | AWS_SECRET = data_replicator_config.AWS_SECRET
20 | QUEUE_URL = data_replicator_config.QUEUE_URL
21 | OUTPUT_PATH = os.path.realpath(data_replicator_config.OUTPUT_PATH)
22 | VISIBILITY_TIMEOUT = data_replicator_config.VISIBILITY_TIMEOUT
23 | REGION_NAME = data_replicator_config.REGION_NAME
24 | 
25 | sqs = boto3.resource(
26 |     "sqs",
27 |     region_name=REGION_NAME,
28 |     aws_access_key_id=AWS_KEY,
29 |     aws_secret_access_key=AWS_SECRET,
30 | )
31 | s3 = boto3.client(
32 |     "s3",
33 |     region_name=REGION_NAME,
34 |     aws_access_key_id=AWS_KEY,
35 |     aws_secret_access_key=AWS_SECRET,
36 | )
37 | queue = sqs.Queue(url=QUEUE_URL)
38 | 
39 | 
40 | def handle_file(path):
41 |     """PUT CUSTOM LOGIC FOR HANDLING FILES HERE"""
42 |     print("Downloaded file to path %s" % path)
43 | 
44 | 
45 | def download_message_files(msg):
46 |     """Download the files from S3 referenced in msg and place them in OUTPUT_PATH.
47 | 
48 |     download_message_files iterates through every file listed in msg['files'],
49 |     moves it to a local path of the form "{OUTPUT_PATH}/{s3_path}",
50 |     and then calls handle_file(path).
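    Both the message-level path prefix and each individual file path are
    resolved with os.path.realpath and verified against OUTPUT_PATH before
    anything is written, so entries that would fall outside the output
    directory are skipped rather than downloaded.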
51 | """ 52 | 53 | # Construct output path for this message's files 54 | msg_output_path = os.path.realpath(os.path.join(OUTPUT_PATH, msg["pathPrefix"])) 55 | # Only write files to the specified output_path 56 | if os.path.commonpath([OUTPUT_PATH, msg_output_path]) != OUTPUT_PATH: 57 | print( 58 | f"Skipping {msg_output_path} to prevent writes outside of output path: {OUTPUT_PATH}" 59 | ) 60 | return 61 | 62 | # Ensure directory exists at output path 63 | if not os.path.exists(msg_output_path): 64 | os.makedirs(msg_output_path) 65 | 66 | for s3_file in msg["files"]: 67 | try: 68 | s3_path = s3_file["path"] 69 | local_path = os.path.realpath(os.path.join(OUTPUT_PATH, s3_path)) 70 | # only write files to the specified output path 71 | if os.path.commonpath([OUTPUT_PATH, local_path]) != OUTPUT_PATH: 72 | print( 73 | f"Skipping {local_path} to prevent writes outside of output path: {OUTPUT_PATH}" 74 | ) 75 | continue 76 | 77 | # Handle FDR platform and time partitioned folders 78 | if not os.path.exists(os.path.dirname(local_path)): 79 | os.makedirs(os.path.dirname(local_path)) 80 | 81 | # Copy one file from s3 to local 82 | s3.download_file(msg["bucket"], s3_path, local_path) 83 | # Do something with file 84 | handle_file(local_path) 85 | except Exception as e: 86 | print(f"Error downloading file {s3_file['path']}: {e}") 87 | print( 88 | "\nIf you're unsure how to handle this error, open an issue on Github: https://github.com/CrowdStrike/FDR/issues or contact support.\n" 89 | ) 90 | exit(1) 91 | 92 | 93 | def consume_data_replicator(): 94 | """Consume from data replicator and track number of messages/files/bytes downloaded.""" 95 | 96 | sleep_time = 1 97 | msg_cnt = 0 98 | file_cnt = 0 99 | byte_cnt = 0 100 | 101 | while True: # We want to continuously poll the queue for new messages. 102 | # Receive messages from queue if any exist (NOTE: receive_messages() only receives a few messages at a 103 | # time, it does NOT exhaust the queue) 104 | for msg in queue.receive_messages(VisibilityTimeout=VISIBILITY_TIMEOUT): 105 | msg_cnt += 1 106 | body = json.loads(msg.body) # grab the actual message body 107 | download_message_files(body) 108 | file_cnt += body["fileCount"] 109 | byte_cnt += body["totalSize"] 110 | # msg.delete() must be called or the message will be returned to the SQS queue after 111 | # VISIBILITY_TIMEOUT seconds 112 | msg.delete() 113 | time.sleep(sleep_time) 114 | 115 | print( 116 | "Messages consumed: %i\tFile count: %i\tByte count: %i" 117 | % (msg_cnt, file_cnt, byte_cnt) 118 | ) 119 | 120 | 121 | if __name__ == "__main__": 122 | consume_data_replicator() 123 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. 
We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CrowdStrike Falcon](https://raw.githubusercontent.com/CrowdStrike/falconpy/main/docs/asset/cs-logo.png)
[![Twitter URL](https://img.shields.io/twitter/url?label=Follow%20%40CrowdStrike&style=social&url=https%3A%2F%2Ftwitter.com%2FCrowdStrike)](https://twitter.com/CrowdStrike)
2 | # Falcon Data Replicator
3 | [![Bandit](https://github.com/CrowdStrike/FDR/actions/workflows/bandit.yml/badge.svg)](https://github.com/CrowdStrike/FDR/actions/workflows/bandit.yml)
4 | [![Flake8](https://github.com/CrowdStrike/FDR/actions/workflows/linting.yml/badge.svg)](https://github.com/CrowdStrike/FDR/actions/workflows/linting.yml)
5 | [![Python Lint](https://github.com/CrowdStrike/FDR/actions/workflows/pylint.yml/badge.svg)](https://github.com/CrowdStrike/FDR/actions/workflows/pylint.yml)
6 | [![CodeQL](https://github.com/CrowdStrike/FDR/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/CrowdStrike/FDR/actions/workflows/codeql-analysis.yml)
7 | 
8 | The Falcon Data Replicator replicates log data from your CrowdStrike environment to a stand-alone target. This target can be a location on the file system,
9 | or a cloud storage bucket.
10 | > Currently AWS is the only cloud provider implemented.
11 | ## Requirements
12 | + Python 3.6+
13 | + boto3
14 | + CrowdStrike Falcon FDR credentials
15 | + CrowdStrike Falcon FDR SQS queue URL
16 | ## Stand-alone solution
17 | + [falcon_data_replicator.ini](https://github.com/CrowdStrike/FDR/blob/main/falcon_data_replicator.ini) - Configuration file
18 | + [standalone/falcon_data_replicator.py](https://github.com/CrowdStrike/FDR/blob/main/standalone/falcon_data_replicator.py) - Stand-alone solution application file
19 | ### Configuration
20 | The `falcon_data_replicator.ini` file contains all of the parameters necessary to configure the
21 | solution for replication to the local file system and/or a storage bucket in AWS S3. After
22 | retrieving the AWS credentials and SQS queue details from your Falcon console, edit this file
23 | to reflect your environment.
24 | #### Required parameters
25 | The following parameters must be provided in order for the solution to operate.
26 | + `AWS_KEY` - AWS client ID provided to you by the CrowdStrike Falcon console
27 | + `AWS_SECRET` - AWS client secret provided to you by the CrowdStrike Falcon console
28 | + `QUEUE_URL` - AWS SQS queue URL provided to you by the CrowdStrike Falcon console
29 | + `OUTPUT_PATH` - File path where downloaded files will be stored; not used for in-memory transfers
30 | + `VISIBILITY_TIMEOUT` - Time in seconds before a message is returned to the SQS queue
31 | + `REGION_NAME` - The name of the AWS region where your CrowdStrike SQS queue resides
32 | + `MESSAGE_DELAY` - The time in seconds to wait in between the processing of each message
33 | + `QUEUE_DELAY` - The time in seconds to wait before each check of the queue for more messages
34 | + `LOG_FILE` - The name and path of the log file
35 | #### Destination parameters
36 | The following parameters configure our destination details. If these parameters are not present,
37 | upload to our bucket is skipped and the local files are retained after download.
38 | + `TARGET_BUCKET` - The name of the AWS bucket we will use for our target destination
39 | + `TARGET_REGION` - The name of the AWS region our target bucket resides within
40 | + `REMOVE_LOCAL_FILE` - Boolean representing whether or not to remove local files after they are uploaded
41 | + `IN_MEMORY_TRANSFER_ONLY` - Transfer the file from the source bucket to the destination bucket without storing the file on the local file system.
42 | + `DO_OCSF_CONVERSION` - Boolean representing whether or not to convert the events to the OCSF format 43 | + `TARGET_ACCOUNT_ID` - The AWS account ID of the target bucket 44 | + `OCSF_ROLE_NAME` - The name of the role to use when writing to the target bucket 45 | + `OCSF_ROLE_EXTERNAL_ID` - The external ID to use when assuming the role provided by OCSF_ROLE_NAME. Default: `CrowdStrikeCustomSource` 46 | + `OCSF_INGEST_LATENCY` - The maximum amount of time (in minutes) to buffer records before publishing. Min: 5 Max: 60 Default: 5 47 | + `OCSF_MAX_FILE_SIZE` - Maximum size of a file in MB before it is uploaded. Min: 1 Max: 200 Default: 200 48 | > Note: Security Lake performance is sensitive to the number of files that must be read for a query. Use `OCSF_MAX_FILE_SIZE` and `OCSF_INGEST_LATENCY` to tune performance for your use case. 49 | ### Running the solution 50 | After updating the configuration file to reflect your environment specifics, you can run this solution using: 51 | ```bash 52 | python3 falcon_data_replicator.py 53 | ``` 54 | If your configuration file is not present in the same directory as the application file, you can reference 55 | this path using the _-f_ or _--config_file_ command line parameters. 56 | ```bash 57 | python3 falcon_data_replicator.py -f some_path/falcon_data_replicator.ini 58 | ``` 59 | ## Container-based 60 | _Coming soon_ 61 | 62 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | This document outlines security policy and procedures for the CrowdStrike `FDR Connector` project. 3 | + [Supported Python versions](#supported-python-versions) 4 | + [Supported FDR Connector versions](#supported-fdr-connector-versions) 5 | + [Reporting a potential security vulnerability](#reporting-a-potential-security-vulnerability) 6 | + [Disclosure and Mitigation Process](#disclosure-and-mitigation-process) 7 | 8 | ## Supported Python versions 9 | 10 | FDR Connector functionality is unit tested to run under the following versions of Python. 11 | 12 | | Version | Supported | 13 | | :------- | :--------- | 14 | | 3.9.x | :white_check_mark: | 15 | | 3.8.x | :white_check_mark: | 16 | | 3.7.x | :white_check_mark: | 17 | | 3.6.x | :white_check_mark: | 18 | | <= 3.5 | :x: | 19 | | <= 2.x.x | :x: | 20 | 21 | ## Supported FDR Connector versions 22 | 23 | When discovered, we release security vulnerability patches for the most recent release at an accelerated cadence. 24 | 25 | ## Reporting a potential security vulnerability 26 | 27 | Please report suspected security vulnerabilities by: 28 | + Submitting a [bug](https://github.com/CrowdStrike/FDR/issues) 29 | + Submitting a [pull request](https://github.com/CrowdStrike/FDR/pulls) to potentially resolve the issue 30 | 31 | ## Disclosure and mitigation process 32 | 33 | Upon receiving a security bug report, the issue will be assigned to one of the project maintainers. This person will coordinate the related fix and release 34 | process, involving the following steps: 35 | + Communicate with you to confirm we have received the report and provide you with a status update. 36 | - You should receive this message within 48 - 72 business hours. 37 | + Confirmation of the issue and a determination of affected versions. 38 | + An audit of the codebase to find any potentially similar problems. 39 | + Preparation of patches for all releases still under maintenance. 
40 |   - These patches will be submitted as a separate pull request and contain a version update.
41 |   - This pull request will be flagged as a security fix.
42 | 
43 | ## Comments
44 | If you have suggestions on how this process could be improved, please let us know by [submitting an issue](https://github.com/CrowdStrike/FDR/issues).
45 | 
--------------------------------------------------------------------------------
/falcon_data_replicator.ini:
--------------------------------------------------------------------------------
1 | # ____ __ ___ __ ___ ___ __
2 | # / __/__ _/ /______ ___ / _ \___ _/ /____ _ / _ \___ ___ / (_)______ _/ /____ ____
3 | # / _// _ `/ / __/ _ \/ _ \ / // / _ `/ __/ _ `/ / , _/ -_) _ \/ / / __/ _ `/ __/ _ \/ __/
4 | # /_/ \_,_/_/\__/\___/_//_/ /____/\_,_/\__/\_,_/ /_/|_|\__/ .__/_/_/\__/\_,_/\__/\___/_/
5 | # /_/
6 | # falcon_data_replicator.ini
7 | # Creation date: 04.03.21, jshcodes@CrowdStrike
8 | #
9 | # Local configuration file for Falcon Data Replicator integration
10 | #
11 | # =========================================================================================================
12 | # ____ ____ _
13 | # / ___| ___ _ _ _ __ ___ ___ | _ \ __ _| |_ __ _
14 | # \___ \ / _ \| | | | '__/ __/ _ \ | | | |/ _` | __/ _` |
15 | # ___) | (_) | |_| | | | (_| __/ | |_| | (_| | || (_| |
16 | # |____/ \___/ \__,_|_| \___\___| |____/ \__,_|\__\__,_|
17 | #
18 | # These values must be populated in order for this solution to operate
19 | #
20 | [Source Data]
21 | # AWS security credentials, provided to you by the CrowdStrike console (String)
22 | AWS_KEY = AWS_KEY_GOES_HERE
23 | # (String)
24 | AWS_SECRET = AWS_SECRET_GOES_HERE
25 | # URL of the SQS queue provided to you by CrowdStrike (String)
26 | # Should be an SQS URL
27 | QUEUE_URL = https://AWS_QUEUE_URL_GOES_HERE
28 | # This is the folder where downloads are stored. If you are immediately uploading these files to another
29 | # S3 bucket, then you can name this folder anything. If you plan on storing this data on the file system
30 | # then this would represent that location. (String)
31 | OUTPUT_PATH = downloaded
32 | # Time in seconds before a message is added back to the SQS queue if not deleted.
33 | # Ensure this is large enough for you to safely finish processing any downloaded files. (Integer)
34 | # Example: 300
35 | VISIBILITY_TIMEOUT = 300
36 | # Name of the AWS region for our source bucket (String)
37 | # This should match the region of your CrowdStrike FDR source bucket
38 | REGION_NAME = us-west-1
39 | # Delay (in seconds) to wait in between messages
40 | MESSAGE_DELAY = 1
41 | # Delay (in seconds) to wait in between message runs
42 | QUEUE_DELAY = 5
43 | # Log file
44 | LOG_FILE = falcon_data_replicator.log
45 | # Maximum number of processor threads to use for processing.
46 | # Leaving this value blank will tell the application to make
47 | # its best guess. The maximum number of threads that will
48 | # be generated at one time should not exceed 10.
49 | # (Max number of SQS messages received per iteration.)
50 | MAX_THREADS = 5 51 | # Logging level, INFO or DEBUG 52 | LOG_LEVEL = INFO 53 | 54 | # ____ _ _ _ _ ____ _ 55 | # | _ \ ___ ___| |_(_)_ __ __ _| |_(_) ___ _ __ | _ \ __ _| |_ __ _ 56 | # | | | |/ _ \/ __| __| | '_ \ / _` | __| |/ _ \| '_ \ | | | |/ _` | __/ _` | 57 | # | |_| | __/\__ \ |_| | | | | (_| | |_| | (_) | | | | | |_| | (_| | || (_| | 58 | # |____/ \___||___/\__|_|_| |_|\__,_|\__|_|\___/|_| |_| |____/ \__,_|\__\__,_| 59 | # 60 | # If these values are not defined, this solution will save downloaded files to the OUTPUT_PATH location only. 61 | # 62 | [Destination Data] 63 | # Target bucket (String) 64 | # The name of your bucket. This bucket must exist. 65 | TARGET_BUCKET = TARGET_BUCKET_NAME_GOES_HERE 66 | # Name of our target AWS region (String) 67 | # Example: us-east-1 68 | TARGET_REGION = TARGET_REGION_NAME_GOES_HERE 69 | # Remove local files after upload (Boolean) 70 | # Allowed values: True, False, Yes, No 71 | REMOVE_LOCAL_FILE = yes 72 | # No local file system usage 73 | # Allowed values: True, False, Yes, No 74 | IN_MEMORY_TRANSFER_ONLY = yes 75 | # Convert inbound data into OCSF format before 76 | # publishing it to the target bucket or folder 77 | DO_OCSF_CONVERSION = No 78 | # OCSF Target AWS Account Id 79 | TARGET_ACCOUNT_ID= TARGET_ACCOUNT_ID 80 | # The role name to assume to write to Security Lake bucket 81 | OCSF_ROLE_NAME = 82 | # The external ID used to assume the role in the target account 83 | OCSF_ROLE_EXTERNAL_ID = CrowdStrikeCustomSource 84 | # Security Lake performance is sensitive to the number of files that must be read for a query. 85 | # The max amount of time (in minutes) to buffer records before publishing. Min: 5 Max: 60 Default: 5 86 | OCSF_INGEST_LATENCY = 5 87 | # Maximum size of a file in MB before it is uploaded. Min: 200 Max: 256 Default: 256 88 | OCSF_MAX_FILE_SIZE = 256 -------------------------------------------------------------------------------- /falcon_data_replicator.py: -------------------------------------------------------------------------------- 1 | r"""Falcon Data Replicator - Local File System / AWS S3 connector 2 | 3 | _____ _ ____ _ ____ _ _ _ 4 | | ___|_ _| | ___ ___ _ __ | _ \ __ _| |_ __ _ | _ \ ___ _ __ | (_) ___ __ _| |_ ___ _ __ 5 | | |_ / _` | |/ __/ _ \| '_ \ | | | |/ _` | __/ _` | | |_) / _ \ '_ \| | |/ __/ _` | __/ _ \| '__| 6 | | _| (_| | | (_| (_) | | | | | |_| | (_| | || (_| | | _ < __/ |_) | | | (_| (_| | || (_) | | 7 | |_| \__,_|_|\___\___/|_| |_| |____/ \__,_|\__\__,_| |_| \_\___| .__/|_|_|\___\__,_|\__\___/|_| 8 | |_| 9 | 10 | . 11 | Your data | _____________________________________________________ ___ 12 | is here! | | _____ ________ _ __ | __ 13 | \ _______| | / ___/______ _ _____/ / __/ /_____(_) /_____ | ___ 14 | / _____ | | / /__/ __/ _ \ |/|/ / _ /\ \/ __/ __/ / '_/ -_) | 15 | / /(__) || | \___/_/ \___/__,__/\_,_/___/\__/_/ /_/_/\_\\__/ | ___ 16 | ________/ / |OO| || | | 17 | | Hemi |-------|| | --= FALCON DATA REPLICATOR >> | ___ 18 | (| | -.|| |_______________________ | ____ 19 | | ____ \ ||_________||____________ | ____ ____ | 20 | /| / __ \ |______|| / __ \ / __ \ | | / __ \ / __ \ |\ ___ 21 | \|| / \ |_______________| / \ |_| / \ |__| |___________| / \ |__| / \|_|/ 22 | | () | | () | | () | | () | | () | ____ 23 | \__/ \__/ \__/ \__/ \__/ 24 | 25 | 26 | Local File System / AWS S3 connector 27 | 28 | NOTE: See https://github.com/CrowdStrike/FDR for details on how to use this application. 
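
Usage (PATH_TO_CONFIG_FILE is a placeholder for your own path):
    python3 falcon_data_replicator.py [-f PATH_TO_CONFIG_FILE]

When -f / --config_file is omitted, falcon_data_replicator.ini in the current
working directory is used.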
29 | """ 30 | import json 31 | import io 32 | import os 33 | import sys 34 | import time 35 | import pathlib 36 | import signal as sig 37 | import configparser 38 | import argparse 39 | import logging 40 | from logging.handlers import RotatingFileHandler 41 | from functools import partial 42 | from concurrent.futures import ThreadPoolExecutor 43 | from threading import main_thread 44 | from ocsf import transform_fdr_data_to_ocsf_data, upload_parquet_files_to_s3 45 | from fdr.fdrconnector import FDRConnector 46 | 47 | # This solution is dependant upon the AWS boto3 Python library 48 | try: 49 | import boto3 50 | except ImportError as err: 51 | print(err) 52 | raise SystemExit("The AWS boto3 library is required to run Falcon " 53 | "Data Replicator.\nPlease execute 'pip3 install boto3'" 54 | ) from err 55 | 56 | try: 57 | from aws_assume_role_lib import assume_role 58 | except ImportError as err: 59 | print(err) 60 | raise SystemExit("The aws-assume-role-lib library is required to run Falcon " 61 | "Data Replicator.\nPlease execute 'pip3 install aws-assume-role-lib'" 62 | ) from err 63 | # Global FDR 64 | FDR = None 65 | 66 | 67 | # This method is used as an exit handler. When a quit, cancel or interrupt is received, 68 | # this method forces FDR to finish processing the file it is working on before exiting. 69 | def clean_exit(stat, signal, frame): # pylint: disable=W0613 70 | """Graceful exit handler for SIGINT, SIGQUIT and SIGTERM""" 71 | stat.set_exit(True) 72 | return True 73 | 74 | 75 | def do_keyed_delete(file_target: str, log: logging.Logger): 76 | """Remove temporary folder artifacts.""" 77 | os.remove(file_target) 78 | os.rmdir(os.path.dirname(file_target)) 79 | pure = pathlib.PurePath(file_target) 80 | # Remove the parent temporary folders if they exist 81 | try: 82 | os.rmdir(pure.parent.parent) 83 | except OSError: 84 | log.debug(f"Skipping deletion of {pure.parent.parent} as not empty.") 85 | else: 86 | log.debug("Removed %s", pure.parent.parent) 87 | if FDR.output_path not in pure.parent.parent.parent.name: 88 | try: 89 | os.rmdir(pure.parent.parent.parent) 90 | except OSError: 91 | log.debug( 92 | f"Skipping deletion of {pure.parent.parent.parent} as not empty.") 93 | else: 94 | log.debug("Removed %s", pure.parent.parent.parent) 95 | 96 | 97 | def handle_file(path, key, target_bkt, file_object=None, log_util: logging.Logger = None): 98 | """Process the file. 
If configured, upload this file to our target bucket and remove it.""" 99 | total_events_in_file = 0 100 | transform_time = 0 101 | upload_time = 0 102 | # If we've defined a target bucket 103 | if FDR.target_bucket_name: 104 | if not file_object: 105 | if FDR.do_ocsf: 106 | # Send the gzip'd file to be transformed and write it as parquet file 107 | start_transform_time = time.time() 108 | total_events_in_file = transform_fdr_data_to_ocsf_data( 109 | FDR, path, log_util) 110 | transform_time = time.time() - start_transform_time 111 | # upload the file that meets the criteria 112 | start_upload_time = time.time() 113 | upload_parquet_files_to_s3(FDR, target_bkt, log_util) 114 | upload_time = time.time() - start_upload_time 115 | else: 116 | start_upload_time = time.time() 117 | # Open our local file (binary) 118 | with open(path, 'rb') as data: 119 | # Perform the upload to the same key in our target bucket 120 | target_bkt.upload_fileobj( 121 | data, FDR.target_bucket_name, key) 122 | log_util.info('Uploaded file to path %s', key) 123 | upload_time = time.time() - start_upload_time 124 | # Only perform this step if configured to do so 125 | if FDR.remove_local_file: 126 | # Remove the file from the local file system 127 | do_keyed_delete(path, log_util) 128 | 129 | else: 130 | if FDR.do_ocsf: 131 | # OCSF conversion using IN Memory data from s3 source 132 | start_transform_time = time.time() 133 | total_events_in_file = transform_fdr_data_to_ocsf_data( 134 | FDR, file_object, log_util) 135 | transform_time = time.time() - start_transform_time 136 | # upload the file that meets the criteria 137 | start_upload_time = time.time() 138 | upload_parquet_files_to_s3(FDR, target_bkt, log_util) 139 | upload_time = time.time() - start_upload_time 140 | else: 141 | start_upload_time = time.time() 142 | target_bkt.upload_fileobj( 143 | file_object, FDR.target_bucket_name, key) 144 | log_util.info('Uploaded file to path %s', key) 145 | upload_time = time.time() - start_upload_time 146 | if os.path.exists(f"{FDR.output_path}/{key}"): 147 | # Something about our zip handling is leaving artifacts on the drive 148 | do_keyed_delete(f"{FDR.output_path}/{key}", log_util) 149 | # We're done 150 | return {'done': True, 'total_events_per_input_file': total_events_in_file, 151 | 'transform_time_per_input_file': transform_time, 152 | 'upload_time_per_input_file': upload_time 153 | } 154 | 155 | 156 | def download_message_files(msg, s3ta, s3or, log: logging.Logger): 157 | """Download the file specified in the SQS message and trigger file handling.""" 158 | total_event_count = 0 159 | total_download_time_sec = 0.0 160 | total_transform_time_sec = 0.0 161 | total_upload_time_sec = 0.0 162 | # For every file in our message 163 | for s3_file in msg['files']: 164 | # Retrieve the bucket path for this file 165 | s3_path = s3_file['path'] 166 | total_download_time_per_input_file = 0 167 | if not FDR.in_memory_transfer_only: 168 | # Construct output path for this message's files 169 | msg_output_path = os.path.realpath(os.path.join(FDR.output_path, msg["pathPrefix"])) 170 | # Only write files to the specified output_path 171 | if os.path.commonpath([FDR.output_path, msg_output_path]) != FDR.output_path: 172 | log.info( 173 | f"Skipping {msg_output_path} to prevent writes outside of output path: {FDR.output_path}" 174 | ) 175 | continue 176 | # Ensure directory exists at output path 177 | if not os.path.exists(msg_output_path): 178 | # Create it if it doesn't 179 | os.makedirs(msg_output_path) 180 | # Create a local path 
name for our destination file based off of the S3 path 181 | local_path = os.path.realpath(os.path.join(FDR.output_path, s3_path)) 182 | # Only write files to the specified output_path 183 | if os.path.commonpath([FDR.output_path, local_path]) != FDR.output_path: 184 | log.info( 185 | f"Skipping {local_path} to prevent writes outside of output path: {FDR.output_path}" 186 | ) 187 | continue 188 | if not os.path.exists(os.path.dirname(local_path)): 189 | # Handle fdr platform and time partitioned folders 190 | os.makedirs(os.path.dirname(local_path)) 191 | start_download_time = time.time() 192 | # Open our local file for binary write 193 | with open(local_path, 'wb') as data: 194 | # Download the file from S3 into our opened local file 195 | s3or.download_fileobj(msg['bucket'], s3_path, data) 196 | log.debug('Downloaded file to path %s', local_path) 197 | total_download_time_per_input_file = time.time() - start_download_time 198 | # Handle S3 upload if configured 199 | result = handle_file(local_path, s3_path, s3ta, None, log) 200 | else: 201 | log.debug('Downloading file to memory') 202 | start_download_time = time.time() 203 | s3t = boto3.resource("s3", 204 | region_name=FDR.region_name, 205 | aws_access_key_id=FDR.aws_key, 206 | aws_secret_access_key=FDR.aws_secret 207 | ) 208 | bkt = s3t.Bucket(msg['bucket']) 209 | obj = bkt.Object(s3_path) 210 | stream = io.BytesIO() 211 | obj.download_fileobj(stream) 212 | # Seek to the beginning of the stream before passing it to the upload handler 213 | stream.seek(0) 214 | total_download_time_per_input_file = time.time() - start_download_time 215 | result = handle_file(None, s3_path, s3ta, stream, log) 216 | 217 | total_event_count += result['total_events_per_input_file'] 218 | total_download_time_sec += total_download_time_per_input_file 219 | total_transform_time_sec += result['transform_time_per_input_file'] 220 | total_upload_time_sec += result['upload_time_per_input_file'] 221 | # pif is per_input_file 222 | log.debug( 223 | 'total_events_pif=%i, ' 224 | 'total_download_time_pif=%f, ' 225 | 'total_transform_time_pif=%f, ' 226 | 'total_upload_time_pif=%f, ' 227 | 'filepath=%s', 228 | result['total_events_per_input_file'], 229 | total_download_time_per_input_file, 230 | result['transform_time_per_input_file'], 231 | result['upload_time_per_input_file'], 232 | s3_path) 233 | 234 | return {'total_event_count': total_event_count, 235 | 'total_download_time_sec': total_download_time_sec, 236 | 'total_transform_time_sec': total_transform_time_sec, 237 | 'total_upload_time_sec': total_upload_time_sec} 238 | 239 | 240 | def process_queue_message(msg, s3b, s3o, log_util: logging.Logger): 241 | """Process the message off of the queue and trigger the file download.""" 242 | log_util.debug("Processing message [%s]", msg.message_id) 243 | # Grab the actual message body 244 | body = json.loads(msg.body) 245 | # Download the file to our local file system and potentially upload it to S3 246 | metrics = download_message_files(body, s3b, s3o, log_util) 247 | log_util.debug("Removing message [%s] from queue", msg.message_id) 248 | # Remove our message from the queue, if this is not performed in visibility_timeout seconds 249 | # this message will be restored to the queue for follow-up processing 250 | msg.delete() 251 | 252 | return body['fileCount'], body['totalSize'], True, metrics 253 | 254 | 255 | def do_shutdown(log_util: logging.Logger, clean: bool = False): 256 | """Perform a graceful shutdown.""" 257 | if clean: 258 | log_util.warning("Routine exit 
requested") 259 | sys.exit(0) 260 | else: 261 | log_util.warning("Unexpected error occurred") 262 | sys.exit(1) 263 | 264 | 265 | def consume_data_replicator(s3_bkt, s3_cs_bkt, log: logging.Logger): 266 | """Consume from data replicator and track number of messages/files/bytes downloaded.""" 267 | # Tracking details 268 | total_event_count = 0 269 | total_download_time_sec = 0.0 270 | total_transform_time_sec = 0.0 271 | total_upload_time_sec = 0.0 272 | total_time_sec = 0.0 273 | msg_cnt = 0 274 | file_cnt = 0 275 | byte_cnt = 0 276 | 277 | # Continuously poll the queue for new messages. 278 | while not FDR.exiting: 279 | received = False 280 | # Receive messages from queue if any exist and send each message to it's own thread for processing 281 | # (NOTE: receive_messages() only receives a few messages at a time, it does NOT exhaust the queue) 282 | # 283 | with ThreadPoolExecutor(FDR.max_threads, thread_name_prefix="thread") as executor: 284 | futures = { 285 | executor.submit(process_queue_message, msg, 286 | s3_bkt, s3_cs_bkt, log) 287 | for msg in queue.receive_messages(VisibilityTimeout=FDR.visibility_timeout, MaxNumberOfMessages=10) 288 | } 289 | max_total_download_time_sec = 0.0 290 | max_total_transform_time_sec = 0.0 291 | max_total_upload_time_sec = 0.0 292 | max_total_time_sec = 0.0 293 | for fut in futures: 294 | msg_cnt += 1 295 | res = fut.result() 296 | file_cnt += res[0] 297 | byte_cnt += res[1] 298 | received = res[2] 299 | total_event_count += res[3]['total_event_count'] 300 | max_total_download_time_sec = max(max_total_download_time_sec, res[3]['total_download_time_sec']) 301 | max_total_transform_time_sec = max(max_total_transform_time_sec, res[3]['total_transform_time_sec']) 302 | max_total_upload_time_sec = max(max_total_upload_time_sec, res[3]['total_upload_time_sec']) 303 | m_tot_time_sec = max_total_download_time_sec + \ 304 | max_total_transform_time_sec + max_total_upload_time_sec 305 | max_total_time_sec = max(max_total_time_sec, m_tot_time_sec) 306 | 307 | if not received: 308 | log.info("No messages received, sleeping for %i seconds", 309 | FDR.queue_delay) 310 | for _ in range(0, FDR.queue_delay): 311 | time.sleep(1) 312 | if FDR.exiting: 313 | do_shutdown(log, True) 314 | else: 315 | total_download_time_sec += max_total_download_time_sec 316 | total_transform_time_sec += max_total_transform_time_sec 317 | total_upload_time_sec += max_total_upload_time_sec 318 | total_time_sec += max_total_time_sec 319 | log.info( 320 | "Messages_consumed: %i\t" 321 | "File_count: %i\t" 322 | "total_event_count: %i\t" 323 | "total_time_sec: %f\t" 324 | "total_download_time_sec: %f\t" 325 | "total_transform_time_sec: %f\t" 326 | "total_upload_time_sec: %f\t" 327 | "Byte_count: %i", 328 | msg_cnt, 329 | file_cnt, 330 | total_event_count, 331 | total_time_sec, 332 | total_download_time_sec, 333 | total_transform_time_sec, 334 | total_upload_time_sec, 335 | byte_cnt) 336 | 337 | # We've requested an exit 338 | if FDR.exiting: 339 | # Clean exit 340 | do_shutdown(log, True) 341 | else: 342 | # Something untoward has occurred 343 | do_shutdown(log, False) 344 | 345 | 346 | def setup_logging(connector: FDRConnector): 347 | """Configure logging.""" 348 | # Set our parent thread name 349 | thread = main_thread() 350 | thread.name = "main" 351 | # Ask boto to keep his voice down 352 | logging.getLogger('boto').setLevel(logging.CRITICAL) 353 | logging.getLogger('boto3').setLevel(logging.CRITICAL) 354 | logging.getLogger('botocore').setLevel(logging.CRITICAL) 355 | 
logging.getLogger('s3transfer').setLevel(logging.CRITICAL) 356 | logging.getLogger('urllib3').setLevel(logging.CRITICAL) 357 | # Log level 358 | log_level = logging.INFO 359 | if FDR.log_level.upper() == "DEBUG": 360 | log_level = logging.DEBUG 361 | # Setup our root logger 362 | logging.basicConfig( 363 | level=log_level, format="%(asctime)-8s %(levelname)-8s %(name)s/%(threadName)-10s %(message)s") 364 | # Create our FDR logger 365 | log_util = logging.getLogger("FDR") 366 | # Rotate log file handler 367 | rfh = RotatingFileHandler( 368 | connector.log_file, maxBytes=20971520, backupCount=5) 369 | # Log file output format 370 | f_format = logging.Formatter( 371 | '%(asctime)s %(levelname)-8s %(name)s/%(threadName)-10s %(message)s') 372 | # Set the log file output level to INFO 373 | rfh.setLevel(logging.INFO) 374 | # Add our log file formatter to the log file handler 375 | rfh.setFormatter(f_format) 376 | # Add our log file handler to our logger 377 | log_util.addHandler(rfh) 378 | # Log our pre-startup event 379 | log_util.info(" _____ ____ ____ _") 380 | log_util.info("| ___| _ \\| _ \\ (.\\") 381 | log_util.info("| |_ | | | | |_) | |/(\\") 382 | log_util.info("| _| | |_| | _ < \\(\\\\") 383 | log_util.info("|_| |____/|_| \\_\\ \"^\"`\\") 384 | log_util.info("Process starting up with Thread Count=%i", FDR.max_threads) 385 | 386 | return log_util 387 | 388 | 389 | def setup_signal_handlers(connector: FDRConnector): 390 | """Setup our graceful exit handlers.""" 391 | sig.signal(sig.SIGINT, partial(clean_exit, connector)) 392 | sig.signal(sig.SIGTERM, partial(clean_exit, connector)) 393 | sig.signal(sig.SIGQUIT, partial(clean_exit, connector)) 394 | 395 | 396 | def get_crowdstrike_aws_objects(connector: FDRConnector): 397 | """Retrieve the CrowdStrike AWS objects storing our FDR data.""" 398 | sqs = boto3.resource('sqs', 399 | region_name=connector.region_name, 400 | aws_access_key_id=connector.aws_key, 401 | aws_secret_access_key=connector.aws_secret 402 | ) 403 | # Connect to our CrowdStrike provided S3 bucket 404 | s3bkt = boto3.client('s3', 405 | region_name=connector.region_name, 406 | aws_access_key_id=connector.aws_key, 407 | aws_secret_access_key=connector.aws_secret 408 | ) 409 | 410 | # Create our queue object for handling message traffic 411 | sqs_queue = sqs.Queue(url=FDR.queue_url) 412 | 413 | return sqs_queue, s3bkt 414 | 415 | 416 | # pylint: disable=R0913 417 | def get_aws_client(resource_type, account_id, aws_region, role_name, session_name, external_id, role_path='/'): 418 | """ 419 | This function Assumes role and returns a client 420 | 421 | Args: 422 | resource_type (string): Resource type to initialize (Ex: ec2, s3) 423 | account_id (string): Target account Id to assume role 424 | aws_region (string): AWS region to initialize service 425 | role_name (string): Role name to assume 426 | session_name (string): Assume role session name 427 | external_id (string): External Id to assume role 428 | role_path (string): Role Path, default = '/' 429 | 430 | Returns: 431 | serviceClient (botocore client): botocore resource client 432 | 433 | """ 434 | try: 435 | # Make Role ARN 436 | if role_path == '/': 437 | role_arn = f'arn:aws:iam::{account_id}:role/{role_name}' 438 | else: 439 | role_arn = f'arn:aws:iam::{account_id}:role/{role_path.lstrip("/").rstrip("/")}/{role_name}' 440 | 441 | # Assume role 442 | session = boto3.Session(region_name=aws_region) 443 | assumed_role_session = assume_role(session, role_arn, RoleSessionName=session_name, ExternalId=external_id) 444 | return 
assumed_role_session.client(resource_type, region_name=aws_region) 445 | 446 | except Exception as error: 447 | print(f'Failed to assume the role for Account: {account_id}: {error}') 448 | raise 449 | 450 | 451 | def get_s3_target(connector: FDRConnector, log_util: logging.Logger): 452 | """Retrieve details for any S3 bucket uploads.""" 453 | returned = None 454 | if FDR.target_bucket_name and connector.target_region_name: 455 | log_util.info("Upload to AWS S3 enabled") 456 | 457 | # Connect to our target S3 bucket, uses the existing 458 | # client configuration to connect (Not the CS provided ones) 459 | if connector.do_ocsf: 460 | returned = get_aws_client('s3', 461 | connector.target_account_id, 462 | connector.target_region_name, 463 | connector.ocsf_role_name, 464 | "CrowdStrikeCustomSource", 465 | connector.ocsf_role_external_id 466 | ) 467 | else: 468 | returned = boto3.client( 469 | 's3', region_name=connector.target_region_name) 470 | 471 | return returned 472 | 473 | 474 | def consume_arguments(): 475 | """Consume any provided command line arguments.""" 476 | # Configure our accepted command line parameters 477 | parser = argparse.ArgumentParser( 478 | description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 479 | parser.add_argument("-f", "--config_file", dest="config_file", help="Path to the configuration file", 480 | required=False) 481 | # Parse any parameters passed at runtime 482 | return parser.parse_args() 483 | 484 | 485 | def initialize_connector(cmd_line: argparse.Namespace): 486 | """Initialize an instance of our FDRConnector class.""" 487 | # If we were not provided a configuration file name 488 | if not cmd_line.config_file: 489 | # Use the default name / location provided in our repo 490 | config_file = "falcon_data_replicator.ini" 491 | else: 492 | # Use the configuration file provided at runtime 493 | config_file = cmd_line.config_file 494 | # Read in our configuration parameters 495 | configuration = configparser.ConfigParser() 496 | configuration.read(config_file) 497 | # Create our connector 498 | return FDRConnector(configuration) 499 | 500 | 501 | # Start our main routine 502 | if __name__ == '__main__': 503 | # Consume any provided command line arguments 504 | cmdline = consume_arguments() 505 | # Initialize our FDR connector 506 | FDR = initialize_connector(cmdline) 507 | # Setup logging 508 | logger = setup_logging(FDR) 509 | # Enable our graceful exit handler to allow uploads and artifact 510 | # cleanup to complete for SIGINT, SIGTERM and SIGQUIT signals. 
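# Note: SIGKILL cannot be trapped; if the process is force-killed, any in-flight
# message simply reappears on the queue after VISIBILITY_TIMEOUT and is retried.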
511 | setup_signal_handlers(FDR) 512 | # Connect to our CrowdStrike provided SQS queue and S3 bucket 513 | queue, s3_cs = get_crowdstrike_aws_objects(FDR) 514 | # If we are doing S3 uploads 515 | s3_target = get_s3_target(FDR, logger) 516 | logger.info("Startup complete") 517 | # Start consuming the replicator feed 518 | consume_data_replicator(s3_target, s3_cs, logger) 519 | -------------------------------------------------------------------------------- /fdr/fdrconnector.py: -------------------------------------------------------------------------------- 1 | """Falcon Data Replicator - Connection configuration class.""" 2 | import os 3 | import sys 4 | import configparser 5 | 6 | 7 | # Class to hold our connector config and to track our running status 8 | class FDRConnector: # pylint: disable=R0902 9 | """The FDRConnector class contains the details of this connection and tracks the status of our process.""" 10 | 11 | def __init__(self, config: configparser.ConfigParser): # pylint: disable=R0912,R0915 12 | """Initialize our status class""" 13 | self.set_exit(False) 14 | # We cannot read our source parameters, exit the routine 15 | if "Source Data" not in config: 16 | print("Unable to load configuration file parameters. Routine halted.") 17 | sys.exit(1) 18 | 19 | # AWS Client ID - Provided by CrowdStrike 20 | self.aws_key = config["Source Data"]["AWS_KEY"] 21 | # AWS Client Secret - Provided by CrowdStrike 22 | self.aws_secret = config["Source Data"]["AWS_SECRET"] 23 | # AWS SQS queue URL - Provided by CrowdStrike 24 | self.queue_url = config["Source Data"]["QUEUE_URL"] 25 | # Local file output location 26 | self.output_path = os.path.realpath(config["Source Data"]["OUTPUT_PATH"]) 27 | # Timeout before messages are returned to the queue 28 | self.visibility_timeout = int(config["Source Data"]["VISIBILITY_TIMEOUT"]) 29 | # Message delay 30 | self.message_delay = int(config["Source Data"]["MESSAGE_DELAY"]) 31 | # Queue delay 32 | self.queue_delay = int(config["Source Data"]["QUEUE_DELAY"]) 33 | # Log File 34 | self.log_file = config["Source Data"]["LOG_FILE"] 35 | # AWS Region name for our source S3 bucket 36 | self.region_name = config["Source Data"]["REGION_NAME"] 37 | # Log setting 38 | self.log_level = config["Source Data"].get("LOG_LEVEL", "INFO") 39 | max_threads = config["Source Data"].get("MAX_THREADS", False) 40 | if not max_threads: 41 | self.max_threads = min(32, (os.cpu_count() or 1) * 4) 42 | else: 43 | self.max_threads = int(max_threads) 44 | self.in_memory_transfer_only = False # Defaults to writing to the local file system 45 | self.target_region_name = None # Defaults to no upload 46 | self.target_bucket_name = None # Defaults to no upload 47 | self.remove_local_file = False # Defaults to keeping files locally 48 | 49 | try: 50 | # Fail on these in order. If REMOVE_LOCAL_FILE, or IN_MEMORY_TRANSFER_ONLY 51 | # fail, processing will still continue. 52 | if "Destination Data" in config: 53 | # If it's not present, we don't need it 54 | if config["Destination Data"]["TARGET_BUCKET"]: 55 | # The name of our target S3 bucket 56 | self.target_bucket_name = config["Destination Data"]["TARGET_BUCKET"] 57 | 58 | if config["Destination Data"]["TARGET_REGION"]: 59 | # The AWS region name our target S3 bucket resides in 60 | self.target_region_name = config["Destination Data"]["TARGET_REGION"] 61 | 62 | if config["Destination Data"]["REMOVE_LOCAL_FILE"]: 63 | # Should we remove local files after we upload them? 
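# Accepted affirmative values are "true" and "yes" (case-insensitive); any
# other value leaves downloaded files on disk.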
64 | remove = config["Destination Data"]["REMOVE_LOCAL_FILE"] 65 | self.remove_local_file = False 66 | if remove.lower() in "true,yes".split(","): # pylint: disable=R1703 67 | self.remove_local_file = True 68 | 69 | if config["Destination Data"]["IN_MEMORY_TRANSFER_ONLY"]: 70 | # Transfer to S3 without using the local file system? 71 | mem_trans = config["Destination Data"]["IN_MEMORY_TRANSFER_ONLY"] 72 | self.in_memory_transfer_only = False 73 | if mem_trans.lower() in "true,yes".split(","): # pylint: disable=R1703 74 | self.in_memory_transfer_only = True 75 | 76 | if config["Destination Data"]["DO_OCSF_CONVERSION"]: 77 | ocsf_setting = config["Destination Data"].get("DO_OCSF_CONVERSION", "no") 78 | self.do_ocsf = False 79 | if ocsf_setting.lower() in "true,yes".split(","): 80 | self.do_ocsf = True 81 | if config["Destination Data"]["TARGET_ACCOUNT_ID"]: 82 | # AWS Account ID 83 | self.target_account_id = config["Destination Data"]["TARGET_ACCOUNT_ID"] 84 | 85 | if self.do_ocsf: 86 | ocsf_max_file_size = int( 87 | config["Destination Data"].get("OCSF_MAX_FILE_SIZE", 256)) 88 | ocsf_ingest_latency = int(config["Destination Data"].get("OCSF_INGEST_LATENCY", 5)) 89 | ocsf_role_name = config["Destination Data"].get( 90 | "OCSF_ROLE_NAME", None) 91 | ocsf_role_external_id = config["Destination Data"].get("OCSF_ROLE_EXTERNAL_ID", 92 | "CrowdStrike OCSF Conversion" 93 | ) 94 | 95 | if ocsf_role_name is None: 96 | raise RuntimeError( 97 | "OCSF_ROLE_NAME must be set if DO_OCSF_CONVERSION is true") 98 | 99 | self.ocsf_role_name = ocsf_role_name 100 | self.ocsf_role_external_id = ocsf_role_external_id 101 | self.ocsf_max_file_size = max( 102 | min(ocsf_max_file_size, 256), 200) 103 | self.ocsf_ingest_latency = max(min(ocsf_ingest_latency, 60), 5) 104 | 105 | except KeyError: 106 | pass 107 | 108 | @property 109 | def exiting(self): 110 | """Returns the value of the exiting property""" 111 | return self.exiting 112 | 113 | @classmethod 114 | def set_exit(cls, val): 115 | """Sets the value of the exiting property""" 116 | cls.exiting = val 117 | return True 118 | -------------------------------------------------------------------------------- /ocsf/__init__.py: -------------------------------------------------------------------------------- 1 | """OCSF file conversion, upload.""" 2 | from .ocsf import transform_fdr_data_to_ocsf_data, upload_parquet_files_to_s3 3 | 4 | __all__ = ["transform_fdr_data_to_ocsf_data", "upload_parquet_files_to_s3"] 5 | -------------------------------------------------------------------------------- /ocsf/ocsf.py: -------------------------------------------------------------------------------- 1 | """Transforms FDR data to OCSF Format and writes in parquet file and uploads the file to AWS Security Lake""" 2 | import glob 3 | import gzip 4 | import json 5 | import os 6 | import re 7 | import threading 8 | from datetime import datetime 9 | from functools import reduce 10 | from logging import Logger 11 | from filelock import FileLock 12 | import pandas as pd 13 | import yaml 14 | 15 | NEWLINE = ord('\n') 16 | 17 | CUSTOM_SOURCES = { 18 | 1001: 'CrowdStrike_FILE_ACTIVITY', 19 | 1005: 'CrowdStrike_MODULE_ACTIVITY', 20 | 1007: 'CrowdStrike_PROCESS_ACTIVITY', 21 | 4001: 'CrowdStrike_NETWORK_ACTIVITY', 22 | 4003: 'CrowdStrike_DNS_ACTIVITY' 23 | } 24 | 25 | BYTES_IN_MB = 1000000 26 | 27 | WRITE_UPLOAD_THREAD_LOCK = threading.Lock() 28 | 29 | 30 | def upload_parquet_files_to_s3(fdr, s3_target, log_utl: Logger): 31 | """Uploads parquet files to s3""" 32 | if fdr.target_bucket_name: 
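# Hold the shared module-level lock so concurrent worker threads cannot write to
# and upload the same parquet files at the same time.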
33 | with WRITE_UPLOAD_THREAD_LOCK: 34 | for root, _, filenames in os.walk('ext'): 35 | for filename in filenames: 36 | upload_file_path = os.path.join(root, filename) 37 | timestamp_str = filename.split('_')[-1].split('.')[0] 38 | 39 | if not filename.endswith('parquet'): 40 | continue 41 | 42 | if not os.path.exists(upload_file_path): 43 | continue 44 | 45 | if os.path.getsize(upload_file_path) >= (BYTES_IN_MB * fdr.ocsf_max_file_size) or \ 46 | is_older_than_minutes(timestamp_str, fdr.ocsf_ingest_latency): 47 | lock = FileLock(upload_file_path + ".lock") 48 | with lock: 49 | with open(upload_file_path, 'rb') as parquet_data: 50 | log_utl.debug('@@@@uploaded_file@@@@=%s', upload_file_path) 51 | s3_target.upload_fileobj(parquet_data, fdr.target_bucket_name, upload_file_path) 52 | # Remove the file from the local file system 53 | os.remove(upload_file_path) 54 | 55 | 56 | def is_older_than_minutes(timestamp, minutes): 57 | """Checks if the timestamp is older than the number of minutes passed 58 | 59 | Arguments: 60 | timestamp {string} -- timestamp in string format 61 | minutes {int} -- number of minutes 62 | 63 | Returns: 64 | bool -- True if the timestamp is older than the number of minutes passed 65 | """ 66 | return (datetime.utcnow().timestamp() - float(timestamp)) > minutes * 60 67 | 68 | 69 | def write_to_parquet_file(fdr, ocsf_events, filename_class_uid_key, log_utl: Logger = None): 70 | """write the events to a parquet file""" 71 | split_path = filename_class_uid_key.rsplit(os.path.sep, 1) 72 | log_utl.debug('split_path=%s', split_path) 73 | folder_path = split_path[0] 74 | file_name = split_path[1] 75 | data = pd.DataFrame(ocsf_events) 76 | data.sort_index(axis=1, inplace=True) 77 | if 'exit_code' in data.columns: 78 | data['exit_code'] = data['exit_code'].astype('Int64') 79 | with WRITE_UPLOAD_THREAD_LOCK: 80 | file_list = os.listdir(folder_path) 81 | events_wrote_to_file = False 82 | if len(file_list) > 0: 83 | for file_path in file_list: 84 | parquet_file_name = os.path.join(folder_path, file_path) 85 | if file_path.endswith('parquet') and file_path.startswith(file_name + '_chunk_') and \ 86 | os.path.getsize(parquet_file_name) <= (BYTES_IN_MB * fdr.ocsf_max_file_size): 87 | lock = FileLock(parquet_file_name + ".lock") 88 | with lock: 89 | events_wrote_to_file = True 90 | log_utl.debug('!!!!!!!!!!Update to bucket=%s, record_len=%s, file_name=%s', 91 | filename_class_uid_key, 92 | len(ocsf_events), parquet_file_name) 93 | existing_data = pd.read_parquet(parquet_file_name) 94 | existing_data.sort_index(axis=1, inplace=True) 95 | concat_data = pd.concat([existing_data, data], axis=0) 96 | concat_data.to_parquet(parquet_file_name, compression='gzip', index=False) 97 | if not events_wrote_to_file: 98 | parquet_file_name = filename_class_uid_key + '_chunk_' + str( 99 | int(datetime.utcnow().timestamp())) + '.parquet' 100 | lock = FileLock(parquet_file_name + ".lock") 101 | with lock: 102 | log_utl.debug('#########Write to bucket=%s, record_len=%s, file_name=%s', filename_class_uid_key, 103 | len(ocsf_events), parquet_file_name) 104 | data.to_parquet(parquet_file_name, compression='gzip', index=False) 105 | 106 | 107 | def read_fdr_part(rdr): 108 | """reads the fdr file""" 109 | # to avoid reading the file into memory, we push each byte into a bytearray 110 | # and yield the completed json once we hit a newline 111 | tmp = bytearray() 112 | for char in rdr.read(): 113 | if char == NEWLINE: 114 | if tmp: 115 | try: 116 | yield json.loads(tmp.decode('utf-8')) 117 | except 
json.JSONDecodeError as e: 118 | print(f"Error decoding JSON: {e}") 119 | tmp.clear() 120 | else: 121 | tmp.append(char) 122 | 123 | 124 | def transform_fdr_data_to_ocsf_data(fdr, file, log_utl: Logger = None): 125 | """Transform FDR data into OSCF format data.""" 126 | total_events_in_file = 0 127 | mapping_dict_by_name = {} 128 | supporting_mapping_dict = {} 129 | for mapping_defn in glob.glob(os.path.join('ocsf', 'mappings', '*.yaml')): 130 | with open(mapping_defn, encoding='utf-8') as mapping_file: 131 | mapping_yamls_by_defn_file = yaml.safe_load_all(mapping_file) 132 | for mapping_yaml in mapping_yamls_by_defn_file: 133 | mapping_jsons = json.loads(json.dumps(mapping_yaml)) 134 | for mapping_json in mapping_jsons: 135 | if mapping_json['type'] == 'Telemetry': 136 | prepare_mapping_dict(mapping_json, mapping_dict_by_name) 137 | else: 138 | prepare_mapping_dict(mapping_json, supporting_mapping_dict) 139 | 140 | file_prefix = 'class_uid' 141 | ocsf_dicts = {} 142 | with gzip.open(file, 'rb') as chunk: 143 | for event in read_fdr_part(chunk): 144 | total_events_in_file += 1 145 | mapping_event_simplename = event.get('event_simpleName') 146 | if mapping_event_simplename in mapping_dict_by_name: 147 | class_uid_field = next( 148 | (field for field in mapping_dict_by_name[mapping_event_simplename].get('fields') if 149 | field['name'] == 'class_uid'), False) 150 | if class_uid_field: 151 | class_uid = class_uid_field['value'] 152 | if class_uid in CUSTOM_SOURCES: 153 | timestamp = int(int(event.get('timestamp')) / 1000) 154 | folder_path = os.path.join('ext', CUSTOM_SOURCES[class_uid_field['value']], 155 | 'region=' + fdr.target_region_name, 156 | 'accountId=' + fdr.target_account_id, 157 | 'eventDay=' + datetime.fromtimestamp(timestamp).strftime('%Y%m%d')) 158 | is_dir_exist = os.path.exists(folder_path) 159 | if not is_dir_exist: 160 | try: 161 | os.makedirs(folder_path) 162 | except FileExistsError: 163 | pass 164 | class_uid_path = os.path.join(folder_path, file_prefix + '_' + str( 165 | class_uid)) 166 | ocsf_class_uid_dicts = ocsf_dicts.setdefault(class_uid_path, []) 167 | ocsf_dict = {} 168 | ocsf_class_uid_dicts.append( 169 | transform_event_to_ocsf(event, ocsf_dict, mapping_dict_by_name[mapping_event_simplename], 170 | supporting_mapping_dict)) 171 | 172 | for filename_class_uid_key, values in ocsf_dicts.items(): 173 | event_count = 0 174 | ocsf_events = [] 175 | for event in values: 176 | ocsf_events.append(event) 177 | event_count += 1 178 | if event_count == 100000: 179 | write_to_parquet_file(fdr, ocsf_events, filename_class_uid_key, log_utl) 180 | ocsf_events = [] 181 | event_count = 0 182 | 183 | if len(ocsf_events) > 0: 184 | write_to_parquet_file(fdr, ocsf_events, filename_class_uid_key, log_utl) 185 | 186 | return total_events_in_file 187 | 188 | 189 | def prepare_mapping_dict(mapping_json: dict, out_dict: dict): 190 | """Dict containing the mapping definition for each name""" 191 | if isinstance(mapping_json.get('name'), list): 192 | for name in mapping_json.get('name'): 193 | out_dict[name] = mapping_json 194 | else: 195 | out_dict[mapping_json.get('name')] = mapping_json 196 | 197 | 198 | def transform_event_to_ocsf(event: dict, ocsf_dict: dict, mapping_dict: dict, mapping_supporting_dict: dict): 199 | """Transforms event to ocsf format""" 200 | for mapping in mapping_dict.get('mappings'): 201 | if not event.get(mapping.get('ours')) and mapping.get('default') is not None: 202 | event[mapping.get('ours')] = mapping.get('default') 203 | map_field(event, ocsf_dict, 
mapping, mapping_supporting_dict) 204 | for field in mapping_dict.get('fields'): 205 | add_default_field(ocsf_dict, field) 206 | 207 | return dot_notation_to_json(ocsf_dict) 208 | 209 | 210 | # Transform Functions start # 211 | def extract_filename(value): 212 | """extracts filename from the value""" 213 | basename = re.search(r'[^\\/]+(?=[\\/]?$)', value) 214 | if basename: 215 | return basename.group(0) 216 | return value 217 | 218 | 219 | def as_number(value): 220 | """converts to int""" 221 | if value is None: 222 | return 0 223 | if '.' in value: 224 | return int(value.split('.')[0]) 225 | return int(value) 226 | 227 | 228 | def as_string(value): 229 | """converts to string""" 230 | if value is None: 231 | return '' 232 | return str(value) 233 | 234 | 235 | def map_ours_theirs(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 236 | # pylint: disable=unused-argument 237 | """transform function map_ours_theirs""" 238 | dst[mapping.get('theirs')] = src.get(mapping.get('ours')) 239 | 240 | 241 | def map_ours_theirs_using_fn(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 242 | """transform function map_ours_theirs_using_fn""" 243 | supporting_enum = mapping_supporting_dict.get(mapping.get('using')) 244 | for value in supporting_enum.get('values'): 245 | if value.get('ours') == src.get(mapping.get('ours')): 246 | dst[mapping.get('theirs')] = value.get('theirs') 247 | 248 | 249 | def map_ours_theirs_transform_fn(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 250 | # pylint: disable=unused-argument 251 | """transform function map_ours_theirs_transform_fn""" 252 | transform_fn = ALL_TRANSFORMS.get(mapping.get('transform')) 253 | dst[mapping.get('theirs')] = transform_fn(src.get(mapping.get('ours'))) 254 | 255 | 256 | def map_items_theirs(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 257 | # pylint: disable=unused-argument 258 | """transform function map_items_theirs""" 259 | values = [] 260 | for _, item in enumerate(mapping.get('items')): 261 | value = {} 262 | for item_mapping in item.get('mappings'): 263 | if src.get(item_mapping.get('ours')) is not None: 264 | value[item_mapping.get('theirs')] = src.get(item_mapping.get('ours')) 265 | for field in item.get('fields'): 266 | if src.get(item_mapping.get('ours')): 267 | value[field.get('name')] = field.get('value') 268 | values.append(value) 269 | 270 | dst[mapping.get('theirs')] = values 271 | 272 | 273 | def map_ours_theirs_list(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 274 | # pylint: disable=unused-argument 275 | """transform function map_ours_theirs_list""" 276 | for their in mapping.get('theirs'): 277 | if src.get(mapping.get('ours')) is not None: 278 | dst[their] = src.get(mapping.get('ours')) 279 | 280 | 281 | def map_ours_theirs_list_using_fn(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 282 | # pylint: disable=unused-argument 283 | """transform function map_ours_theirs_list_using_fn""" 284 | supporting_enum = mapping_supporting_dict.get(mapping.get('using')) 285 | for their in mapping.get('theirs'): 286 | if src.get(mapping.get('ours')) is not None: 287 | for value in supporting_enum.get('values'): 288 | if value.get('ours') == src.get(mapping.get('ours')): 289 | dst[their] = value.get(mapping.get('theirs')) 290 | 291 | 292 | def map_ours_theirs_list_transform_fn(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 293 | # pylint: disable=unused-argument 294 | """transform function 
map_ours_theirs_list_transform_fn""" 295 | transform_fn = ALL_TRANSFORMS.get(mapping.get('transform')) 296 | for their in mapping.get('theirs'): 297 | if src.get(mapping.get('ours')) is not None: 298 | dst[their] = transform_fn(src.get(mapping.get('ours'))) 299 | 300 | 301 | # Transform Functions End# 302 | def apply_transform(src: dict, mapping: dict): 303 | """determines the transform function to be applied""" 304 | ours = mapping.get('ours') 305 | theirs = mapping.get('theirs') 306 | optional_using = mapping.get('using') 307 | optional_translate = mapping.get('transform') 308 | optional_items = mapping.get('items') 309 | return_func = '' 310 | if ours and not isinstance(ours, list): 311 | if theirs and not isinstance(theirs, list): 312 | if src.get(ours) is not None and not optional_translate and not optional_using and not optional_items: 313 | return_func = 'map_ours_theirs' 314 | elif src.get(ours) is not None and not optional_translate and optional_using and not optional_items: 315 | return_func = 'map_ours_theirs_using_fn' 316 | elif src.get(ours) is not None and optional_translate and not optional_using and not optional_items: 317 | return_func = 'map_ours_theirs_transform_fn' 318 | if theirs and isinstance(theirs, list): 319 | if not optional_translate and not optional_using and not optional_items: 320 | return_func = 'map_ours_theirs_list' 321 | elif not optional_translate and optional_using and not optional_items: 322 | return_func = 'map_ours_theirs_list_using_fn' 323 | elif optional_translate and not optional_using and not optional_items: 324 | return_func = 'map_ours_theirs_list_transform_fn' 325 | elif not ours and optional_items and isinstance(optional_items, list): 326 | if theirs and not isinstance(theirs, list) and not optional_translate and not optional_using: 327 | return_func = 'map_items_theirs' 328 | 329 | return return_func 330 | 331 | 332 | def map_field(src: dict, dst: dict, mapping: dict, mapping_supporting_dict: dict): 333 | """maps the FDR field to OCSF field""" 334 | map_fn = ALL_TRANSFORMS.get(apply_transform(src, mapping)) 335 | if map_fn: 336 | map_fn(src, dst, mapping, mapping_supporting_dict) 337 | 338 | 339 | def dot_notation_to_json(ocsf_dict): 340 | """converts the dot notations in the json to nested json""" 341 | output = {} 342 | for key, value in ocsf_dict.items(): 343 | path = key.split('.') 344 | target = reduce(lambda d, k: d.setdefault(k, {}), path[:-1], output) 345 | target[path[-1]] = value 346 | return output 347 | 348 | 349 | def add_default_field(dest: dict, field: dict): 350 | """adds the default field in the dict""" 351 | name = field.get('name') 352 | value = field.get('value') 353 | if isinstance(value, list) and len(value) == 1 and value[0] is None: 354 | dest[name] = [] 355 | else: 356 | dest[name] = value 357 | 358 | 359 | ALL_TRANSFORMS = { 360 | 'extract_filename': extract_filename, 361 | 'as_number': as_number, 362 | 'as_string': as_string, 363 | 'map_ours_theirs': map_ours_theirs, 364 | 'map_ours_theirs_using_fn': map_ours_theirs_using_fn, 365 | 'map_ours_theirs_transform_fn': map_ours_theirs_transform_fn, 366 | 'map_items_theirs': map_items_theirs, 367 | 'map_ours_theirs_list': map_ours_theirs_list, 368 | 'map_ours_theirs_list_using_fn': map_ours_theirs_list_using_fn, 369 | 'map_ours_theirs_list_transform_fn': map_ours_theirs_list_transform_fn 370 | } 371 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with python 3.9 3 | # To update, run: 4 | # 5 | # pip-compile requirements.txt 6 | # 7 | boto3>=1.24.70 8 | # via -r requirements.txt 9 | fastparquet>=0.8.3 10 | # via -r requirements.txt 11 | filelock>=3.8.0 12 | # via -r requirements.txt 13 | json2parquet>=2.0.0 14 | # via -r requirements.txt 15 | pyyaml>=6.0 16 | # via -r requirements.txt 17 | numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability 18 | aws-assume-role-lib>=2.10.0 19 | pyarrow>=14.0.1 # not directly required, pinned by Snyk to avoid a vulnerability 20 | -------------------------------------------------------------------------------- /standalone/falcon_data_replicator.py: -------------------------------------------------------------------------------- 1 | """Falcon Data Replicator - Local File System / AWS S3 connector""" 2 | 3 | # _____ _ ____ _ ____ _ _ _ 4 | # | ___|_ _| | ___ ___ _ __ | _ \ __ _| |_ __ _ | _ \ ___ _ __ | (_) ___ __ _| |_ ___ _ __ 5 | # | |_ / _` | |/ __/ _ \| '_ \ | | | |/ _` | __/ _` | | |_) / _ \ '_ \| | |/ __/ _` | __/ _ \| '__| 6 | # | _| (_| | | (_| (_) | | | | | |_| | (_| | || (_| | | _ < __/ |_) | | | (_| (_| | || (_) | | 7 | # |_| \__,_|_|\___\___/|_| |_| |____/ \__,_|\__\__,_| |_| \_\___| .__/|_|_|\___\__,_|\__\___/|_| 8 | # |_| 9 | # Local File System / AWS S3 connector 10 | # 11 | ################################################################################################### 12 | # NOTE: See https://github.com/CrowdStrike/FDR for details on how to use this application. # 13 | ################################################################################################### 14 | # 15 | import json 16 | import io 17 | import os 18 | import sys 19 | import time 20 | import pathlib 21 | import signal as sig 22 | import configparser 23 | import argparse 24 | import logging 25 | from logging.handlers import RotatingFileHandler 26 | from functools import partial 27 | 28 | # This solution is dependant upon the AWS boto3 Python library 29 | try: 30 | import boto3 31 | except ImportError as err: 32 | print(err) 33 | print( 34 | 'The AWS boto3 library is required to run Falcon Data Replicator.\nPlease execute "pip3 install boto3"' 35 | ) 36 | sys.exit(1) 37 | 38 | 39 | # Class to hold our connector config and to track our running status 40 | class FDRConnector: # pylint: disable=R0902 41 | """The FDRConnector class contains the details of this connection and tracks the status of our process.""" 42 | 43 | def __init__(self, config: configparser.ConfigParser): 44 | """Initialize our status class""" 45 | self.set_exit(False) 46 | # We cannot read our source parameters, exit the routine 47 | if "Source Data" not in config: 48 | print("Unable to load configuration file parameters. 
Routine halted.") 49 | sys.exit(1) 50 | 51 | # AWS Client ID - Provided by CrowdStrike 52 | self.aws_key = config["Source Data"]["AWS_KEY"] 53 | # AWS Client Secret - Provided by CrowdStrike 54 | self.aws_secret = config["Source Data"]["AWS_SECRET"] 55 | # AWS SQS queue URL - Provided by CrowdStrike 56 | self.queue_url = config["Source Data"]["QUEUE_URL"] 57 | # Local file output location 58 | self.output_path = os.path.realpath(config["Source Data"]["OUTPUT_PATH"]) 59 | # Timeout before messages are returned to the queue 60 | self.visibility_timeout = int(config["Source Data"]["VISIBILITY_TIMEOUT"]) 61 | # Message delay 62 | self.message_delay = int(config["Source Data"]["MESSAGE_DELAY"]) 63 | # Queue delay 64 | self.queue_delay = int(config["Source Data"]["QUEUE_DELAY"]) 65 | # Log File 66 | self.log_file = config["Source Data"]["LOG_FILE"] 67 | # AWS Region name for our source S3 bucket 68 | self.region_name = config["Source Data"]["REGION_NAME"] 69 | self.in_memory_transfer_only = ( 70 | False # Defaults to writing to the local file system 71 | ) 72 | self.target_region_name = None # Defaults to no upload 73 | self.target_bucket_name = None # Defaults to no upload 74 | self.remove_local_file = False # Defaults to keeping files locally 75 | try: 76 | # Fail on these in order. If REMOVE_LOCAL_FILE, or IN_MEMORY_TRANSFER_ONLY 77 | # fail, processing will still continue. 78 | if "Destination Data" in config: 79 | # If it's not present, we don't need it 80 | if config["Destination Data"]["TARGET_BUCKET"]: 81 | # The name of our target S3 bucket 82 | self.target_bucket_name = config["Destination Data"][ 83 | "TARGET_BUCKET" 84 | ] 85 | 86 | if config["Destination Data"]["TARGET_REGION"]: 87 | # The AWS region name our target S3 bucket resides in 88 | self.target_region_name = config["Destination Data"][ 89 | "TARGET_REGION" 90 | ] 91 | 92 | if config["Destination Data"]["REMOVE_LOCAL_FILE"]: 93 | # Should we remove local files after we upload them? 94 | remove = config["Destination Data"]["REMOVE_LOCAL_FILE"] 95 | if remove.lower() in "true,yes".split(","): # pylint: disable=R1703 96 | self.remove_local_file = True 97 | else: 98 | self.remove_local_file = False 99 | 100 | if config["Destination Data"]["IN_MEMORY_TRANSFER_ONLY"]: 101 | # Transfer to S3 without using the local file system? 102 | mem_trans = config["Destination Data"]["IN_MEMORY_TRANSFER_ONLY"] 103 | if mem_trans.lower() in "true,yes".split( 104 | "," 105 | ): # pylint: disable=R1703 106 | self.in_memory_transfer_only = True 107 | else: 108 | self.in_memory_transfer_only = False 109 | 110 | except KeyError: 111 | pass 112 | 113 | @property 114 | def exiting(self): 115 | """Returns the value of the exiting property""" 116 | return self.exiting 117 | 118 | @classmethod 119 | def set_exit(cls, val): 120 | """Sets the value of the exiting property""" 121 | cls.exiting = val 122 | return True 123 | 124 | 125 | # This method is used as an exit handler. When a quit, cancel or interrupt is received, 126 | # this method forces FDR to finish processing the file it is working on before exiting. 
127 | def clean_exit(stat, signal, frame): # pylint: disable=W0613 128 | """Graceful exit handler for SIGINT, SIGQUIT and SIGTERM""" 129 | stat.set_exit(True) 130 | return True 131 | 132 | 133 | def handle_file(path, key, file_object=None): 134 | """If configured, upload this file to our target bucket and remove it.""" 135 | # If we've defined a target bucket 136 | if FDR.target_bucket_name: 137 | if not file_object: 138 | # Open our local file (binary) 139 | with open(path, "rb") as data: 140 | # Perform the upload to the same key in our target bucket 141 | s3_target.upload_fileobj(data, FDR.target_bucket_name, key) 142 | logger.info("Uploaded file to path %s", key) 143 | # Only perform this step if configured to do so 144 | if FDR.remove_local_file: 145 | # Remove the file from the local file system 146 | os.remove(path) 147 | logger.info("Removed %s", path) 148 | # Remove the temporary folder from the local file system 149 | os.rmdir(os.path.dirname(path)) 150 | logger.info("Removed %s", os.path.dirname(path)) 151 | pure = pathlib.PurePath(path) 152 | # Remove the parent temporary folders if they exist 153 | os.rmdir(pure.parent.parent) 154 | logger.info("Removed %s", pure.parent.parent) 155 | if FDR.output_path not in pure.parent.parent.parent.name: 156 | os.rmdir(pure.parent.parent.parent) 157 | logger.info("Removed %s", pure.parent.parent.parent) 158 | else: 159 | s3_target.upload_fileobj(file_object, FDR.target_bucket_name, key) 160 | logger.info("Uploaded file to path %s", key) 161 | # We're done 162 | return True 163 | 164 | 165 | def download_message_files(msg): 166 | """Downloads the files from S3 referenced in msg and places them in output_path. 167 | 168 | download_message_files will iterate through every file listed at msg['files'], 169 | move it to our output_path, and then call handle_file.
170 | """ 171 | # For every file in our message 172 | for s3_file in msg["files"]: 173 | # Retrieve the bucket path for this file 174 | s3_path = s3_file["path"] 175 | if not FDR.in_memory_transfer_only: 176 | # Create a local path name for our destination file based off of the S3 path 177 | # Construct output path for this message's files 178 | msg_output_path = os.path.realpath( 179 | os.path.join(FDR.output_path, msg["pathPrefix"]) 180 | ) 181 | # Only write files to the specified output_path 182 | if ( 183 | os.path.commonpath([FDR.output_path, msg_output_path]) 184 | != FDR.output_path 185 | ): 186 | logger.info( 187 | f"Skipping {msg_output_path} to prevent writes outside of output path: {FDR.output_path}" 188 | ) 189 | continue 190 | # Ensure directory exists at output path 191 | if not os.path.exists(msg_output_path): 192 | # Create it if it doesn't 193 | os.makedirs(msg_output_path) 194 | local_path = os.path.realpath(os.path.join(FDR.output_path, s3_path)) 195 | # Only write files to the specified output_path 196 | if os.path.commonpath([FDR.output_path, local_path]) != FDR.output_path: 197 | logger.info( 198 | f"Skipping {local_path} to prevent writes outside of output path: {FDR.output_path}" 199 | ) 200 | continue 201 | if not os.path.exists(os.path.dirname(local_path)): 202 | # Handle fdr platform and time partitioned folders 203 | os.makedirs(os.path.dirname(local_path)) 204 | # Open our local file for binary write 205 | with open(local_path, "wb") as data: 206 | # Download the file from S3 into our opened local file 207 | s3.download_fileobj(msg["bucket"], s3_path, data) 208 | logger.info("Downloaded file to path %s", local_path) 209 | # Handle S3 upload if configured 210 | handle_file(local_path, s3_path, None) 211 | else: 212 | logger.info("Downloading file to memory") 213 | s3t = boto3.resource( 214 | "s3", 215 | region_name=FDR.region_name, 216 | aws_access_key_id=FDR.aws_key, 217 | aws_secret_access_key=FDR.aws_secret, 218 | ) 219 | bkt = s3t.Bucket(msg["bucket"]) 220 | obj = bkt.Object(s3_path) 221 | stream = io.BytesIO() 222 | obj.download_fileobj(stream) 223 | # Seek to the beginning of the stream before passing it to the upload handler 224 | stream.seek(0) 225 | handle_file(None, s3_path, stream) 226 | 227 | 228 | def consume_data_replicator(): 229 | """Consume from data replicator and track number of messages/files/bytes downloaded.""" 230 | # Tracking details 231 | msg_cnt = 0 232 | file_cnt = 0 233 | byte_cnt = 0 234 | 235 | # Continuously poll the queue for new messages. 
236 | while not FDR.exiting: 237 | received = False 238 | # Receive messages from queue if any exist 239 | # (NOTE: receive_messages() only receives a few messages at a time, it does NOT exhaust the queue) 240 | for msg in queue.receive_messages(VisibilityTimeout=FDR.visibility_timeout): 241 | received = True 242 | # Increment our message counter 243 | msg_cnt += 1 244 | logger.info("Processing message %i [%s]", msg_cnt, msg.message_id) 245 | # Grab the actual message body 246 | body = json.loads(msg.body) 247 | # Download the file to our local file system and potentially upload it to S3 248 | download_message_files(body) 249 | # Increment our file count by using the fileCount value in our message 250 | file_cnt += body["fileCount"] 251 | # Increment our byte count by using the totalSize value in our message 252 | byte_cnt += body["totalSize"] 253 | logger.info("Removing message %i [%s] from queue", msg_cnt, msg.message_id) 254 | # Remove our message from the queue, if this is not performed in visibility_timeout seconds 255 | # this message will be restored to the queue for follow-up processing 256 | msg.delete() 257 | # Sleep until our next message iteration 258 | time.sleep(FDR.message_delay) 259 | 260 | logger.info( 261 | "Messages consumed: %i\tFile count: %i\tByte count: %i", 262 | msg_cnt, 263 | file_cnt, 264 | byte_cnt, 265 | ) 266 | if not received: 267 | logger.info( 268 | "No messages received, sleeping for %i seconds", FDR.queue_delay 269 | ) 270 | time.sleep(FDR.queue_delay) 271 | 272 | # We've requested an exit 273 | if FDR.exiting: 274 | # Clean exit 275 | logger.warning("Routine exit requested") 276 | sys.exit(0) 277 | else: 278 | # Something untoward has occurred 279 | logger.error("Unexpected exit occurred") 280 | sys.exit(1) 281 | 282 | 283 | # Start our main routine 284 | if __name__ == "__main__": 285 | # Configure our accepted command line parameters 286 | parser = argparse.ArgumentParser("Falcon Data Replicator") 287 | parser.add_argument( 288 | "-f", 289 | "--config_file", 290 | dest="config_file", 291 | help="Path to the configuration file", 292 | required=False, 293 | ) 294 | # Parse any parameters passed at runtime 295 | args = parser.parse_args() 296 | # If we were not provided a configuration file name 297 | if not args.config_file: 298 | # Use the default name / location provided in our repo 299 | CONFIG_FILE = "../falcon_data_replicator.ini" 300 | else: 301 | # Use the configuration file provided at runtime 302 | CONFIG_FILE = args.config_file 303 | # Read in our configuration parameters 304 | configuration = configparser.ConfigParser() 305 | configuration.read(CONFIG_FILE) 306 | # Create our connector 307 | FDR = FDRConnector(configuration) 308 | # Setup our root logger 309 | logging.basicConfig( 310 | level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s" 311 | ) 312 | # Create our FDR logger 313 | logger = logging.getLogger("FDR Connector") 314 | # Rotate log file handler 315 | RFH = RotatingFileHandler(FDR.log_file, maxBytes=20971520, backupCount=5) 316 | # Log file output format 317 | F_FORMAT = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s") 318 | # Set the log file output level to INFO 319 | RFH.setLevel(logging.INFO) 320 | # Add our log file formatter to the log file handler 321 | RFH.setFormatter(F_FORMAT) 322 | # Add our log file handler to our logger 323 | logger.addHandler(RFH) 324 | # Log our pre-startup event 325 | logger.info(" _____ ____ ____ _") 326 | logger.info("| ___| _ \\| _ \\ (.\\") 327 | 
logger.info("| |_ | | | | |_) | |/(\\") 328 | logger.info("| _| | |_| | _ < \\(\\\\") 329 | logger.info('|_| |____/|_| \\_\\ "^"`\\') 330 | logger.info("Process starting up") 331 | # Enable our graceful exit handler to allow uploads and artifact 332 | # cleanup to complete for SIGINT, SIGTERM and SIGQUIT signals. 333 | sig.signal(sig.SIGINT, partial(clean_exit, FDR)) 334 | sig.signal(sig.SIGTERM, partial(clean_exit, FDR)) 335 | sig.signal(sig.SIGQUIT, partial(clean_exit, FDR)) 336 | # Connect to our CrowdStrike provided SQS queue 337 | sqs = boto3.resource( 338 | "sqs", 339 | region_name=FDR.region_name, 340 | aws_access_key_id=FDR.aws_key, 341 | aws_secret_access_key=FDR.aws_secret, 342 | ) 343 | # Connect to our CrowdStrike provided S3 bucket 344 | s3 = boto3.client( 345 | "s3", 346 | region_name=FDR.region_name, 347 | aws_access_key_id=FDR.aws_key, 348 | aws_secret_access_key=FDR.aws_secret, 349 | ) 350 | # If we are doing S3 uploads 351 | if FDR.target_bucket_name and FDR.target_region_name: 352 | logger.info("Upload to AWS S3 enabled") 353 | # Connect to our target S3 bucket, uses the existing client configuration to connect (Not the CS provided ones) 354 | s3_target = boto3.client("s3", region_name=FDR.target_region_name) 355 | # Create our queue object for handling message traffic 356 | queue = sqs.Queue(url=FDR.queue_url) 357 | logger.info("Startup complete") 358 | # Start consuming the replicator feed 359 | consume_data_replicator() 360 | 361 | 362 | # . 363 | # Your data | _____________________________________________________ ___ 364 | # is here! | | _____ ________ _ __ | __ 365 | # \ _______| | / ___/______ _ _____/ / __/ /_____(_) /_____ | ___ 366 | # / _____ | | / /__/ __/ _ \ |/|/ / _ /\ \/ __/ __/ / '_/ -_) | 367 | # / /(__) || | \___/_/ \___/__,__/\_,_/___/\__/_/ /_/_/\_\\__/ | ___ 368 | # ________/ / |OO| || | | 369 | # | Hemi |-------|| | --= FALCON DATA REPLICATOR >> | ___ 370 | # (| | -.|| |_______________________ | ____ 371 | # | ____ \ ||_________||____________ | ____ ____ | 372 | # /| / __ \ |______|| / __ \ / __ \ | | / __ \ / __ \ |\ ___ 373 | # \|| / \ |_______________| / \ |_| / \ |__| |___________| / \ |__| / \|_|/ 374 | # | () | | () | | () | | () | | () | ____ 375 | # \__/ \__/ \__/ \__/ \__/ 376 | --------------------------------------------------------------------------------