├── defplorex ├── __init__.py ├── backend │ ├── __init__.py │ └── elastic.py ├── config │ ├── 11-prod-settings.json.DISABLED │ ├── __init__.py │ └── 00-base-settings.json ├── transformer │ ├── base.py │ ├── tag.py │ └── __init__.py ├── celeryapp.py ├── celeryconfig.py ├── loggers.py ├── utils.py ├── tasks.py └── console.py ├── i ├── dpx-binning.png ├── dpx-celery.png ├── dpx-overall.png ├── dpx-extraction.png ├── dpx-features.png ├── dpx-binning-viz.png ├── dpx-clusters-viz.png ├── dpx-features-viz.png └── dpx-binned-records-viz.png ├── deploy └── requirements.pip ├── LICENSE ├── setup.py └── README.md /defplorex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /defplorex/backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /i/dpx-binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binning.png -------------------------------------------------------------------------------- /i/dpx-celery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-celery.png -------------------------------------------------------------------------------- /i/dpx-overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-overall.png -------------------------------------------------------------------------------- /i/dpx-extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-extraction.png 
-------------------------------------------------------------------------------- /i/dpx-features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-features.png -------------------------------------------------------------------------------- /i/dpx-binning-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binning-viz.png -------------------------------------------------------------------------------- /i/dpx-clusters-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-clusters-viz.png -------------------------------------------------------------------------------- /i/dpx-features-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-features-viz.png -------------------------------------------------------------------------------- /i/dpx-binned-records-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binned-records-viz.png -------------------------------------------------------------------------------- /deploy/requirements.pip: -------------------------------------------------------------------------------- 1 | amqp 2 | appnope==0.1.0 3 | argcomplete==1.4.1 4 | argh==0.26.2 5 | arrow==0.8.0 6 | Babel==2.3.4 7 | billiard==3.5.0.2 8 | blinker==1.4 9 | celery 10 | cffi==1.8.3 11 | click==6.6 12 | colorlog==2.7.0 13 | ConfigArgParse==0.10.0 14 | construct==2.5.3 15 | cycler==0.10.0 16 | elasticsearch 17 | elasticsearch-dsl 18 | humanize==0.5.1 19 | ipython-genutils==0.1.0 20 | itsdangerous==0.24 21 | kombu 22 | progress==1.2 23 | prompt-toolkit==1.0.7 
24 | python-dateutil 25 | pytz==2016.6.1 26 | shove==0.6.6 27 | simplejson==3.8.2 28 | six==1.10.0 29 | stuf==0.9.16 30 | tabulate==0.7.5 31 | traitlets==4.3.0 32 | unicodecsv 33 | urllib3==1.19.1 34 | urwid==1.3.1 35 | wcwidth==0.1.7 36 | configparser==3.5.0 37 | httplib2==0.9.2 38 | mccabe==0.5.2 39 | pyrabbit==1.1.0 40 | python-dotenv==0.6.2 41 | anyconfig==0.7.0 42 | python-logstash==0.4.6 43 | tzlocal==1.3 44 | -------------------------------------------------------------------------------- /defplorex/config/11-prod-settings.json.DISABLED: -------------------------------------------------------------------------------- 1 | { 2 | "LOGGING": { 3 | "version": 1, 4 | "disable_existing_loggers": true, 5 | "formatters": { 6 | "logstash": { 7 | "()": "loggers.Formatter", 8 | "format": "1 %(asctime)s {host} %(processName)s %(process)d {project} [pythonData filename=\"%(filename)s\" thread=\"%(threadName)s\" line=\"%(lineno)d\" module=\"%(module)s\"][esData index_keyword=\"{project}\"] %(levelname)s %(message)s" 9 | } 10 | }, 11 | "handlers": { 12 | "logstash": { 13 | "()": "logging.handlers.SysLogHandler", 14 | "formatter": "logstash", 15 | "level": "WARN", 16 | "address": ["__LOGSTASH_SERVER__", 5514] 17 | } 18 | }, 19 | "loggers": { 20 | "defplorex": { 21 | "handlers": [ 22 | "logstash" 23 | ], 24 | "level": "WARN", 25 | "propagate": false 26 | } 27 | }, 28 | "root": { 29 | "handlers": [ 30 | "logstash" 31 | ], 32 | "level": "WARN" 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Trend Micro Incorporated 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 17 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 18 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 19 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 20 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 21 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 22 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 23 | POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are 26 | those of the authors and should not be interpreted as representing official 27 | policies, either expressed or implied, of the FreeBSD Project. 28 | -------------------------------------------------------------------------------- /defplorex/transformer/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. 
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging

log = logging.getLogger(__name__)


class Transformer(object):
    """Base class for document transformers.

    Subclasses set ``_name`` and override ``__call__`` to return a dict
    of field updates for the document being processed; the base
    implementation is the identity transform.
    """

    def __call__(self, doc, *args, **kwargs):
        """Return the untouched original document from the kwargs."""
        log.info('Calling %s', self._name)
        original = kwargs.get('original_doc', {})
        return original
import os
import anyconfig


# Glob matching every bundled settings file next to this module.
DEFAULT_CONFIG_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '*settings.json'))


def load_settings(config_path=DEFAULT_CONFIG_PATH):
    """Load the settings files matched by *config_path* via anyconfig.

    NOTE(review): anyconfig merges all glob matches into one dict;
    later files presumably override earlier ones — confirm against the
    numbered file-name convention (00-base, 11-prod, ...).
    """
    settings = anyconfig.load(config_path)

    # Coerce bulk_size to int, defaulting to 100 when the key is absent.
    settings['bulk_size'] = int(settings.get('bulk_size', 100))

    # Any non-empty DEBUG environment variable switches debug mode on.
    if os.environ.get('DEBUG'):
        settings['DEBUG'] = True

    return settings
import logging

from celery import Celery
from celery.signals import setup_logging

from defplorex.celeryconfig import broker_url, result_backend, timezone

app = Celery()
app.config_from_object('defplorex.celeryconfig')

log = logging.getLogger(__name__)


@setup_logging.connect
def _setup_logging(loglevel=logging.WARN, **kwargs):
    """Install the project's logging configuration for Celery workers.

    Connected to Celery's ``setup_logging`` signal so Celery does not
    hijack the root logger with its own handlers.
    """
    # BUG FIX: use the package-absolute import (matching the
    # ``defplorex.celeryconfig`` import above); the bare ``loggers``
    # module only resolves when the CWD happens to be the package dir.
    from defplorex.loggers import config_logger
    config_logger(level=loglevel)

    log.debug(
        'Configuring Celery on broker = %s,'
        ' result_backend = %s, timezone = %s',
        broker_url, result_backend, timezone)


app.log.setup()
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
from setuptools import setup, find_packages

# Read install requirements, skipping blanks and comment lines; the
# context manager closes the file handle (the original leaked it and
# passed empty strings to install_requires).
with open('deploy/requirements.pip', 'r') as req_file:
    DEPENDENCIES = [
        line.strip()
        for line in req_file
        if line.strip() and not line.lstrip().startswith('#')
    ]

setup(
    name='defplorex',
    version='0.1',
    description='ES Framework for Large-Scale Processing',
    author='FTR Team at Trend Micro',
    url='https://github.com/trendmicro/defplorex',
    packages=find_packages(),
    entry_points='''
    [console_scripts]
    dpx=defplorex.console:cli
    ''',
    install_requires=DEPENDENCIES,
    classifiers=[
        'Intended Audience :: Developers',
        # Matches the BSD 2-Clause text shipped in LICENSE (the
        # original classifier mistakenly declared MIT).
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
import logging

from defplorex.transformer.base import Transformer

log = logging.getLogger(__name__)


class TagTransformer(Transformer):
    """
    Example transformer that appends a tag to a record's ``tags`` list.
    """
    _name = 'tag'

    def __call__(self, doc, *args, **kwargs):
        """Return a ``{'tags': [...]}`` update with ``kwargs['tag']`` added.

        Duplicate tags are removed; ordering of the resulting list is
        therefore not guaranteed (it round-trips through a set).
        """
        doc = super(TagTransformer, self).__call__(
            doc, *args, **kwargs)

        tag = kwargs.get('tag')

        if not tag:
            log.debug('No tags supplied, skipping')
            # BUG FIX: return an empty dict (was ``[]``).  The pipeline
            # splats the result via ``dict.update(**result)``, which
            # raises TypeError when handed a list.
            return {}

        tags = doc.get('tags', [])

        if tags:
            log.debug('Found tags: %s', tags)

        tags.append(tag)
        tags = list(set(tags))

        log.debug('Updated tags: %s', tags)

        return dict(tags=tags)
| }, 47 | "colorlog": { 48 | "()": "colorlog.StreamHandler", 49 | "formatter": "colorlog", 50 | "level": "DEBUG" 51 | }, 52 | "logstash": { 53 | "()": "logging.handlers.SysLogHandler", 54 | "formatter": "logstash", 55 | "level": "DEBUG", 56 | "address": ["__LOGSTASH_SERVER__", 5514] 57 | } 58 | }, 59 | "loggers": { 60 | "defplorex": { 61 | "handlers": [ 62 | "colorlog" 63 | ], 64 | "level": "DEBUG", 65 | "propagate": false 66 | } 67 | }, 68 | "root": { 69 | "handlers": [ 70 | "colorlog" 71 | ], 72 | "level": "DEBUG" 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /defplorex/celeryconfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
# NOTE this file is provided as a mere example: the end user can change it as
# needed


import os

from kombu import Queue as Q, Exchange as E
from dotenv import load_dotenv, find_dotenv


# Pick up environment overrides from a './.env' file when one exists.
load_dotenv(find_dotenv())

imports = ('tasks',)

timezone = os.environ.get('celery_timezone', 'UTC')
broker_url = os.environ.get('celery_broker_url', 'redis://')
result_backend = os.environ.get('celery_result_backend', 'redis://')

# Only enable UTC mode when the configured timezone actually is UTC.
enable_utc = timezone == 'UTC'

result_persistent = False

worker_hijack_root_logger = False
worker_pool_restarts = True
task_default_queue = 'celery'

worker_send_task_events = True

queues = [
    'processor_task']

# One direct, non-durable queue per entry, routed by task name.
task_queues = [Q(name, E(name, 'direct'), name, durable=False)
               for name in queues]
task_routes = dict(('tasks.{}'.format(name), name) for name in queues)
Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging

from defplorex.transformer.tag import TagTransformer

log = logging.getLogger(__name__)

__all__ = [
    'TagTransformer'
]

classes = [
    TagTransformer
]


class TransformerFactory(object):
    """Registry mapping transformer ``_name`` strings to their classes."""

    registry = {t._name: t for t in classes}

    @classmethod
    def get_by_name(cls, name):
        """Return the class registered under *name*, or None if unknown."""
        return cls.registry.get(name)

    @classmethod
    def get_by_list(cls, name_lst):
        """Resolve a list of names to classes, preserving order."""
        return [cls.get_by_name(n) for n in name_lst]

    @classmethod
    def get_names(cls):
        """Return the registered transformer names."""
        return cls.registry.keys()

    @classmethod
    def get_classes(cls):
        """Return (name, class) pairs for all registered transformers."""
        return cls.registry.items()


class Pipeline(object):
    """Run a sequence of transformers over a single document."""

    @staticmethod
    def chain(doc, transformers, updates_only=True, *args, **kwargs):
        """Apply *transformers* in order, accumulating their updates.

        Each transformer receives a copy of the updates gathered so far
        plus the original document via ``kwargs['original_doc']``, and
        returns a dict of new field values merged into the accumulator.
        With *updates_only* the accumulated updates are returned alone;
        otherwise they are merged back into the document.
        """
        doc = doc.copy()

        # Unwrap raw Elasticsearch hits transparently.
        if '_source' in doc:
            doc = doc.get('_source', {})

        kwargs.update(**dict(original_doc=doc))
        accumulated = {}

        for transformer in transformers:
            result = transformer(accumulated.copy(), *args, **kwargs)
            accumulated.update(**result)

        if updates_only:
            return accumulated

        doc.update(**accumulated)

        return doc
Redistributions in binary form must reproduce the above copyright notice, 12 | #    this list of conditions and the following disclaimer in the documentation 13 | #    and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging
import logging.config
import logging.handlers
import socket

import arrow
from tzlocal import get_localzone

from defplorex.config import load_settings

log = logging.getLogger(__name__)


class Formatter(logging.Formatter):
    """Formatter whose timestamps are converted from local time to UTC."""

    def converter(self, timestamp):
        """Interpret *timestamp* in the local zone; return a UTC timetuple."""
        local_zone = get_localzone()
        as_utc = arrow.get(
            timestamp,
            tz=local_zone).to('UTC')
        return as_utc.datetime.timetuple()


def config_logger(level=logging.WARN, debug=False):
    """Configure logging from the merged settings' LOGGING dict.

    The level is forced to DEBUG when either the *debug* flag or the
    settings' DEBUG entry is set; the chosen level is pushed onto every
    handler, logger and the root logger before dictConfig is applied.
    """
    settings = load_settings()
    project = settings.get('project', 'project')
    host = socket.getfqdn()
    conf = settings.get('LOGGING').copy()

    if debug or settings.get('DEBUG', False):
        level = logging.DEBUG

    # Propagate the effective level onto every configured sink.
    for name in conf.get('handlers').keys():
        conf['handlers'][name]['level'] = level
    for name in conf.get('loggers').keys():
        conf['loggers'][name]['level'] = level
    if 'root' in conf:
        conf['root']['level'] = level

    # Interpolate the {host}/{project} placeholders of the logstash
    # format string (see the settings JSON files).
    if 'logstash' in conf.get('formatters'):
        fmt = conf['formatters']['logstash']['format']
        conf['formatters']['logstash']['format'] = fmt.format(
            host=host, project=project)

    # SysLogHandler requires its address as a (host, port) tuple, but
    # JSON can only express it as a list.
    if 'logstash' in conf.get('handlers'):
        conf['handlers']['logstash']['address'] = tuple(
            conf['handlers']['logstash']['address'])

    logging.config.dictConfig(conf)

    log.info('Logger configured: %s', log)


if __name__ == '__main__':
    config_logger()
-*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | import re 34 | import logging 35 | 36 | from progress.bar import Bar 37 | from progress.spinner import Spinner 38 | from progress.helpers import WriteMixin 39 | from progress import Infinite 40 | import humanize 41 | 42 | ip_re = re.compile( 43 | '(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}' 44 | '(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))') 45 | 46 | 47 | log = logging.getLogger(__name__) 48 | 49 | 50 | def fopen(fname, *args): 51 | if not fname: 52 | import sys 53 | return sys.stdout 54 | 55 | if fname.endswith('.gz'): 56 | import gzip 57 | return gzip.open(fname, *args) 58 | return open(fname, *args) 59 | 60 | 61 | class Counter(WriteMixin, Infinite): 62 | message = '' 63 | 64 | def __init__(self, name): 65 | super(Counter, self).__init__() 66 | self.name = name 67 | 68 | def update(self, txt): 69 | self.write('{}: {}'.format(self.name, str(txt))) 70 | 71 | 72 | class FancyBar(Bar): 73 | message = '' 74 | fill = '*' 75 | suffix = '[%(percent)d%%] %(index)d/%(max)d - ' \ 76 | 'ETA: %(eta)ds' \ 77 | ' (%(elapsed_td)s) - %(avg)f sec/itm' 78 | 79 | 80 | class SlowFancyBar(Bar): 81 | message = '' 82 | fill = '*' 83 | suffix = '[%(percent)d%%] %(index)d/%(max)d - ' \ 84 | 'ETA: %(eta)ds ~= %(rem_h)dhrs' \ 85 | ' (%(elapsed_td)s) - %(avg)f s/itm' 86 | 87 | @property 88 | def rem_h(self): 89 | return self.eta // 3600 90 | 91 | 92 | class SlowOverallFancyBar(Bar): 93 | message = '' 94 | fill = '*' 95 | suffix = '[%(percent)d%%] %(index)d/%(max)d ' \ 96 | 'ETA: %(natural_eta)s' \ 97 | ' (%(natural_overall_eta)s for %(grand_tot)s)' \ 98 | ' (%(nat_elapsed)s) - %(avg)f s/itm' 99 | 100 | def __init__(self, *args, **kwargs): 101 | self.grand_total = kwargs.pop('grand_total') 102 | super(SlowOverallFancyBar, self).__init__(*args, **kwargs) 103 | 104 | @property 105 | def natural_eta(self): 106 | return humanize.naturaldelta(self.eta) 107 | 108 | @property 109 | def natural_overall_eta(self): 110 | 
return humanize.naturaldelta(self.avg * self.grand_total) 111 | 112 | @property 113 | def grand_tot(self): 114 | return humanize.intword(self.grand_total) 115 | 116 | @property 117 | def nat_elapsed(self): 118 | return humanize.naturaldelta(self.elapsed_td) 119 | 120 | 121 | class FancySpinner(Spinner): 122 | suffix = '%(index)d' 123 | -------------------------------------------------------------------------------- /defplorex/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
import logging

from celeryapp import app as clapp

from defplorex.transformer import TagTransformer, TransformerFactory, Pipeline

log = logging.getLogger(__name__)


class ProcessorTask(object):
    """Apply a pipeline of transformers to ES documents selected by ID."""

    max_retries = 3
    default_retry_delay = 30

    # lazily-initialized caches (see the properties below)
    _settings = None
    _es = None

    @property
    def settings(self):
        """Project settings, loaded on first access."""
        if self._settings is None:
            from config import load_settings
            self._settings = load_settings()
        return self._settings

    @property
    def es(self):
        """Elasticsearch wrapper, created on first access."""
        if self._es is None:
            from defplorex.backend.elastic import ES
            self._es = ES(self.settings)
        return self._es

    def __init__(self, transformers, tr_args=None, tr_kwargs=None):
        """Build the transformer pipeline.

        :param transformers: list of transformer classes to instantiate
        :param tr_args: positional args forwarded to each transformer
        :param tr_kwargs: keyword args forwarded to each transformer

        BUG FIX: the original used mutable default arguments
        (``tr_args=[]``, ``tr_kwargs={}``) which are shared across
        calls; None sentinels avoid that.
        """
        tr_args = tr_args if tr_args is not None else []
        tr_kwargs = tr_kwargs if tr_kwargs is not None else {}

        self.transformers = [TagTransformer()]

        if isinstance(transformers, list):
            for k in transformers:
                tr_kwargs.update(settings=self.settings)
                self.transformers.append(k(*tr_args, **tr_kwargs))

    def run(self, ids, index, *args, **kwargs):
        """Transform the documents with the given IDs on *index*.

        Returns the transformed docs when ``ephemeral`` (dry run);
        otherwise writes partial updates back to ES and raises if any
        document failed (so Celery can retry).
        """
        log.info('Received task for %d IDs on index %s', len(ids), index)

        # BUG FIX: filter() is a lazy iterator on Python 3 and is not
        # JSON-serializable inside the ES query body; materialize the
        # truthy IDs into a list instead.
        query = dict(query=dict(ids=dict(values=[i for i in ids if i])))

        kwargs.update(settings=self.settings)
        update = kwargs.get('update', True)
        ephemeral = kwargs.get('ephemeral', False)

        def _transform(doc):
            return Pipeline.chain(
                doc,
                self.transformers,
                updates_only=update, *args, **kwargs)

        if ephemeral:
            return [_transform(doc) for doc in self.es.scan(index, query)]

        err_ids = self.es.partial_update_from_query(
            index=index,
            query=query,
            transform=_transform)

        if err_ids:
            # BUG FIX: Exception does not interpolate '%s' placeholders
            # the way the logging module does; format the message eagerly
            raise Exception(
                'IDs = %s have failed (will retry)' % (err_ids,))


@clapp.task(
    bind=True,
    default_retry_delay=ProcessorTask.default_retry_delay,
    max_retries=ProcessorTask.max_retries)
def processor_task(self, ids, index, **kwargs):
    """
    Generic task that executes a series of transformations on the doc
    """
    transformers_lst = kwargs.get('transformers_lst', [])
    tr_args = kwargs.get('tr_args', [])
    tr_kwargs = kwargs.get('tr_kwargs', {})
    ephemeral = kwargs.get('ephemeral', False)

    if index is None:
        return []

    transformers = TransformerFactory.get_by_list(transformers_lst)
    processor = ProcessorTask(
        transformers,
        tr_args=tr_args,
        tr_kwargs=tr_kwargs)

    try:
        r = processor.run(ids, index, **kwargs)
        if ephemeral:
            return r
    except Exception as e:
        # logging.warn is a deprecated alias of warning
        log.warning('Retrying task %s because: %s', self.request.id, e)
        raise self.retry(exc=e)
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | import sys 34 | import time 35 | import logging 36 | 37 | # 3rd partymodules 38 | import click 39 | import simplejson 40 | 41 | from elasticsearch_dsl import Search, Q 42 | 43 | # local modules 44 | from defplorex.loggers import config_logger 45 | from defplorex.config import load_settings 46 | from defplorex.backend.elastic import ES 47 | from defplorex.transformer import TransformerFactory 48 | from defplorex.utils import ( 49 | SlowOverallFancyBar, 50 | SlowFancyBar, 51 | fopen) 52 | 53 | # locals 54 | log = logging.getLogger(__name__) 55 | settings = load_settings() 56 | es = ES(settings) 57 | TR = TransformerFactory.get_names() 58 | 59 | INDEX = settings.get('es').get('index') 60 | 61 | 62 | @click.group() 63 | @click.option( 64 | '--debug', '-d', 65 | is_flag=True, default=False, help='Enable debugging output') 66 | def cli(debug): 67 | config_logger(debug=debug) 68 | 69 | # NOTE put any preparatory task here 70 | 71 | log.info('Command line ready...') 72 | 73 | 74 | @cli.command() 75 | def show_settings(): 76 | """Print the configuration settings""" 77 | simplejson.dump(settings, sys.stdout) 78 | 79 | 80 | @cli.group() 81 | def process(): 82 | """Distributed data-processing commands""" 83 | pass 84 | 85 | 86 | @cli.group() 87 | def elastic(): 88 | """ES commands""" 89 | log.info('Elasticsearch commands') 90 | 91 | 92 | @process.command() 93 | @click.option( 94 | '--index', '-i', 95 | help='Read from index', 96 | metavar='F', default=INDEX) 97 | @click.option( 98 | '--transformer', '-T', 99 | multiple=True, 100 | type=click.Choice(TR), 101 | metavar='T', 102 | help='Transformation: {}'.format(TR)) 103 | @click.option( 104 | '--limit', '-l', type=int, 105 | metavar='L', help='Limit number of records') 106 | @click.option( 107 | '--tag', '-t', 108 | metavar='TAG', help='Tag the records') 109 | @click.option( 110 | '--reindex', '-r', 111 | is_flag=True, default=False, help='Do not update,' 112 | ' but re-index 
(expensive)') 113 | @click.option( 114 | '--now', '-n', 115 | is_flag=True, default=False, help='Execute locally') 116 | @click.option( 117 | '--ephemeral', '-e', 118 | is_flag=True, default=False, help='Dry run') 119 | @click.argument('q', metavar='') 120 | def enqueue(index, transformer, limit, tag, reindex, now, ephemeral, q): 121 | """ 122 | Read from index according to query, process, and write to index 123 | """ 124 | if not transformer: 125 | log.warn('Please choose at least one transform among %s', TR) 126 | 127 | from defplorex.tasks import processor_task 128 | 129 | log.info('Working on index %s', index) 130 | 131 | kwargs = dict( 132 | update=not reindex, 133 | ephemeral=ephemeral) 134 | 135 | if tag: 136 | kwargs.update(**dict(tag=tag)) 137 | 138 | # iterator that paginates through records 139 | it = es.paginate( 140 | index=index, 141 | q=q, 142 | limit=limit, 143 | id_only=True) 144 | 145 | # enqueue one task per page of records 146 | for ids in it: 147 | ids = list(ids) 148 | click.echo('Launching task with {} IDs'.format(len(ids))) 149 | 150 | kwargs.update(**dict(transformers_lst=transformer)) 151 | 152 | s = processor_task.s(ids, index, **kwargs) 153 | 154 | if now: 155 | res = s() 156 | else: 157 | res = s.delay() 158 | if ephemeral: 159 | res = res.get() 160 | 161 | if ephemeral: 162 | click.echo(simplejson.dumps(res, indent=2)) 163 | 164 | 165 | @process.command() 166 | @click.option('--index', '-i', default=INDEX, help='Read from index') 167 | @click.option('--delta', '-D', help='Measure delta from beginning', 168 | is_flag=True) 169 | @click.argument('query_string', metavar='') 170 | def monitor(index, delta, query_string): 171 | click.clear() 172 | 173 | def cnt(): 174 | q = Q('query_string', query=query_string) 175 | s = Search( 176 | using=es.client, 177 | index=index).query(q) 178 | return s.count() 179 | 180 | N = cnt() 181 | tot = Search(using=es.client, index=index).count() 182 | 183 | if not delta: 184 | N = tot 185 | 186 | 
log.info('Processing %d records (total: %d)', N, tot) 187 | 188 | click.echo('You can exit by CTRL-C: results will still process') 189 | 190 | bar = SlowOverallFancyBar('', max=N, grand_total=tot) 191 | while True: 192 | time.sleep(5.0) 193 | try: 194 | n = cnt() 195 | if isinstance(n, int): 196 | if delta: 197 | done = N - n 198 | else: 199 | done = n 200 | bar.goto(done) 201 | except Exception as e: 202 | log.warn('Cannot count: %s', e) 203 | bar.finish() 204 | 205 | 206 | @elastic.command() 207 | @click.argument('index') 208 | @click.argument('mappings_and_settings', type=click.File('rb')) 209 | def create_index(index, mappings_and_settings): 210 | """Create an index given mappings and settings as a JSON""" 211 | body = simplejson.load(mappings_and_settings) 212 | 213 | click.confirm('Create index "%s"?' % index, abort=True) 214 | 215 | es.client.indices.create(index=index, body=body) 216 | 217 | log.info('Index created') 218 | 219 | 220 | @elastic.command() 221 | @click.argument('index') 222 | def delete_index(index): 223 | """Delete an index""" 224 | click.clear() 225 | 226 | click.confirm( 227 | click.style( 228 | 'Really DELETE index "%s"?' 
% index, 229 | fg='white', 230 | bg='red'), abort=True) 231 | 232 | es.client.indices.delete(index=index) 233 | 234 | log.info('Index deleted') 235 | 236 | 237 | @elastic.command() 238 | @click.option( 239 | '--use-helper', 240 | '-H', 241 | is_flag=True, 242 | default=False, 243 | help='Use old helper API') 244 | @click.argument('from_index') 245 | @click.argument('to_index') 246 | def clone_index(use_helper, from_index, to_index): 247 | """Clone an index""" 248 | from elasticsearch_dsl import Search 249 | from elasticsearch.helpers import reindex 250 | 251 | click.clear() 252 | 253 | if not es.client.indices.exists(index=to_index): 254 | click.secho('%s not existing!'.format(to_index), fg='red') 255 | return 1 256 | 257 | cnt = Search(using=es.client, index=to_index).count() 258 | message = 'Index %s already exists (%d records). Overwrite?' % ( 259 | to_index, cnt) 260 | 261 | click.confirm(message, abort=True) 262 | 263 | if use_helper: 264 | reindex( 265 | client=es.client, 266 | source_index=from_index, 267 | target_index=to_index) 268 | else: 269 | es.client.reindex( 270 | body=dict( 271 | source=dict(index=from_index), 272 | dest=dict(index=to_index)), 273 | wait_for_completion=False) 274 | 275 | 276 | @elastic.command() 277 | @click.argument('from_index') 278 | @click.argument('to_index') 279 | def monitor_clone_index(from_index, to_index): 280 | """Monitor the size of an index""" 281 | from elasticsearch_dsl import Search 282 | 283 | click.clear() 284 | 285 | cnt = Search(using=es.client, index=from_index).count() 286 | 287 | bar = SlowFancyBar('', max=cnt) 288 | while True: 289 | time.sleep(2.0) 290 | _cnt = Search(using=es.client, index=to_index).count() 291 | bar.goto(_cnt) 292 | bar.finish() 293 | -------------------------------------------------------------------------------- /defplorex/backend/elastic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend 
Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | # built-in modules 34 | import gc 35 | import logging 36 | from datetime import datetime 37 | 38 | # 3rd party modules 39 | import simplejson 40 | from elasticsearch import Elasticsearch, helpers 41 | from elasticsearch_dsl import Search, Q 42 | 43 | log = logging.getLogger(__name__) 44 | 45 | 46 | class FailedTransformException(Exception): 47 | def __init__(self, message, _id): 48 | super(FailedTransformException, self).__init__(message) 49 | 50 | self._id = _id 51 | 52 | 53 | class ESStorer(object): 54 | """ 55 | Generic ES wrapper 56 | """ 57 | def __init__(self, settings): 58 | kwargs = settings.get('es').get('client') 59 | es_user = settings.get('es_user') 60 | es_pass = settings.get('es_pass') 61 | 62 | if es_user and es_pass: 63 | kwargs.update(**dict(http_auth=(es_user, es_pass))) 64 | 65 | self.client = Elasticsearch(**kwargs) 66 | self.timeout = settings.get('es').get('client').get('timeout') 67 | self.doc_type = settings.get('es').get('doc_type') 68 | self.index_name = settings.get('es').get('index') 69 | self.id_field = settings.get('id_field') 70 | self.bulk_size = settings.get('bulk_size', 1000) 71 | self.path_encoding = settings.get('path_encoding') 72 | 73 | self.actions = [] 74 | 75 | log.debug('ESStorer instance created: %s', self.client) 76 | 77 | def get(self, doc_id, index): 78 | log.debug('Getting _id = %s from index %s', doc_id, index) 79 | 80 | try: 81 | return self.client.get(index=index, doc_type=self.doc_type, 82 | id=doc_id) 83 | except Exception as e: 84 | log.warn('Cannot get doc with ID = %s because: %s', doc_id, e) 85 | 86 | def search(self, **kwargs): 87 | q = kwargs.get('q', '*') 88 | sort = kwargs.get('sort', 'timestamp') 89 | search_after = kwargs.get('search_after') 90 | size = kwargs.get('size', 50) 91 | source = kwargs.get('source') 92 | extra = dict( 93 | size=size) 94 | 95 | if search_after: 96 | extra.update(dict(search_after=search_after)) 97 | 98 | s = 
Search(using=self.client, index=self.index_name) 99 | if source: 100 | s = s.source(source) 101 | s = s.sort(sort) 102 | s = s.query(Q('query_string', query=q)) 103 | s = s.extra(**extra) 104 | 105 | log.info('Query: %s', s.to_dict()) 106 | 107 | r = s.execute() 108 | count = r.hits.total 109 | took = r.took 110 | 111 | result = r, count, took 112 | 113 | return result 114 | 115 | def partial_update_from_query( 116 | self, index, query, transform, last_updated=True): 117 | 118 | gc.collect() 119 | err_ids = [] 120 | 121 | def it(): 122 | batch = [] 123 | 124 | log.info('Received query: %s', query) 125 | 126 | s = Search( 127 | using=self.client, 128 | index=index, 129 | doc_type=self.doc_type) 130 | s = s.update_from_dict(query) 131 | 132 | log.info('Running query: %s', s.to_dict()) 133 | 134 | # this loop shold spin `bulk_size` times 135 | for doc in s.scan(): 136 | batch.append(doc) 137 | 138 | log.info('Accumulated %d items', len(batch)) 139 | 140 | for doc in batch: 141 | data = doc.to_dict() 142 | _id = doc.meta.id 143 | data['_id'] = _id 144 | 145 | log.debug('Working on doc %s', data) 146 | 147 | try: 148 | try: 149 | doc_body = transform(data) 150 | log.debug('Invoking transform on ID = %s', _id) 151 | except Exception as e: 152 | log.warn( 153 | 'Error while transforming doc ID = %s: %s', 154 | _id, e) 155 | raise e 156 | 157 | if doc_body: 158 | if last_updated: 159 | doc_body['last_updated'] = datetime.now() 160 | 161 | op = self.partial_update_op( 162 | doc_id=_id, 163 | index=index, 164 | doc_body=doc_body, 165 | doc_type=self.doc_type) 166 | yield op 167 | except Exception as e: 168 | log.warn('Cannot process doc ID = %s: %s', _id, e) 169 | err_ids.append(_id) 170 | del(batch) 171 | 172 | try: 173 | # call the iterator via bulk 174 | self.bulk(it()) 175 | log.info('Invoking self.bulk(it())') 176 | except Exception as e: 177 | log.warn('Error in bulk on query = %s because: %s', query, e) 178 | 179 | return err_ids 180 | 181 | def bulk_index_from_it( 
182 | self, index, it, transform=lambda x: x, last_updated=True): 183 | 184 | gc.collect() 185 | err_ids = [] 186 | 187 | def _it(): 188 | for doc_body in it: 189 | try: 190 | log.debug('Working on record: %s', doc_body) 191 | _id = doc_body.get(self.id_field) 192 | 193 | try: 194 | doc_body = transform(doc_body) 195 | except Exception as e: 196 | log.warn( 197 | 'Error while transforming doc ID = %s: %s', 198 | _id, e) 199 | raise e 200 | 201 | if doc_body: 202 | if last_updated: 203 | doc_body['last_updated'] = datetime.now() 204 | 205 | op = self.partial_index_op( 206 | doc_id=_id, 207 | index=index, 208 | doc_body=doc_body, 209 | doc_type=self.doc_type) 210 | yield op 211 | except Exception as e: 212 | log.warn('Cannot process doc ID = %s: %s', _id, e) 213 | err_ids.append(_id) 214 | 215 | try: 216 | self.bulk(_it()) 217 | log.info('Invoked self.bulk(_it())') 218 | except Exception as e: 219 | log.warn('Error in bulk index because: %s', e) 220 | 221 | return err_ids 222 | 223 | def create_op( 224 | self, doc_id, index, doc_body, op_type='update', 225 | doc_type=None): 226 | if not doc_id: 227 | raise Exception('Invalid document ID: %s', doc_id) 228 | 229 | if not doc_type: 230 | doc_type = self.doc_type 231 | 232 | # remove _id 233 | if '_id' in doc_body: 234 | del(doc_body['_id']) 235 | 236 | if op_type == 'update': 237 | body = { 238 | 'doc': doc_body 239 | } 240 | else: 241 | body = doc_body 242 | 243 | op_template = { 244 | '_id': doc_id, 245 | '_op_type': op_type, 246 | '_retry_on_conflict': 3, 247 | '_index': index, 248 | '_type': doc_type, 249 | '_source': body 250 | } 251 | 252 | return op_template.copy() 253 | 254 | def partial_index_op(self, doc_id, index, doc_body, doc_type=None): 255 | return self.create_op( 256 | doc_id=doc_id, 257 | index=index, 258 | doc_body=doc_body, 259 | op_type='index', 260 | doc_type=doc_type) 261 | 262 | def partial_update_op( 263 | self, doc_id, index, doc_body, doc_type=None): 264 | return self.create_op( 265 | 
doc_id=doc_id, 266 | index=index, 267 | doc_body=doc_body, 268 | op_type='update', 269 | doc_type=doc_type) 270 | 271 | def index(self, doc_id, index, source): 272 | log.debug('Storing _id = %s <- %s', doc_id, source) 273 | try: 274 | self.client.index(id=doc_id, index=index, doc_type=self.doc_type, 275 | body=source) 276 | except Exception as e: 277 | log.warn('Cannot index %s because: %s', doc_id, e) 278 | 279 | def bulk(self, it): 280 | try: 281 | log.info('Sending bulk request on iterable/generator') 282 | args = dict(client=self.client, 283 | actions=it, 284 | chunk_size=self.bulk_size, 285 | raise_on_exception=False, 286 | raise_on_error=False, 287 | stats_only=False, 288 | request_timeout=self.timeout) 289 | 290 | res_succ, res_err = helpers.bulk(**args) 291 | 292 | log.info( 293 | 'Sent bulk request on queue iterator: ' 294 | 'successfull ops = %d, failed ops = %d', 295 | res_succ, len(res_err)) 296 | 297 | for res in res_err: 298 | log.warn('Error response: %s', res) 299 | except Exception as e: 300 | log.error('Error in storing: %s', e, exc_info=True) 301 | 302 | def get_fields(self, index): 303 | return self.client.indices.get_mapping(index, doc_type=self.doc_type) 304 | 305 | def count(self, index, query): 306 | try: 307 | s = Search( 308 | using=self.client, 309 | index=index, 310 | doc_type=self.doc_type). 
\ 311 | update_from_dict(query) 312 | log.info('Querying: %s', s.to_dict()) 313 | 314 | return s.count() 315 | except Exception as e: 316 | log.warn('Cannot count: %s', e) 317 | 318 | def scan(self, index, query, limit=None, id_only=False): 319 | size = self.bulk_size 320 | max_records = None 321 | cnt = 0 322 | 323 | if isinstance(limit, int): 324 | if limit > 0: 325 | size = min(limit, size) 326 | max_records = limit 327 | 328 | kw = dict( 329 | index=index, 330 | query=query, 331 | size=size 332 | ) 333 | 334 | if id_only: 335 | kw['_source'] = ['_id'] 336 | 337 | log.debug('Scanning for %s (size = %d, index = %s)', 338 | query, size, index) 339 | 340 | for hit in helpers.scan(self.client, **kw): 341 | if max_records: 342 | if cnt >= max_records: 343 | log.debug('Stopping after pulling %d records' 344 | ' as requested', cnt) 345 | raise StopIteration 346 | 347 | log.debug('Yielding %s', hit['_id']) 348 | cnt += 1 349 | 350 | if id_only: 351 | yield hit.get('_id') 352 | else: 353 | yield hit 354 | 355 | def paginate(self, index, q='*', limit=None, size=None, id_only=True): 356 | if not size: 357 | size = self.bulk_size 358 | 359 | log.info('Limit %s, size %s (q = "%s")', limit, size, q) 360 | 361 | s = Search( 362 | using=self.client, 363 | index=index, 364 | doc_type=self.doc_type) 365 | s = s.query(Q('query_string', query=q)) 366 | 367 | if limit: 368 | size = min(size, limit) 369 | s = s.extra(size=size) 370 | 371 | s = s.params( 372 | scroll='20m', 373 | size=size) 374 | 375 | if id_only: 376 | s = s.source(False) 377 | 378 | log.debug('Query: %s', simplejson.dumps(s.to_dict(), indent=2)) 379 | 380 | hits = [] 381 | overall = 0 382 | 383 | for h in s.scan(): 384 | if limit is not None and overall >= limit: 385 | raise StopIteration() 386 | 387 | log.debug('Hit: %s (progress: %d)', h.meta.id, overall) 388 | if overall < limit or not limit: 389 | if id_only: 390 | hits.append(h.meta.id) 391 | else: 392 | hits.append(h.to_dict()) 393 | 394 | if len(hits) == 
size: 395 | yield iter(hits) 396 | hits = [] 397 | overall += size 398 | 399 | if len(hits): 400 | yield iter(hits) 401 | else: 402 | raise StopIteration() 403 | 404 | 405 | ES = ESStorer 406 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DefPloreX (Public Release) 2 | At [BlackHat USA 2017's Arsenal we've showcased 3 | DefPloreX](https://www.blackhat.com/us-17/arsenal/schedule/index.html#defplorex-a-machine-learning-toolkit-for-large-scale-ecrime-forensics-8065), 4 | an Elasticsearch-based toolkit that our team uses for large-scale processing, 5 | analysis and visualization of e-crime records. In particular, we've 6 | successfully been applying DefPloreX to the analysis of deface records (e.g., from web compromises); 7 | hence its name, Def(acement) eXPlorer (DefPloreX). 8 | 9 | ![DefPloreX Visualization](i/dpx-clusters-viz.png?raw=true "DefPloreX Visualization") 10 | 11 | DefPloreX automatically organizes deface records by web pages' content and format (what we call ``template pages''). 12 | This allows an analyst to easily investigate on campaigns, 13 | for example in discovering websites targeted by the same campaign or 14 | attributing one or more actors to the same hacking group. 15 | All of this without sacrificing the interactivity aspect of the investigation. 
16 | 17 | ![Overview of DefPloreX](i/dpx-overall.png?raw=true "Overview of DefPloreX") 18 | 19 | The full version of DefPloreX includes: 20 | 21 | * A thin wrapper to interact with an Elasticsearch backend (included in this release) 22 | * A distributed data-processing pipeline based on Celery (example included in this release) 23 | * An analysis component to extract information from deface web pages 24 | * A features extraction component to produce a compact, numerical and categorical representation of each web page 25 | * A statistical machine-learning component to automatically find groups of similar web pages 26 | 27 | The input to DefPloreX is a feed of URLs describing the deface web pages, 28 | including metadata such as the (declared) attacker name, timestamp, reason 29 | for hacking that page, and so on. Separately, we also have a mirror of the 30 | web pages at the time of compromise. 31 | 32 | ## Code Release 33 | This repository contains the public release of DefPloreX. Technically speaking, 34 | we're releasing an example use of the DefPloreX approach to distributed data 35 | processing using Elasticsearch (ES). This is not meant to be a ready-to-use, 36 | plug-n-play solution, but rather a framework that you can reuse, extend and 37 | improve to adapt it to your needs. 38 | 39 | The goal that guided us to implement DefPloreX was the need to efficiently 40 | analyze a large number of records (pages) for common aspects, recurrent attackers, 41 | or groups of organized attackers. In other words, a typical e-crime 42 | forensics task. 43 | 44 | In this, the core challenge was to visit and analyze over 13 million web pages, 45 | parse their source code, analyze their resources (e.g., 46 | images, scripts), extract visual information, store the data so extracted in 47 | a database, and query it to answer the typical questions that arise during 48 | a post-mortem investigation. 
Given its popularity, 49 | we've chosen Elasticsearch as our data storage solution. Since we wanted our 50 | solution to be scalable, and given that visiting a web page (with an automated, 51 | headless browser) takes at least 5 seconds, the only option was to distribute 52 | the workload across several worker machines. 53 | 54 | ## Distributed Data Processing 55 | 56 | Normally, to take full advantage of Elasticsearch's distributed 57 | data-processing functionality, you need to resort to 58 | [scripting](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting.html). 59 | Although scripting is quite powerful and handy for small data-manipulation 60 | tasks, it's a bit cumbersome to deploy and handle; and, in addition, it 61 | requires full access to Elasticsearch's client nodes. For example, if you 62 | need to process all the documents in an Elastic index (e.g., to enrich them by 63 | computing additional fields), you will have to choose one of the scripting 64 | languages supported by Elastic, write a script, deploy it and run it. Needless 65 | to say, your script will run within the context of the ES runtime, 66 | with all the limitations that this implies. For example, should you need to use 67 | Python, you're forced to use the Jython Java implementation of Python, which is 68 | not the same as pure Python. For instance, some of the libraries that you may 69 | want to use may not be supported, and so on. In other words, we don't want to depend 70 | on the Elastic's scripting subsystem in our work :) 71 | 72 | Instead, we take a more "detached" approach. We decouple the data-processing 73 | part, making it independent from the Elasticsearch runtime and architecture, 74 | and rely on ES exclusively as a data back-end to store, retrieve and 75 | modify JSON documents.
Assuming a 64-core machine, this command spawns 6 concurrent processes, up to 64 when more workload comes in. And of course you can add as many workers as needed, from a single computer with a few tens of cores, to a full rack distributed across the globe.
`TagTransformer` transformation. As with any other transform, this function receives one JSON document and returns the newly added fields, or the modified fields.
Concretely, the full version of DefPloreX can extract URLs, e-mail addresses, social-network nicknames and handles, hashtags, images, file metadata, summarized text, and so on. This information captures the main characteristics of a defaced web page.
For example, we assume that the same 195 | attacker will reuse the same web snippets or templates (albeit with minimal variations) 196 | within the same campaign. We capture this and other aspects by extracting 197 | numerical and categorical features from the data that we obtained by analyzing 198 | each page (static and dynamic view). To this end, we express the following 199 | task by means of a transform function. 200 | 201 | For example, here's an excerpt of the features that we compute from 202 | each of our documents: 203 | 204 | ``` 205 | { 206 | "n_urls": 135, 207 | "n_object": 0, 208 | "n_embed": 0, 209 | "n_telephone": 8, 210 | "n_email": 1, 211 | "n_img": 18, 212 | "n_link": 0, 213 | "n_sound_urls": 0, 214 | "n_anchor": 60, 215 | "n_meta": 4, 216 | "n_resource": 0, 217 | "n_iframe": 0, 218 | "n_script": 34, 219 | "n_hashtag": 0, 220 | "n_style": 9, 221 | "n_twitter": 1, 222 | "avg_color": "#000000", 223 | "frac_letters_in_title": 0.6979166666666666, 224 | "frac_punct_in_title": 0.17708333333333334, 225 | "frac_whitespace_in_title": 0.0625, 226 | "frac_digits_in_title": 0.0625 227 | } 228 | ``` 229 | 230 | ![Feature extraction](i/dpx-features.png?raw=true "Feature extraction") 231 | 232 | At this point we could use any clustering algorithm to find groups. However, 233 | this would not be the most efficient solution, at least in general, because 234 | we would need to compare all pairs of our collection of 13 million records, 235 | calculate "some" form of distance (e.g., ssdeep), and then start forming groups by 236 | means of such distance. 237 | 238 | We take a different approach, which is approximate but way faster. As a result, 239 | we're able to cluster our entire collection of 13 million documents in less than a 240 | minute, and we dynamically configure the clustering features on demand (i.e., at 241 | each clustering execution). 
242 | 243 | Intuitively, we would like to be able to find logical groups of web pages that 244 | share "similar" feature values. Instead of approaching this problem as 245 | a distance-metric calculation task, we use the concept of "feature binning" or 246 | "feature quantization". In simple words, we want all the web pages with a "low 247 | number of URLs" to fall in the same cluster. At the same time, we want all the 248 | web pages with a "high number of URLs" to fall in another cluster. And so on, 249 | for all the features. In other words, the clustering task becomes a "group-by" 250 | task, which is natively and well supported by all database engines. In the case of 251 | Elastic, it's efficiently implemented in a map-reduce fashion, effectively distributing 252 | the workload across all the available nodes. 253 | 254 | The missing piece is how we obtain these "low, medium, high" values from the 255 | original, numerical feature values. For instance, is "42 URLs" considered low, 256 | high, or medium? To this end, we look at the statistical distribution of each feature, 257 | and divide its space into intervals according to estimated percentiles. For instance, 258 | the values below the 25% percentile are considered low, those between 25-50% percentile 259 | are medium, and those between 50% and 75% are high. Those above the 75% percentile 260 | are outliers. This is just an example, of course. 261 | 262 | ![Feature quantization and clustering](i/dpx-binning.png?raw=true "Feature quantization and clustering") 263 | 264 | It turns out that Elasticsearch already supports the calculation of a few 265 | statistical metrics, among which we happily found the percentiles. So all we need 266 | to do is asking Elastic to compute the percentiles of each feature -- done in a matter 267 | of few seconds. Then, we store these percentiles 268 | and use them as thresholds to quantize the numerical features. 
For example, here's an excerpt of four equally-spaced percentiles (from 1% to 99%) that we obtained from our collection:
356 | 357 | Redistribution and use in source and binary forms, with or without 358 | modification, are permitted provided that the following conditions are met: 359 | 360 | 1. Redistributions of source code must retain the above copyright notice, 361 | this list of conditions and the following disclaimer. 362 | 2. Redistributions in binary form must reproduce the above copyright notice, 363 | this list of conditions and the following disclaimer in the documentation 364 | and/or other materials provided with the distribution. 365 | 366 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 367 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 368 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 369 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 370 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 371 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 372 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 373 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 374 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 375 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 376 | POSSIBILITY OF SUCH DAMAGE. 377 | 378 | The views and conclusions contained in the software and documentation are 379 | those of the authors and should not be interpreted as representing official 380 | policies, either expressed or implied, of the FreeBSD Project. 381 | ``` 382 | --------------------------------------------------------------------------------