├── defplorex ├── __init__.py ├── backend │ ├── __init__.py │ └── elastic.py ├── config │ ├── 11-prod-settings.json.DISABLED │ ├── __init__.py │ └── 00-base-settings.json ├── transformer │ ├── base.py │ ├── tag.py │ └── __init__.py ├── celeryapp.py ├── celeryconfig.py ├── loggers.py ├── utils.py ├── tasks.py └── console.py ├── i ├── dpx-binning.png ├── dpx-celery.png ├── dpx-overall.png ├── dpx-extraction.png ├── dpx-features.png ├── dpx-binning-viz.png ├── dpx-clusters-viz.png ├── dpx-features-viz.png └── dpx-binned-records-viz.png ├── deploy └── requirements.pip ├── LICENSE ├── setup.py └── README.md /defplorex/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /defplorex/backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /i/dpx-binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binning.png -------------------------------------------------------------------------------- /i/dpx-celery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-celery.png -------------------------------------------------------------------------------- /i/dpx-overall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-overall.png -------------------------------------------------------------------------------- /i/dpx-extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-extraction.png 
-------------------------------------------------------------------------------- /i/dpx-features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-features.png -------------------------------------------------------------------------------- /i/dpx-binning-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binning-viz.png -------------------------------------------------------------------------------- /i/dpx-clusters-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-clusters-viz.png -------------------------------------------------------------------------------- /i/dpx-features-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-features-viz.png -------------------------------------------------------------------------------- /i/dpx-binned-records-viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/trendmicro/defplorex/master/i/dpx-binned-records-viz.png -------------------------------------------------------------------------------- /deploy/requirements.pip: -------------------------------------------------------------------------------- 1 | amqp 2 | appnope==0.1.0 3 | argcomplete==1.4.1 4 | argh==0.26.2 5 | arrow==0.8.0 6 | Babel==2.3.4 7 | billiard==3.5.0.2 8 | blinker==1.4 9 | celery 10 | cffi==1.8.3 11 | click==6.6 12 | colorlog==2.7.0 13 | ConfigArgParse==0.10.0 14 | construct==2.5.3 15 | cycler==0.10.0 16 | elasticsearch 17 | elasticsearch-dsl 18 | humanize==0.5.1 19 | ipython-genutils==0.1.0 20 | itsdangerous==0.24 21 | kombu 22 | progress==1.2 23 | prompt-toolkit==1.0.7 
24 | python-dateutil 25 | pytz==2016.6.1 26 | shove==0.6.6 27 | simplejson==3.8.2 28 | six==1.10.0 29 | stuf==0.9.16 30 | tabulate==0.7.5 31 | traitlets==4.3.0 32 | unicodecsv 33 | urllib3==1.19.1 34 | urwid==1.3.1 35 | wcwidth==0.1.7 36 | configparser==3.5.0 37 | httplib2==0.9.2 38 | mccabe==0.5.2 39 | pyrabbit==1.1.0 40 | python-dotenv==0.6.2 41 | anyconfig==0.7.0 42 | python-logstash==0.4.6 43 | tzlocal==1.3 44 | -------------------------------------------------------------------------------- /defplorex/config/11-prod-settings.json.DISABLED: -------------------------------------------------------------------------------- 1 | { 2 | "LOGGING": { 3 | "version": 1, 4 | "disable_existing_loggers": true, 5 | "formatters": { 6 | "logstash": { 7 | "()": "loggers.Formatter", 8 | "format": "1 %(asctime)s {host} %(processName)s %(process)d {project} [pythonData filename=\"%(filename)s\" thread=\"%(threadName)s\" line=\"%(lineno)d\" module=\"%(module)s\"][esData index_keyword=\"{project}\"] %(levelname)s %(message)s" 9 | } 10 | }, 11 | "handlers": { 12 | "logstash": { 13 | "()": "logging.handlers.SysLogHandler", 14 | "formatter": "logstash", 15 | "level": "WARN", 16 | "address": ["__LOGSTASH_SERVER__", 5514] 17 | } 18 | }, 19 | "loggers": { 20 | "defplorex": { 21 | "handlers": [ 22 | "logstash" 23 | ], 24 | "level": "WARN", 25 | "propagate": false 26 | } 27 | }, 28 | "root": { 29 | "handlers": [ 30 | "logstash" 31 | ], 32 | "level": "WARN" 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Trend Micro Incorporated 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. 
Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 17 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 18 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 19 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 20 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 21 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 22 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 23 | POSSIBILITY OF SUCH DAMAGE. 24 | 25 | The views and conclusions contained in the software and documentation are 26 | those of the authors and should not be interpreted as representing official 27 | policies, either expressed or implied, of the FreeBSD Project. 28 | -------------------------------------------------------------------------------- /defplorex/transformer/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. 
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging

log = logging.getLogger(__name__)


class Transformer(object):
    """Base class for document transformers.

    Subclasses set ``_name`` and override ``__call__`` to return a dict
    of field updates for the document being processed; the base
    implementation is the identity transform.
    """

    def __call__(self, doc, *args, **kwargs):
        """Return the untouched original document from the kwargs."""
        log.info('Calling %s', self._name)
        original = kwargs.get('original_doc', {})
        return original
import os
import anyconfig


# Glob matching every bundled settings file next to this module.
DEFAULT_CONFIG_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), '*settings.json'))


def load_settings(config_path=DEFAULT_CONFIG_PATH):
    """Load the settings files matched by *config_path* via anyconfig.

    NOTE(review): anyconfig merges all glob matches into one dict;
    later files presumably override earlier ones — confirm against the
    numbered file-name convention (00-base, 11-prod, ...).
    """
    settings = anyconfig.load(config_path)

    # Coerce bulk_size to int, defaulting to 100 when the key is absent.
    settings['bulk_size'] = int(settings.get('bulk_size', 100))

    # Any non-empty DEBUG environment variable switches debug mode on.
    if os.environ.get('DEBUG'):
        settings['DEBUG'] = True

    return settings
import logging

from celery import Celery
from celery.signals import setup_logging

from defplorex.celeryconfig import broker_url, result_backend, timezone

app = Celery()
app.config_from_object('defplorex.celeryconfig')

log = logging.getLogger(__name__)


@setup_logging.connect
def _setup_logging(loglevel=logging.WARN, **kwargs):
    """Install the project's logging configuration for Celery workers.

    Connected to Celery's ``setup_logging`` signal so Celery does not
    hijack the root logger with its own handlers.
    """
    # BUG FIX: use the package-absolute import (matching the
    # ``defplorex.celeryconfig`` import above); the bare ``loggers``
    # module only resolves when the CWD happens to be the package dir.
    from defplorex.loggers import config_logger
    config_logger(level=loglevel)

    log.debug(
        'Configuring Celery on broker = %s,'
        ' result_backend = %s, timezone = %s',
        broker_url, result_backend, timezone)


app.log.setup()
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
from setuptools import setup, find_packages

# Read install requirements, skipping blanks and comment lines; the
# context manager closes the file handle (the original leaked it and
# passed empty strings to install_requires).
with open('deploy/requirements.pip', 'r') as req_file:
    DEPENDENCIES = [
        line.strip()
        for line in req_file
        if line.strip() and not line.lstrip().startswith('#')
    ]

setup(
    name='defplorex',
    version='0.1',
    description='ES Framework for Large-Scale Processing',
    author='FTR Team at Trend Micro',
    url='https://github.com/trendmicro/defplorex',
    packages=find_packages(),
    entry_points='''
    [console_scripts]
    dpx=defplorex.console:cli
    ''',
    install_requires=DEPENDENCIES,
    classifiers=[
        'Intended Audience :: Developers',
        # Matches the BSD 2-Clause text shipped in LICENSE (the
        # original classifier mistakenly declared MIT).
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Software Development :: Libraries :: Python Modules',
    ],
)
import logging

from defplorex.transformer.base import Transformer

log = logging.getLogger(__name__)


class TagTransformer(Transformer):
    """
    Example transformer that appends a tag to a record's ``tags`` list.
    """
    _name = 'tag'

    def __call__(self, doc, *args, **kwargs):
        """Return a ``{'tags': [...]}`` update with ``kwargs['tag']`` added.

        Duplicate tags are removed; ordering of the resulting list is
        therefore not guaranteed (it round-trips through a set).
        """
        doc = super(TagTransformer, self).__call__(
            doc, *args, **kwargs)

        tag = kwargs.get('tag')

        if not tag:
            log.debug('No tags supplied, skipping')
            # BUG FIX: return an empty dict (was ``[]``).  The pipeline
            # splats the result via ``dict.update(**result)``, which
            # raises TypeError when handed a list.
            return {}

        tags = doc.get('tags', [])

        if tags:
            log.debug('Found tags: %s', tags)

        tags.append(tag)
        tags = list(set(tags))

        log.debug('Updated tags: %s', tags)

        return dict(tags=tags)
| }, 47 | "colorlog": { 48 | "()": "colorlog.StreamHandler", 49 | "formatter": "colorlog", 50 | "level": "DEBUG" 51 | }, 52 | "logstash": { 53 | "()": "logging.handlers.SysLogHandler", 54 | "formatter": "logstash", 55 | "level": "DEBUG", 56 | "address": ["__LOGSTASH_SERVER__", 5514] 57 | } 58 | }, 59 | "loggers": { 60 | "defplorex": { 61 | "handlers": [ 62 | "colorlog" 63 | ], 64 | "level": "DEBUG", 65 | "propagate": false 66 | } 67 | }, 68 | "root": { 69 | "handlers": [ 70 | "colorlog" 71 | ], 72 | "level": "DEBUG" 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /defplorex/celeryconfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
# NOTE this file is provided as a mere example: the end user can change it as
# needed


import os

from kombu import Queue as Q, Exchange as E
from dotenv import load_dotenv, find_dotenv


# Pick up environment overrides from a './.env' file when one exists.
load_dotenv(find_dotenv())

imports = ('tasks',)

timezone = os.environ.get('celery_timezone', 'UTC')
broker_url = os.environ.get('celery_broker_url', 'redis://')
result_backend = os.environ.get('celery_result_backend', 'redis://')

# Only enable UTC mode when the configured timezone actually is UTC.
enable_utc = timezone == 'UTC'

result_persistent = False

worker_hijack_root_logger = False
worker_pool_restarts = True
task_default_queue = 'celery'

worker_send_task_events = True

queues = [
    'processor_task']

# One direct, non-durable queue per entry, routed by task name.
task_queues = [Q(name, E(name, 'direct'), name, durable=False)
               for name in queues]
task_routes = dict(('tasks.{}'.format(name), name) for name in queues)
Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging

from defplorex.transformer.tag import TagTransformer

log = logging.getLogger(__name__)

__all__ = [
    'TagTransformer'
]

classes = [
    TagTransformer
]


class TransformerFactory(object):
    """Registry mapping transformer ``_name`` strings to their classes."""

    registry = {t._name: t for t in classes}

    @classmethod
    def get_by_name(cls, name):
        """Return the class registered under *name*, or None if unknown."""
        return cls.registry.get(name)

    @classmethod
    def get_by_list(cls, name_lst):
        """Resolve a list of names to classes, preserving order."""
        return [cls.get_by_name(n) for n in name_lst]

    @classmethod
    def get_names(cls):
        """Return the registered transformer names."""
        return cls.registry.keys()

    @classmethod
    def get_classes(cls):
        """Return (name, class) pairs for all registered transformers."""
        return cls.registry.items()


class Pipeline(object):
    """Run a sequence of transformers over a single document."""

    @staticmethod
    def chain(doc, transformers, updates_only=True, *args, **kwargs):
        """Apply *transformers* in order, accumulating their updates.

        Each transformer receives a copy of the updates gathered so far
        plus the original document via ``kwargs['original_doc']``, and
        returns a dict of new field values merged into the accumulator.
        With *updates_only* the accumulated updates are returned alone;
        otherwise they are merged back into the document.
        """
        doc = doc.copy()

        # Unwrap raw Elasticsearch hits transparently.
        if '_source' in doc:
            doc = doc.get('_source', {})

        kwargs.update(**dict(original_doc=doc))
        accumulated = {}

        for transformer in transformers:
            result = transformer(accumulated.copy(), *args, **kwargs)
            accumulated.update(**result)

        if updates_only:
            return accumulated

        doc.update(**accumulated)

        return doc
Redistributions in binary form must reproduce the above copyright notice, 12 | #    this list of conditions and the following disclaimer in the documentation 13 | #    and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
import logging
import logging.config
import logging.handlers
import socket

import arrow
from tzlocal import get_localzone

from defplorex.config import load_settings

log = logging.getLogger(__name__)


class Formatter(logging.Formatter):
    """Formatter whose timestamps are converted from local time to UTC."""

    def converter(self, timestamp):
        """Interpret *timestamp* in the local zone; return a UTC timetuple."""
        local_zone = get_localzone()
        as_utc = arrow.get(
            timestamp,
            tz=local_zone).to('UTC')
        return as_utc.datetime.timetuple()


def config_logger(level=logging.WARN, debug=False):
    """Configure logging from the merged settings' LOGGING dict.

    The level is forced to DEBUG when either the *debug* flag or the
    settings' DEBUG entry is set; the chosen level is pushed onto every
    handler, logger and the root logger before dictConfig is applied.
    """
    settings = load_settings()
    project = settings.get('project', 'project')
    host = socket.getfqdn()
    conf = settings.get('LOGGING').copy()

    if debug or settings.get('DEBUG', False):
        level = logging.DEBUG

    # Propagate the effective level onto every configured sink.
    for name in conf.get('handlers').keys():
        conf['handlers'][name]['level'] = level
    for name in conf.get('loggers').keys():
        conf['loggers'][name]['level'] = level
    if 'root' in conf:
        conf['root']['level'] = level

    # Interpolate the {host}/{project} placeholders of the logstash
    # format string (see the settings JSON files).
    if 'logstash' in conf.get('formatters'):
        fmt = conf['formatters']['logstash']['format']
        conf['formatters']['logstash']['format'] = fmt.format(
            host=host, project=project)

    # SysLogHandler requires its address as a (host, port) tuple, but
    # JSON can only express it as a list.
    if 'logstash' in conf.get('handlers'):
        conf['handlers']['logstash']['address'] = tuple(
            conf['handlers']['logstash']['address'])

    logging.config.dictConfig(conf)

    log.info('Logger configured: %s', log)


if __name__ == '__main__':
    config_logger()
-*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | import re 34 | import logging 35 | 36 | from progress.bar import Bar 37 | from progress.spinner import Spinner 38 | from progress.helpers import WriteMixin 39 | from progress import Infinite 40 | import humanize 41 | 42 | ip_re = re.compile( 43 | '(([2][5][0-5]\.)|([2][0-4][0-9]\.)|([0-1]?[0-9]?[0-9]\.)){3}' 44 | '(([2][5][0-5])|([2][0-4][0-9])|([0-1]?[0-9]?[0-9]))') 45 | 46 | 47 | log = logging.getLogger(__name__) 48 | 49 | 50 | def fopen(fname, *args): 51 | if not fname: 52 | import sys 53 | return sys.stdout 54 | 55 | if fname.endswith('.gz'): 56 | import gzip 57 | return gzip.open(fname, *args) 58 | return open(fname, *args) 59 | 60 | 61 | class Counter(WriteMixin, Infinite): 62 | message = '' 63 | 64 | def __init__(self, name): 65 | super(Counter, self).__init__() 66 | self.name = name 67 | 68 | def update(self, txt): 69 | self.write('{}: {}'.format(self.name, str(txt))) 70 | 71 | 72 | class FancyBar(Bar): 73 | message = '' 74 | fill = '*' 75 | suffix = '[%(percent)d%%] %(index)d/%(max)d - ' \ 76 | 'ETA: %(eta)ds' \ 77 | ' (%(elapsed_td)s) - %(avg)f sec/itm' 78 | 79 | 80 | class SlowFancyBar(Bar): 81 | message = '' 82 | fill = '*' 83 | suffix = '[%(percent)d%%] %(index)d/%(max)d - ' \ 84 | 'ETA: %(eta)ds ~= %(rem_h)dhrs' \ 85 | ' (%(elapsed_td)s) - %(avg)f s/itm' 86 | 87 | @property 88 | def rem_h(self): 89 | return self.eta // 3600 90 | 91 | 92 | class SlowOverallFancyBar(Bar): 93 | message = '' 94 | fill = '*' 95 | suffix = '[%(percent)d%%] %(index)d/%(max)d ' \ 96 | 'ETA: %(natural_eta)s' \ 97 | ' (%(natural_overall_eta)s for %(grand_tot)s)' \ 98 | ' (%(nat_elapsed)s) - %(avg)f s/itm' 99 | 100 | def __init__(self, *args, **kwargs): 101 | self.grand_total = kwargs.pop('grand_total') 102 | super(SlowOverallFancyBar, self).__init__(*args, **kwargs) 103 | 104 | @property 105 | def natural_eta(self): 106 | return humanize.naturaldelta(self.eta) 107 | 108 | @property 109 | def natural_overall_eta(self): 110 | 
return humanize.naturaldelta(self.avg * self.grand_total) 111 | 112 | @property 113 | def grand_tot(self): 114 | return humanize.intword(self.grand_total) 115 | 116 | @property 117 | def nat_elapsed(self): 118 | return humanize.naturaldelta(self.elapsed_td) 119 | 120 | 121 | class FancySpinner(Spinner): 122 | suffix = '%(index)d' 123 | -------------------------------------------------------------------------------- /defplorex/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. 
import logging

from celeryapp import app as clapp

from defplorex.transformer import TagTransformer, TransformerFactory, Pipeline

log = logging.getLogger(__name__)


class ProcessorTask(object):
    """Apply a pipeline of transformers to ES documents selected by ID."""

    max_retries = 3
    default_retry_delay = 30

    # lazily-initialized caches (see the properties below)
    _settings = None
    _es = None

    @property
    def settings(self):
        """Project settings, loaded on first access."""
        if self._settings is None:
            from config import load_settings
            self._settings = load_settings()
        return self._settings

    @property
    def es(self):
        """Elasticsearch wrapper, created on first access."""
        if self._es is None:
            from defplorex.backend.elastic import ES
            self._es = ES(self.settings)
        return self._es

    def __init__(self, transformers, tr_args=None, tr_kwargs=None):
        """Build the transformer pipeline.

        :param transformers: list of transformer classes to instantiate
        :param tr_args: positional args forwarded to each transformer
        :param tr_kwargs: keyword args forwarded to each transformer

        BUG FIX: the original used mutable default arguments
        (``tr_args=[]``, ``tr_kwargs={}``) which are shared across
        calls; None sentinels avoid that.
        """
        tr_args = tr_args if tr_args is not None else []
        tr_kwargs = tr_kwargs if tr_kwargs is not None else {}

        self.transformers = [TagTransformer()]

        if isinstance(transformers, list):
            for k in transformers:
                tr_kwargs.update(settings=self.settings)
                self.transformers.append(k(*tr_args, **tr_kwargs))

    def run(self, ids, index, *args, **kwargs):
        """Transform the documents with the given IDs on *index*.

        Returns the transformed docs when ``ephemeral`` (dry run);
        otherwise writes partial updates back to ES and raises if any
        document failed (so Celery can retry).
        """
        log.info('Received task for %d IDs on index %s', len(ids), index)

        # BUG FIX: filter() is a lazy iterator on Python 3 and is not
        # JSON-serializable inside the ES query body; materialize the
        # truthy IDs into a list instead.
        query = dict(query=dict(ids=dict(values=[i for i in ids if i])))

        kwargs.update(settings=self.settings)
        update = kwargs.get('update', True)
        ephemeral = kwargs.get('ephemeral', False)

        def _transform(doc):
            return Pipeline.chain(
                doc,
                self.transformers,
                updates_only=update, *args, **kwargs)

        if ephemeral:
            return [_transform(doc) for doc in self.es.scan(index, query)]

        err_ids = self.es.partial_update_from_query(
            index=index,
            query=query,
            transform=_transform)

        if err_ids:
            # BUG FIX: Exception does not interpolate '%s' placeholders
            # the way the logging module does; format the message eagerly
            raise Exception(
                'IDs = %s have failed (will retry)' % (err_ids,))


@clapp.task(
    bind=True,
    default_retry_delay=ProcessorTask.default_retry_delay,
    max_retries=ProcessorTask.max_retries)
def processor_task(self, ids, index, **kwargs):
    """
    Generic task that executes a series of transformations on the doc
    """
    transformers_lst = kwargs.get('transformers_lst', [])
    tr_args = kwargs.get('tr_args', [])
    tr_kwargs = kwargs.get('tr_kwargs', {})
    ephemeral = kwargs.get('ephemeral', False)

    if index is None:
        return []

    transformers = TransformerFactory.get_by_list(transformers_lst)
    processor = ProcessorTask(
        transformers,
        tr_args=tr_args,
        tr_kwargs=tr_kwargs)

    try:
        r = processor.run(ids, index, **kwargs)
        if ephemeral:
            return r
    except Exception as e:
        # logging.warn is a deprecated alias of warning
        log.warning('Retrying task %s because: %s', self.request.id, e)
        raise self.retry(exc=e)
Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | import sys 34 | import time 35 | import logging 36 | 37 | # 3rd partymodules 38 | import click 39 | import simplejson 40 | 41 | from elasticsearch_dsl import Search, Q 42 | 43 | # local modules 44 | from defplorex.loggers import config_logger 45 | from defplorex.config import load_settings 46 | from defplorex.backend.elastic import ES 47 | from defplorex.transformer import TransformerFactory 48 | from defplorex.utils import ( 49 | SlowOverallFancyBar, 50 | SlowFancyBar, 51 | fopen) 52 | 53 | # locals 54 | log = logging.getLogger(__name__) 55 | settings = load_settings() 56 | es = ES(settings) 57 | TR = TransformerFactory.get_names() 58 | 59 | INDEX = settings.get('es').get('index') 60 | 61 | 62 | @click.group() 63 | @click.option( 64 | '--debug', '-d', 65 | is_flag=True, default=False, help='Enable debugging output') 66 | def cli(debug): 67 | config_logger(debug=debug) 68 | 69 | # NOTE put any preparatory task here 70 | 71 | log.info('Command line ready...') 72 | 73 | 74 | @cli.command() 75 | def show_settings(): 76 | """Print the configuration settings""" 77 | simplejson.dump(settings, sys.stdout) 78 | 79 | 80 | @cli.group() 81 | def process(): 82 | """Distributed data-processing commands""" 83 | pass 84 | 85 | 86 | @cli.group() 87 | def elastic(): 88 | """ES commands""" 89 | log.info('Elasticsearch commands') 90 | 91 | 92 | @process.command() 93 | @click.option( 94 | '--index', '-i', 95 | help='Read from index', 96 | metavar='F', default=INDEX) 97 | @click.option( 98 | '--transformer', '-T', 99 | multiple=True, 100 | type=click.Choice(TR), 101 | metavar='T', 102 | help='Transformation: {}'.format(TR)) 103 | @click.option( 104 | '--limit', '-l', type=int, 105 | metavar='L', help='Limit number of records') 106 | @click.option( 107 | '--tag', '-t', 108 | metavar='TAG', help='Tag the records') 109 | @click.option( 110 | '--reindex', '-r', 111 | is_flag=True, default=False, help='Do not update,' 112 | ' but re-index 
(expensive)') 113 | @click.option( 114 | '--now', '-n', 115 | is_flag=True, default=False, help='Execute locally') 116 | @click.option( 117 | '--ephemeral', '-e', 118 | is_flag=True, default=False, help='Dry run') 119 | @click.argument('q', metavar='') 120 | def enqueue(index, transformer, limit, tag, reindex, now, ephemeral, q): 121 | """ 122 | Read from index according to query, process, and write to index 123 | """ 124 | if not transformer: 125 | log.warn('Please choose at least one transform among %s', TR) 126 | 127 | from defplorex.tasks import processor_task 128 | 129 | log.info('Working on index %s', index) 130 | 131 | kwargs = dict( 132 | update=not reindex, 133 | ephemeral=ephemeral) 134 | 135 | if tag: 136 | kwargs.update(**dict(tag=tag)) 137 | 138 | # iterator that paginates through records 139 | it = es.paginate( 140 | index=index, 141 | q=q, 142 | limit=limit, 143 | id_only=True) 144 | 145 | # enqueue one task per page of records 146 | for ids in it: 147 | ids = list(ids) 148 | click.echo('Launching task with {} IDs'.format(len(ids))) 149 | 150 | kwargs.update(**dict(transformers_lst=transformer)) 151 | 152 | s = processor_task.s(ids, index, **kwargs) 153 | 154 | if now: 155 | res = s() 156 | else: 157 | res = s.delay() 158 | if ephemeral: 159 | res = res.get() 160 | 161 | if ephemeral: 162 | click.echo(simplejson.dumps(res, indent=2)) 163 | 164 | 165 | @process.command() 166 | @click.option('--index', '-i', default=INDEX, help='Read from index') 167 | @click.option('--delta', '-D', help='Measure delta from beginning', 168 | is_flag=True) 169 | @click.argument('query_string', metavar='') 170 | def monitor(index, delta, query_string): 171 | click.clear() 172 | 173 | def cnt(): 174 | q = Q('query_string', query=query_string) 175 | s = Search( 176 | using=es.client, 177 | index=index).query(q) 178 | return s.count() 179 | 180 | N = cnt() 181 | tot = Search(using=es.client, index=index).count() 182 | 183 | if not delta: 184 | N = tot 185 | 186 | 
log.info('Processing %d records (total: %d)', N, tot) 187 | 188 | click.echo('You can exit by CTRL-C: results will still process') 189 | 190 | bar = SlowOverallFancyBar('', max=N, grand_total=tot) 191 | while True: 192 | time.sleep(5.0) 193 | try: 194 | n = cnt() 195 | if isinstance(n, int): 196 | if delta: 197 | done = N - n 198 | else: 199 | done = n 200 | bar.goto(done) 201 | except Exception as e: 202 | log.warn('Cannot count: %s', e) 203 | bar.finish() 204 | 205 | 206 | @elastic.command() 207 | @click.argument('index') 208 | @click.argument('mappings_and_settings', type=click.File('rb')) 209 | def create_index(index, mappings_and_settings): 210 | """Create an index given mappings and settings as a JSON""" 211 | body = simplejson.load(mappings_and_settings) 212 | 213 | click.confirm('Create index "%s"?' % index, abort=True) 214 | 215 | es.client.indices.create(index=index, body=body) 216 | 217 | log.info('Index created') 218 | 219 | 220 | @elastic.command() 221 | @click.argument('index') 222 | def delete_index(index): 223 | """Delete an index""" 224 | click.clear() 225 | 226 | click.confirm( 227 | click.style( 228 | 'Really DELETE index "%s"?' 
% index, 229 | fg='white', 230 | bg='red'), abort=True) 231 | 232 | es.client.indices.delete(index=index) 233 | 234 | log.info('Index deleted') 235 | 236 | 237 | @elastic.command() 238 | @click.option( 239 | '--use-helper', 240 | '-H', 241 | is_flag=True, 242 | default=False, 243 | help='Use old helper API') 244 | @click.argument('from_index') 245 | @click.argument('to_index') 246 | def clone_index(use_helper, from_index, to_index): 247 | """Clone an index""" 248 | from elasticsearch_dsl import Search 249 | from elasticsearch.helpers import reindex 250 | 251 | click.clear() 252 | 253 | if not es.client.indices.exists(index=to_index): 254 | click.secho('%s not existing!'.format(to_index), fg='red') 255 | return 1 256 | 257 | cnt = Search(using=es.client, index=to_index).count() 258 | message = 'Index %s already exists (%d records). Overwrite?' % ( 259 | to_index, cnt) 260 | 261 | click.confirm(message, abort=True) 262 | 263 | if use_helper: 264 | reindex( 265 | client=es.client, 266 | source_index=from_index, 267 | target_index=to_index) 268 | else: 269 | es.client.reindex( 270 | body=dict( 271 | source=dict(index=from_index), 272 | dest=dict(index=to_index)), 273 | wait_for_completion=False) 274 | 275 | 276 | @elastic.command() 277 | @click.argument('from_index') 278 | @click.argument('to_index') 279 | def monitor_clone_index(from_index, to_index): 280 | """Monitor the size of an index""" 281 | from elasticsearch_dsl import Search 282 | 283 | click.clear() 284 | 285 | cnt = Search(using=es.client, index=from_index).count() 286 | 287 | bar = SlowFancyBar('', max=cnt) 288 | while True: 289 | time.sleep(2.0) 290 | _cnt = Search(using=es.client, index=to_index).count() 291 | bar.goto(_cnt) 292 | bar.finish() 293 | -------------------------------------------------------------------------------- /defplorex/backend/elastic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Copyright (c) 2017, Trend 
Micro Incorporated 4 | # All rights reserved. 5 | # 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are met: 8 | # 9 | # 1. Redistributions of source code must retain the above copyright notice, 10 | # this list of conditions and the following disclaimer. 11 | # 2. Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 19 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 20 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 21 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 23 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 24 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 25 | # POSSIBILITY OF SUCH DAMAGE. 26 | # 27 | # The views and conclusions contained in the software and documentation are 28 | # those of the authors and should not be interpreted as representing official 29 | # policies, either expressed or implied, of the FreeBSD Project. 
30 | 31 | from __future__ import division 32 | 33 | # built-in modules 34 | import gc 35 | import logging 36 | from datetime import datetime 37 | 38 | # 3rd party modules 39 | import simplejson 40 | from elasticsearch import Elasticsearch, helpers 41 | from elasticsearch_dsl import Search, Q 42 | 43 | log = logging.getLogger(__name__) 44 | 45 | 46 | class FailedTransformException(Exception): 47 | def __init__(self, message, _id): 48 | super(FailedTransformException, self).__init__(message) 49 | 50 | self._id = _id 51 | 52 | 53 | class ESStorer(object): 54 | """ 55 | Generic ES wrapper 56 | """ 57 | def __init__(self, settings): 58 | kwargs = settings.get('es').get('client') 59 | es_user = settings.get('es_user') 60 | es_pass = settings.get('es_pass') 61 | 62 | if es_user and es_pass: 63 | kwargs.update(**dict(http_auth=(es_user, es_pass))) 64 | 65 | self.client = Elasticsearch(**kwargs) 66 | self.timeout = settings.get('es').get('client').get('timeout') 67 | self.doc_type = settings.get('es').get('doc_type') 68 | self.index_name = settings.get('es').get('index') 69 | self.id_field = settings.get('id_field') 70 | self.bulk_size = settings.get('bulk_size', 1000) 71 | self.path_encoding = settings.get('path_encoding') 72 | 73 | self.actions = [] 74 | 75 | log.debug('ESStorer instance created: %s', self.client) 76 | 77 | def get(self, doc_id, index): 78 | log.debug('Getting _id = %s from index %s', doc_id, index) 79 | 80 | try: 81 | return self.client.get(index=index, doc_type=self.doc_type, 82 | id=doc_id) 83 | except Exception as e: 84 | log.warn('Cannot get doc with ID = %s because: %s', doc_id, e) 85 | 86 | def search(self, **kwargs): 87 | q = kwargs.get('q', '*') 88 | sort = kwargs.get('sort', 'timestamp') 89 | search_after = kwargs.get('search_after') 90 | size = kwargs.get('size', 50) 91 | source = kwargs.get('source') 92 | extra = dict( 93 | size=size) 94 | 95 | if search_after: 96 | extra.update(dict(search_after=search_after)) 97 | 98 | s = 
Search(using=self.client, index=self.index_name) 99 | if source: 100 | s = s.source(source) 101 | s = s.sort(sort) 102 | s = s.query(Q('query_string', query=q)) 103 | s = s.extra(**extra) 104 | 105 | log.info('Query: %s', s.to_dict()) 106 | 107 | r = s.execute() 108 | count = r.hits.total 109 | took = r.took 110 | 111 | result = r, count, took 112 | 113 | return result 114 | 115 | def partial_update_from_query( 116 | self, index, query, transform, last_updated=True): 117 | 118 | gc.collect() 119 | err_ids = [] 120 | 121 | def it(): 122 | batch = [] 123 | 124 | log.info('Received query: %s', query) 125 | 126 | s = Search( 127 | using=self.client, 128 | index=index, 129 | doc_type=self.doc_type) 130 | s = s.update_from_dict(query) 131 | 132 | log.info('Running query: %s', s.to_dict()) 133 | 134 | # this loop shold spin `bulk_size` times 135 | for doc in s.scan(): 136 | batch.append(doc) 137 | 138 | log.info('Accumulated %d items', len(batch)) 139 | 140 | for doc in batch: 141 | data = doc.to_dict() 142 | _id = doc.meta.id 143 | data['_id'] = _id 144 | 145 | log.debug('Working on doc %s', data) 146 | 147 | try: 148 | try: 149 | doc_body = transform(data) 150 | log.debug('Invoking transform on ID = %s', _id) 151 | except Exception as e: 152 | log.warn( 153 | 'Error while transforming doc ID = %s: %s', 154 | _id, e) 155 | raise e 156 | 157 | if doc_body: 158 | if last_updated: 159 | doc_body['last_updated'] = datetime.now() 160 | 161 | op = self.partial_update_op( 162 | doc_id=_id, 163 | index=index, 164 | doc_body=doc_body, 165 | doc_type=self.doc_type) 166 | yield op 167 | except Exception as e: 168 | log.warn('Cannot process doc ID = %s: %s', _id, e) 169 | err_ids.append(_id) 170 | del(batch) 171 | 172 | try: 173 | # call the iterator via bulk 174 | self.bulk(it()) 175 | log.info('Invoking self.bulk(it())') 176 | except Exception as e: 177 | log.warn('Error in bulk on query = %s because: %s', query, e) 178 | 179 | return err_ids 180 | 181 | def bulk_index_from_it( 
182 | self, index, it, transform=lambda x: x, last_updated=True): 183 | 184 | gc.collect() 185 | err_ids = [] 186 | 187 | def _it(): 188 | for doc_body in it: 189 | try: 190 | log.debug('Working on record: %s', doc_body) 191 | _id = doc_body.get(self.id_field) 192 | 193 | try: 194 | doc_body = transform(doc_body) 195 | except Exception as e: 196 | log.warn( 197 | 'Error while transforming doc ID = %s: %s', 198 | _id, e) 199 | raise e 200 | 201 | if doc_body: 202 | if last_updated: 203 | doc_body['last_updated'] = datetime.now() 204 | 205 | op = self.partial_index_op( 206 | doc_id=_id, 207 | index=index, 208 | doc_body=doc_body, 209 | doc_type=self.doc_type) 210 | yield op 211 | except Exception as e: 212 | log.warn('Cannot process doc ID = %s: %s', _id, e) 213 | err_ids.append(_id) 214 | 215 | try: 216 | self.bulk(_it()) 217 | log.info('Invoked self.bulk(_it())') 218 | except Exception as e: 219 | log.warn('Error in bulk index because: %s', e) 220 | 221 | return err_ids 222 | 223 | def create_op( 224 | self, doc_id, index, doc_body, op_type='update', 225 | doc_type=None): 226 | if not doc_id: 227 | raise Exception('Invalid document ID: %s', doc_id) 228 | 229 | if not doc_type: 230 | doc_type = self.doc_type 231 | 232 | # remove _id 233 | if '_id' in doc_body: 234 | del(doc_body['_id']) 235 | 236 | if op_type == 'update': 237 | body = { 238 | 'doc': doc_body 239 | } 240 | else: 241 | body = doc_body 242 | 243 | op_template = { 244 | '_id': doc_id, 245 | '_op_type': op_type, 246 | '_retry_on_conflict': 3, 247 | '_index': index, 248 | '_type': doc_type, 249 | '_source': body 250 | } 251 | 252 | return op_template.copy() 253 | 254 | def partial_index_op(self, doc_id, index, doc_body, doc_type=None): 255 | return self.create_op( 256 | doc_id=doc_id, 257 | index=index, 258 | doc_body=doc_body, 259 | op_type='index', 260 | doc_type=doc_type) 261 | 262 | def partial_update_op( 263 | self, doc_id, index, doc_body, doc_type=None): 264 | return self.create_op( 265 | 
doc_id=doc_id, 266 | index=index, 267 | doc_body=doc_body, 268 | op_type='update', 269 | doc_type=doc_type) 270 | 271 | def index(self, doc_id, index, source): 272 | log.debug('Storing _id = %s <- %s', doc_id, source) 273 | try: 274 | self.client.index(id=doc_id, index=index, doc_type=self.doc_type, 275 | body=source) 276 | except Exception as e: 277 | log.warn('Cannot index %s because: %s', doc_id, e) 278 | 279 | def bulk(self, it): 280 | try: 281 | log.info('Sending bulk request on iterable/generator') 282 | args = dict(client=self.client, 283 | actions=it, 284 | chunk_size=self.bulk_size, 285 | raise_on_exception=False, 286 | raise_on_error=False, 287 | stats_only=False, 288 | request_timeout=self.timeout) 289 | 290 | res_succ, res_err = helpers.bulk(**args) 291 | 292 | log.info( 293 | 'Sent bulk request on queue iterator: ' 294 | 'successfull ops = %d, failed ops = %d', 295 | res_succ, len(res_err)) 296 | 297 | for res in res_err: 298 | log.warn('Error response: %s', res) 299 | except Exception as e: 300 | log.error('Error in storing: %s', e, exc_info=True) 301 | 302 | def get_fields(self, index): 303 | return self.client.indices.get_mapping(index, doc_type=self.doc_type) 304 | 305 | def count(self, index, query): 306 | try: 307 | s = Search( 308 | using=self.client, 309 | index=index, 310 | doc_type=self.doc_type). 
\ 311 | update_from_dict(query) 312 | log.info('Querying: %s', s.to_dict()) 313 | 314 | return s.count() 315 | except Exception as e: 316 | log.warn('Cannot count: %s', e) 317 | 318 | def scan(self, index, query, limit=None, id_only=False): 319 | size = self.bulk_size 320 | max_records = None 321 | cnt = 0 322 | 323 | if isinstance(limit, int): 324 | if limit > 0: 325 | size = min(limit, size) 326 | max_records = limit 327 | 328 | kw = dict( 329 | index=index, 330 | query=query, 331 | size=size 332 | ) 333 | 334 | if id_only: 335 | kw['_source'] = ['_id'] 336 | 337 | log.debug('Scanning for %s (size = %d, index = %s)', 338 | query, size, index) 339 | 340 | for hit in helpers.scan(self.client, **kw): 341 | if max_records: 342 | if cnt >= max_records: 343 | log.debug('Stopping after pulling %d records' 344 | ' as requested', cnt) 345 | raise StopIteration 346 | 347 | log.debug('Yielding %s', hit['_id']) 348 | cnt += 1 349 | 350 | if id_only: 351 | yield hit.get('_id') 352 | else: 353 | yield hit 354 | 355 | def paginate(self, index, q='*', limit=None, size=None, id_only=True): 356 | if not size: 357 | size = self.bulk_size 358 | 359 | log.info('Limit %s, size %s (q = "%s")', limit, size, q) 360 | 361 | s = Search( 362 | using=self.client, 363 | index=index, 364 | doc_type=self.doc_type) 365 | s = s.query(Q('query_string', query=q)) 366 | 367 | if limit: 368 | size = min(size, limit) 369 | s = s.extra(size=size) 370 | 371 | s = s.params( 372 | scroll='20m', 373 | size=size) 374 | 375 | if id_only: 376 | s = s.source(False) 377 | 378 | log.debug('Query: %s', simplejson.dumps(s.to_dict(), indent=2)) 379 | 380 | hits = [] 381 | overall = 0 382 | 383 | for h in s.scan(): 384 | if limit is not None and overall >= limit: 385 | raise StopIteration() 386 | 387 | log.debug('Hit: %s (progress: %d)', h.meta.id, overall) 388 | if overall < limit or not limit: 389 | if id_only: 390 | hits.append(h.meta.id) 391 | else: 392 | hits.append(h.to_dict()) 393 | 394 | if len(hits) == 
size: 395 | yield iter(hits) 396 | hits = [] 397 | overall += size 398 | 399 | if len(hits): 400 | yield iter(hits) 401 | else: 402 | raise StopIteration() 403 | 404 | 405 | ES = ESStorer 406 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DefPloreX (Public Release) 2 | At [BlackHat USA 2017's Arsenal we've showcased 3 | DefPloreX](https://www.blackhat.com/us-17/arsenal/schedule/index.html#defplorex-a-machine-learning-toolkit-for-large-scale-ecrime-forensics-8065), 4 | an Elasticsearch-based toolkit that our team uses for large-scale processing, 5 | analysis and visualization of e-crime records. In particular, we've 6 | successfully been applying DefPloreX to the analysis of deface records (e.g., from web compromises); 7 | hence its name, Def(acement) eXPlorer (DefPloreX). 8 | 9 | ![DefPloreX Visualization](i/dpx-clusters-viz.png?raw=true "DefPloreX Visualization") 10 | 11 | DefPloreX automatically organizes deface records by web pages' content and format (what we call ``template pages''). 12 | This allows an analyst to easily investigate on campaigns, 13 | for example in discovering websites targeted by the same campaign or 14 | attributing one or more actors to the same hacking group. 15 | All of this without sacrificing the interactivity aspect of the investigation. 
16 | 17 | ![Overview of DefPloreX](i/dpx-overall.png?raw=true "Overview of DefPloreX") 18 | 19 | The full version of DefPloreX includes: 20 | 21 | * A thin wrapper to interact with an Elasticsearch backend (included in this release) 22 | * A distributed data-processing pipeline based on Celery (example included in this release) 23 | * An analysis component to extract information from deface web pages 24 | * A features extraction component to produce a compact, numerical and categorical representation of each web page 25 | * A statistical machine-learning component to automatically find groups of similar web pages 26 | 27 | The input to DefPloreX is a feed of URLs describing the deface web pages, 28 | including metadata such as the (declared) attacker name, timestamp, reason 29 | for hacking that page, and so on. Separately, we also have a mirror of the 30 | web pages at the time of compromise. 31 | 32 | ## Code Release 33 | This repository contains the public release of DefPloreX. Technically speaking, 34 | we're releasing an example use of the DefPloreX approach to distributed data 35 | processing using Elasticsearch (ES). This is not meant to be a ready-to-use, 36 | plug-n-play solution, but rather a framework that you can reuse, extend and 37 | improve to adapt it to your needs. 38 | 39 | The goal that guided us to implement DefPloreX was the need to efficiently 40 | analyze a large number of records (pages) for common aspects, recurrent attackers, 41 | or groups of organized attackers. In other words, a typical e-crime 42 | forensics task. 43 | 44 | In this, the core challenge was to visit and analyze over 13 million web pages, 45 | parse their source code, analyze their resources (e.g., 46 | images, scripts), extract visual information, store the data so extracted in 47 | a database, and query it to answer the typical questions that arise during 48 | a post-mortem investigation. 
Given its popularity, 49 | we've chosen Elasticsearch as our data storage solution. Since we wanted our 50 | solution to be scalable, and given that visiting a web page (with an automated, 51 | headless browser) takes at least 5 seconds, the only option was to distribute 52 | the workload across several worker machines. 53 | 54 | ## Distributed Data Processing 55 | 56 | Normally, to take full advantage of Elasticsearch's distributed 57 | data-processing functionality, you need to resort to 58 | [scripting](https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-scripting.html). 59 | Although scripting is quite powerful and handy for small data-manipulation 60 | tasks, it's a bit cumbersome to deploy and handle; and, in addition, it 61 | requires full access to Elasticsearch's client nodes. For example, if you 62 | need to process all the documents in an Elastic index (e.g., to enrich them by 63 | computing additional fields), you will have to choose one of the scripting 64 | languages supported by Elastic, write a script, deploy it and run it. Needless 65 | to say, your script will run within the context of the ES runtime, 66 | with all the limitations that this implies. For example, should you need to use 67 | Python, you're forced to use the Jython Java implementation of Python, which is 68 | not the same as pure Python. For instance, some of the libraries that you may 69 | want to use may not be supported, and so on. In other words, we don't want to depend 70 | on the Elastic's scripting subsystem in our work :) 71 | 72 | Instead, we take a more "detached" approach. We decouple the data-processing 73 | part, making it independent from the Elasticsearch runtime and architecture, 74 | and rely on ES exclusively as a data back-end to store, retrieve and 75 | modify JSON documents.
Assuming a 64-core machine, this command spawns 6 concurrent processes, up to 64 when more workload comes in. And of course you can add as many workers as needed, from a single computer with a few tens of cores, to a full rack distributed across the globe.
`TagTransformer` transformation. As with any other transform, this function receives one JSON document and returns the newly added fields, or the modified fields.
Concretely, the full version of DefPloreX can extract URLs, e-mail addresses, social-network nicknames and handles, hashtags, images, file metadata, summarized text, and so on. This information captures the main characteristics of a defaced web page.
For example, we assume that the same 195 | attacker will reuse the same web snippets or templates (albeit with minimal variations) 196 | within the same campaign. We capture this and other aspects by extracting 197 | numerical and categorical features from the data that we obtained by analyzing 198 | each page (static and dynamic view). To this end, we express the following 199 | task by means of a transform function. 200 | 201 | For example, here's an excerpt of the features that we compute from 202 | each of our documents: 203 | 204 | ``` 205 | { 206 | "n_urls": 135, 207 | "n_object": 0, 208 | "n_embed": 0, 209 | "n_telephone": 8, 210 | "n_email": 1, 211 | "n_img": 18, 212 | "n_link": 0, 213 | "n_sound_urls": 0, 214 | "n_anchor": 60, 215 | "n_meta": 4, 216 | "n_resource": 0, 217 | "n_iframe": 0, 218 | "n_script": 34, 219 | "n_hashtag": 0, 220 | "n_style": 9, 221 | "n_twitter": 1, 222 | "avg_color": "#000000", 223 | "frac_letters_in_title": 0.6979166666666666, 224 | "frac_punct_in_title": 0.17708333333333334, 225 | "frac_whitespace_in_title": 0.0625, 226 | "frac_digits_in_title": 0.0625 227 | } 228 | ``` 229 | 230 | ![Feature extraction](i/dpx-features.png?raw=true "Feature extraction") 231 | 232 | At this point we could use any clustering algorithm to find groups. However, 233 | this would not be the most efficient solution, at least in general, because 234 | we would need to compare all pairs of our collection of 13 million records, 235 | calculate "some" form of distance (e.g., ssdeep), and then start forming groups by 236 | means of such distance. 237 | 238 | We take a different approach, which is approximate but way faster. As a result, 239 | we're able to cluster our entire collection of 13 million documents in less than a 240 | minute, and we dynamically configure the clustering features on demand (i.e., at 241 | each clustering execution). 
242 | 243 | Intuitively, we would like to be able to find logical groups of web pages that 244 | share "similar" feature values. Instead of approaching this problem as 245 | a distance-metric calculation task, we use the concept of "feature binning" or 246 | "feature quantization". In simple words, we want all the web pages with a "low 247 | number of URLs" to fall in the same cluster. At the same time, we want all the 248 | web pages with a "high number of URLs" to fall in another cluster. And so on, 249 | for all the features. In other words, the clustering task becomes a "group-by" 250 | task, which is natively and well supported by all database engines. In the case of 251 | Elastic, it's efficiently implemented in a map-reduce fashion, effectively distributing 252 | the workload across all the available nodes. 253 | 254 | The missing piece is how we obtain these "low, medium, high" values from the 255 | original, numerical feature values. For instance, is "42 URLs" considered low, 256 | high, or medium? To this end, we look at the statistical distribution of each feature, 257 | and divide its space into intervals according to estimated percentiles. For instance, 258 | the values below the 25% percentile are considered low, those between 25-50% percentile 259 | are medium, and those between 50% and 75% are high. Those above the 75% percentile 260 | are outliers. This is just an example, of course. 261 | 262 | ![Feature quantization and clustering](i/dpx-binning.png?raw=true "Feature quantization and clustering") 263 | 264 | It turns out that Elasticsearch already supports the calculation of a few 265 | statistical metrics, among which we happily found the percentiles. So all we need 266 | to do is asking Elastic to compute the percentiles of each feature -- done in a matter 267 | of few seconds. Then, we store these percentiles 268 | and use them as thresholds to quantize the numerical features. 
For example, here's an excerpt of four equally-spaced percentiles (from 1% to 99%) that we obtained from our collection:
356 | 357 | Redistribution and use in source and binary forms, with or without 358 | modification, are permitted provided that the following conditions are met: 359 | 360 | 1. Redistributions of source code must retain the above copyright notice, 361 | this list of conditions and the following disclaimer. 362 | 2. Redistributions in binary form must reproduce the above copyright notice, 363 | this list of conditions and the following disclaimer in the documentation 364 | and/or other materials provided with the distribution. 365 | 366 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 367 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 368 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 369 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 370 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 371 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 372 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 373 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 374 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 375 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 376 | POSSIBILITY OF SUCH DAMAGE. 377 | 378 | The views and conclusions contained in the software and documentation are 379 | those of the authors and should not be interpreted as representing official 380 | policies, either expressed or implied, of the FreeBSD Project. 381 | ``` 382 | --------------------------------------------------------------------------------