├── images
│   ├── webui.png
│   ├── yara-output.png
│   ├── architecture.png
│   ├── clamav-output.png
│   ├── external_intel.png
│   ├── misp-overview.png
│   ├── payload-output.png
│   ├── sqlite-storage.png
│   ├── console-storage.png
│   └── architecture-prev.png
├── crawler
│   ├── utils
│   │   ├── subcrawl.db
│   │   ├── ansi_colors.py
│   │   ├── __init__.py
│   │   ├── logos
│   │   │   ├── subcrawl-2.txt
│   │   │   ├── subcrawl-3.txt
│   │   │   └── subcrawl-1.txt
│   │   ├── setup_kafka_topic.py
│   │   ├── banner.py
│   │   ├── helpers.py
│   │   ├── sqlite_model.py
│   │   └── logger.py
│   ├── processing
│   │   ├── minisdhash
│   │   │   ├── sdhash
│   │   │   ├── libsdbf.a
│   │   │   └── sdbf_class.py
│   │   ├── default_processing.py
│   │   ├── example_processing.py
│   │   ├── __init__.py
│   │   ├── tlsh_processing.py
│   │   ├── jarm_processing.py
│   │   ├── yara_processing.py
│   │   ├── clamav_processing.py
│   │   ├── sdhash_processing.py
│   │   ├── payload_processing.py
│   │   └── external_intel_processing.py
│   ├── run.sh
│   ├── storage
│   │   ├── __init__.py
│   │   ├── default_storage.py
│   │   ├── example_storage.py
│   │   ├── console_storage.py
│   │   ├── sqlite_storage.py
│   │   ├── elastic_storage.py
│   │   ├── misp_storage.py
│   │   └── kibana-dashboard
│   │       └── overview-dashboard.ndjson
│   ├── yara-rules
│   │   ├── open_webshell.yar
│   │   ├── php_file_manager_login.yar
│   │   ├── erbium_discord_panel_login.yar
│   │   ├── default_page_xampp_windows.yar
│   │   ├── default_page_apache.yar
│   │   ├── outlook_phish.yar
│   │   ├── titan_stealer_panel_login.yar
│   │   ├── royalmail_phish.yar
│   │   ├── sharepoint_online_phish.yar
│   │   ├── chase_login_spox_phish.yar
│   │   ├── collector_stealer_panel_login.yar
│   │   ├── bapr_banking_phish.yar
│   │   ├── hex-encoded-pe-file.yar
│   │   ├── microsoft_phish.yar
│   │   ├── aurora_stealer_panel_login.yar
│   │   ├── modernloader_panel_login.yar
│   │   ├── office365_review_phish.yar
│   │   ├── webpanel_origin_login.yar
│   │   ├── base64_pe.yar
│   │   ├── amadey_panel_login.yar
│   │   ├── office365_verify_pdf_phish.yar
│   │   ├── wellsfargo_phish.yar
│   │   ├── bankamerica_phish.yar
│   │   ├── link_sharing_onedrive.yar
│   │   ├── pony_panel_login.yar
│   │   ├── attachments_onedrive_phish.yar
│   │   ├── microsoft_login_phish.yar
│   │   ├── unam_webpanel_login.yar
│   │   ├── sharepoint_dropbox_online_phish.yar
│   │   ├── standard_bank_phish.yar
│   │   ├── onedrive_business_phish.yar
│   │   ├── panels.yar
│   │   ├── h3k_tinyfilemanager_login.yar
│   │   ├── grandamisha_panel_login.yar
│   │   ├── wallet_connect_phish.yar
│   │   ├── obfuscated_script.yar
│   │   ├── acridrain_stealer_panel_login.yar
│   │   ├── mars_panel_login.yar
│   │   ├── huntington_phish.yar
│   │   ├── mana5_panel_login.yar
│   │   ├── base64_shellcode_dos_header_pe.yar
│   │   ├── html_webshell_login.yar
│   │   ├── php_webshell_backend.yar
│   │   ├── agenttesla_webpanel_login.yar
│   │   ├── js_webshell_tracking_script.yar
│   │   └── combined-rules.yar
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── docker-compose.yml
│   ├── app
│   │   ├── templates
│   │   │   ├── domains.html
│   │   │   ├── urls.html
│   │   │   ├── search_results.html
│   │   │   ├── dashboard.html
│   │   │   ├── url_details.html
│   │   │   ├── domain_details.html
│   │   │   └── base.html
│   │   └── main.py
│   ├── service.py
│   ├── input
│   │   ├── phishtank.py
│   │   └── urlhaus.py
│   ├── misp-objects
│   │   └── opendir-url
│   │       └── definition.json
│   ├── supervisor
│   │   └── supervisord.conf
│   ├── config.yml
│   └── subcrawl.py
├── conferences
│   └── 2021
│       └── blackhat_us_arsenal
│           └── BH-Arsenal-2021.pdf
├── License.md
├── .gitignore
└── README.md
/images/webui.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/webui.png
--------------------------------------------------------------------------------
/images/yara-output.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/yara-output.png -------------------------------------------------------------------------------- /crawler/utils/subcrawl.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/utils/subcrawl.db -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/architecture.png -------------------------------------------------------------------------------- /images/clamav-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/clamav-output.png -------------------------------------------------------------------------------- /images/external_intel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/external_intel.png -------------------------------------------------------------------------------- /images/misp-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/misp-overview.png -------------------------------------------------------------------------------- /images/payload-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/payload-output.png -------------------------------------------------------------------------------- /images/sqlite-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/sqlite-storage.png -------------------------------------------------------------------------------- /images/console-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/console-storage.png -------------------------------------------------------------------------------- /images/architecture-prev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/architecture-prev.png -------------------------------------------------------------------------------- /crawler/processing/minisdhash/sdhash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/processing/minisdhash/sdhash -------------------------------------------------------------------------------- /crawler/processing/minisdhash/libsdbf.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/processing/minisdhash/libsdbf.a -------------------------------------------------------------------------------- /crawler/run.sh: -------------------------------------------------------------------------------- 1 | service clamav-daemon start 2 | service supervisor start 3 | gunicorn app.main:app -b 0.0.0.0:8000 --reload --workers 4 -------------------------------------------------------------------------------- 
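run.sh above starts the three services the container relies on: the ClamAV daemon used by clamav_processing.py, supervisord (which launches the URLhaus/PhishTank input feeds and the crawler service), and a gunicorn server exposing the web UI on port 8000. A minimal sketch of how the running stack might be sanity-checked; the URL, timeout and socket default below are assumptions rather than values taken from the project configuration:
# Illustrative check only; not part of the repository.
import clamd
import requests

def check_services(web_url="http://127.0.0.1:8000/"):
    resp = requests.get(web_url, timeout=5)   # gunicorn-served web UI started by run.sh
    print("web UI:", resp.status_code)
    cd = clamd.ClamdUnixSocket()              # clamav-daemon's default UNIX socket
    print("clamd:", cd.ping())                # 'PONG' when the daemon is reachable

if __name__ == "__main__":
    check_services()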
/conferences/2021/blackhat_us_arsenal/BH-Arsenal-2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/conferences/2021/blackhat_us_arsenal/BH-Arsenal-2021.pdf -------------------------------------------------------------------------------- /crawler/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .console_storage import ConsoleStorage 3 | from .misp_storage import MISPStorage 4 | from .sqlite_storage import SqliteStorage 5 | from .elastic_storage import ElasticStorage 6 | -------------------------------------------------------------------------------- /crawler/utils/ansi_colors.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | 3 | class SubCrawlColors: 4 | BLUE = '\033[34m' 5 | GREEN = '\033[32m' 6 | PURPLE = '\033[35m' 7 | YELLOW = '\033[33m' 8 | RED = '\033[31m' 9 | CYAN = '\033[36m' 10 | RESET = '\033[0m' 11 | CLS = '\033[2J' 12 | -------------------------------------------------------------------------------- /crawler/yara-rules/open_webshell.yar: -------------------------------------------------------------------------------- 1 | rule open_webshell 2 | { 3 | meta: 4 | description = "Open Webshell Detection" 5 | author = "patrick.schlapfer@hp.com" 6 | date = "2021-04-19" 7 | 8 | strings: 9 | $a = "file manager" 10 | $b = "uname" 11 | 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/php_file_manager_login.yar: -------------------------------------------------------------------------------- 1 | rule php_file_manager_login { 2 | 3 | meta: 4 | date = "2022-11-29" 5 | 6 | strings: 7 | $s1 = "File Manager" 8 | $s2 = "content=\"Web based File Manager" 9 | $s3 = "class=\"form-signin\"" 10 | $s4 = "File Manager</h1>" 11 | 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/processing/default_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | 3 | class DefaultProcessing: 4 | 5 | cfg = None 6 | logger = None 7 | 8 | def __init__(self, config, logger): 9 | self.cfg = config 10 | self.logger = logger 11 | 12 | def process(self, url, resp): 13 | pass 14 | -------------------------------------------------------------------------------- /crawler/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
2 | from utils.logger import SubCrawlLogger, SubCrawlLoggerLevels 3 | from utils.banner import SubCrawlBanner 4 | from utils.sqlite_model import * 5 | from utils.setup_kafka_topic import check_topic 6 | from utils.ansi_colors import SubCrawlColors 7 | from utils.helpers import SubCrawlHelpers 8 | -------------------------------------------------------------------------------- /crawler/yara-rules/erbium_discord_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule erbium_discord_panel_login { 2 | meta: 3 | date = "2022-11-28" 4 | 5 | strings: 6 | $x1 = "https://erbium_support.t.me" 7 | $x2 = "<title>Discord" 8 | $s1 = "id=\"username\"" 9 | $s2 = "id=\"password\"" 10 | 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/default_page_xampp_windows.yar: -------------------------------------------------------------------------------- 1 | rule default_page_xampp_windows 2 | { 3 | meta: 4 | description = "Default page for XAMPP" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $title = "Welcome to XAMPP" nocase 9 | $platform = "welcome to xampp for windows" nocase 10 | condition: 11 | all of them 12 | } 13 | -------------------------------------------------------------------------------- /crawler/yara-rules/default_page_apache.yar: -------------------------------------------------------------------------------- 1 | rule default_page_apache 2 | { 3 | meta: 4 | description = "Default page for Apache2" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = /apache2.{,10}default page/ nocase 9 | $apache = "apache2" nocase 10 | $default = "default page" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/outlook_phish.yar: -------------------------------------------------------------------------------- 1 | rule outlook_phish 2 | { 3 | meta: 4 | description = "Outlook login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-06-29" 7 | strings: 8 | $form = "class=\"boxtext\"" nocase 9 | $title = "microsoft | login" nocase 10 | $pass = "id=\"pr\"" 11 | $header = "OUTLOOK" 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/titan_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule titan_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | 5 | strings: 6 | $s1 = "Titan Stealer" nocase 7 | $s2 = "class=\"auth__form\"" nocase 8 | $s3 = "Sign in" nocase 9 | $s4 = "id=\"floatingPassword\"" nocase 10 | 11 | condition: 12 | all of them 13 | } -------------------------------------------------------------------------------- /crawler/yara-rules/royalmail_phish.yar: -------------------------------------------------------------------------------- 1 | rule royal_mail_phish 2 | { 3 | meta: 4 | description = "Royal Mail phish" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-18" 7 | strings: 8 | $title = "royal mail group ltd" nocase 9 | $form_action = "action=\"login.php\"" 10 | $pass = "name=\"pass\"" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/sharepoint_online_phish.yar:
-------------------------------------------------------------------------------- 1 | rule sharepoint_online_phish 2 | { 3 | meta: 4 | description = "Sharepoint Online Multiple Logins" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = "share point online" nocase 9 | $user = "id=\"email\"" 10 | $post_url = "next.php" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/chase_login_spox_phish.yar: -------------------------------------------------------------------------------- 1 | rule chase_login_spox_phish 2 | { 3 | meta: 4 | description = "Chase Bank Login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $title = "Online enrollement" 9 | $form = "action=\"regex.php\"" 10 | $user = "name=\"id\"" 11 | $pass = "name=\"password\"" 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/collector_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule collector_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | 5 | strings: 6 | $s1 = "login" nocase 7 | $s2 = "Collector Stealer panel" nocase 8 | $s3 = "action=\"/index.php?auth\"" nocase 9 | $s4 = "id=\"sendlogin\"" nocase 10 | 11 | condition: 12 | all of them 13 | } -------------------------------------------------------------------------------- /crawler/requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2 == 3.0.1 2 | PyYAML == 5.4.1 3 | beautifulsoup4 == 4.9.3 4 | clamd == 1.0.2 5 | falcon == 2.0.0 6 | mergedeep == 1.3.4 7 | peewee == 3.14.0 8 | py_tlsh == 4.5.0 9 | pyjarm == 0.0.5 10 | pymisp == 2.4.140 11 | python_magic == 0.4.22 12 | requests == 2.25.0 13 | timeloop == 1.0.2 14 | yara_python == 4.0.5 15 | gunicorn == 20.0.4 16 | kafka-python == 2.0.2 17 | elasticsearch == 7.15.1 18 | -------------------------------------------------------------------------------- /crawler/storage/default_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
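# DefaultStorage (below) is the do-nothing base class that the concrete storage modules
# in this package (console, SQLite, Elasticsearch, MISP) extend.  load_scraped_domains()
# reports which domains a backend has already stored, presumably so the engine can skip
# re-crawling them, and store_result() receives the collected crawl and processing results
# to persist.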
2 | 3 | class DefaultStorage: 4 | 5 | cfg = None 6 | logger = None 7 | 8 | def __init__(self, config, logger): 9 | self.cfg = config 10 | self.logger = logger 11 | 12 | def load_scraped_domains(self): 13 | return [] 14 | 15 | def store_result(self, result_data): 16 | return True 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/bapr_banking_phish.yar: -------------------------------------------------------------------------------- 1 | rule bapr_phish_phish 2 | { 3 | meta: 4 | description = "BAPR Online banking phishing page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-09" 7 | strings: 8 | $title = "personal internet banking" nocase 9 | $form = "name=\"login.loginform\"" nocase 10 | $pass = "id=\"passcrypt\"" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/hex-encoded-pe-file.yar: -------------------------------------------------------------------------------- 1 | rule hexencoded_pe_file { 2 | meta: 3 | desc = "Detects hex-encoded pe file" 4 | author = "@jstrosch" 5 | date = "2022 Oct 24" 6 | 7 | strings: 8 | $mz = { 34 44 35 41 } //4D 5A -> MZ 9 | $pe = { 35 30 34 35 30 30 30 30 } // 50 45 00 00 -> PE00 10 | 11 | condition: 12 | $mz at 0 and $pe in (@mz[1]..0x200) 13 | } 14 | -------------------------------------------------------------------------------- /crawler/processing/example_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .default_processing import DefaultProcessing 3 | 4 | 5 | class ExampleProcessing(DefaultProcessing): 6 | 7 | cfg = None 8 | logger = None 9 | 10 | def __init__(self, config, logger): 11 | self.cfg = config 12 | self.logger = logger 13 | 14 | def process(self, url, resp): 15 | pass 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/microsoft_phish.yar: -------------------------------------------------------------------------------- 1 | rule microsoft_phish 2 | { 3 | meta: 4 | description = "Microsoft login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-01" 7 | strings: 8 | $form = "office/login.php" nocase 9 | $title = "sign in to your microsoft account" nocase 10 | $user = "id=\"user\"" 11 | $redirect = "pass.php" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/aurora_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule aurora_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | author = "@jstrosch" 5 | 6 | strings: 7 | $s1 = "Auth" nocase 8 | $s2 = "AURORA STEALER" nocase 9 | $s3 = "placeholder=\"YOU PASSWORD\"" nocase 10 | $s4 = "id=\"email-2ee9\"" nocase 11 | 12 | condition: 13 | all of them 14 | } -------------------------------------------------------------------------------- /crawler/yara-rules/modernloader_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule modernloader_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | author = "@jstrosch" 5 | 6 | strings: 7 | $s1 = "Panel - Login" nocase 8 | $s2 = "class=\"login__form\"" nocase 9 | $s3 = "url = \"control.php\"" nocase 10 | $s4 = "Welcome" nocase 11 | 12 | condition: 13 | all of them 14 | } 
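Both hex-encoded-pe-file.yar above and base64_pe.yar further down target executables that have been re-encoded as text rather than raw PE bytes: a hex dump of a PE begins with the characters "4D5A" (whose ASCII bytes are 34 44 35 41, the $mz pattern), and a base64-encoded PE begins with "TVqQ". A short standard-library illustration of why those strings work (not part of the repository):
import base64

dos_header = b"MZ\x90\x00"                         # typical first bytes of a PE file
print(dos_header.hex().upper())                    # 4D5A9000 -> text matched by hex-encoded-pe-file.yar
print("4D5A".encode("ascii").hex(" "))             # 34 44 35 41 -> the rule's $mz byte sequence
print(base64.b64encode(dos_header).decode())       # TVqQAA== -> prefix matched by base64_pe.yar
print(base64.b64encode(b"This program").decode())  # VGhpcyBwcm9ncmFt -> base64_pe.yar's $this_program string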
-------------------------------------------------------------------------------- /crawler/yara-rules/office365_review_phish.yar: -------------------------------------------------------------------------------- 1 | rule office365_review__phish 2 | { 3 | meta: 4 | description = "Office 365 Review Document phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-07-12" 7 | strings: 8 | $form = "post.php" nocase 9 | $title = "Office 365" 10 | $user = "id=\"email\"" 11 | $placeholder = "Office 365 Email" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/webpanel_origin_login.yar: -------------------------------------------------------------------------------- 1 | rule webpanel_origin_login 2 | { 3 | meta: 4 | description = "Origin (AgentTesla) Webpanel" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-21" 7 | strings: 8 | $title = "Login" 9 | $form = "action=\"login.php\"" 10 | $signin = "box-title m-b-20\">Sign In" 11 | $style = "margin: auto;margin-top:100px;}" 12 | condition: 13 | all of them 14 | } -------------------------------------------------------------------------------- /crawler/processing/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .payload_processing import PayloadProcessing 3 | from .external_intel_processing import ExternalIntelProcessing 4 | from .yara_processing import YARAProcessing 5 | from .clamav_processing import ClamAVProcessing 6 | from .jarm_processing import JARMProcessing 7 | from .tlsh_processing import TLSHProcessing 8 | #from .sdhash_processing import SDhashProcessing 9 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-2.txt: -------------------------------------------------------------------------------- 1 | ________ ______ _________ ______ 2 | __ ___/____ _____ /_ __ ____/______________ ____ _____ / 3 | _____ \ _ / / /__ __ \_ / __ ___/_ __ `/__ | /| / /__ / 4 | ____/ / / /_/ / _ /_/ // /___ _ / / /_/ / __ |/ |/ / _ / 5 | /____/ \__,_/ /_.___/ \____/ /_/ \__,_/ ____/|__/ /_/ 6 | -------------------------------------------------------------------------------- /crawler/yara-rules/base64_pe.yar: -------------------------------------------------------------------------------- 1 | rule base64_pe 2 | { 3 | meta: 4 | description = "Detects base64 encoded PE files, often used with Powershell." 
5 | author = "josh@m9cyber.com" 6 | date = "2022-02-25" 7 | strings: 8 | $mz_header = /(TVqQ|QqVT)/ 9 | $this_program = /(VGhpcyBwcm9ncmFt|tFmcn9mcwBycphGV)/ 10 | $null_bytes = "AAAAA" 11 | condition: 12 | $mz_header at 0 and $this_program and #null_bytes > 2 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/amadey_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule amadey_panel_login 2 | { 3 | meta: 4 | description = "Amadey panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-08" 7 | strings: 8 | $title = "authorization" nocase 9 | $form_action = "action=\"Login.php\"" 10 | $bg_img = "images\\bg_1.png" nocase 11 | $pass = "name=\"password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/office365_verify_pdf_phish.yar: -------------------------------------------------------------------------------- 1 | rule office365_verify_pdf_phish 2 | { 3 | meta: 4 | description = "Office365/OneDrive Verify Yourself PDF phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-07-25" 7 | strings: 8 | $title = "Files - OneDrive" 9 | $form = "action=\"link.php\"" 10 | $user = "id=\"txtTOAAEmail\"" 11 | $verify = "Verify Yourself" 12 | 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/wellsfargo_phish.yar: -------------------------------------------------------------------------------- 1 | rule wells_fargo_phish 2 | { 3 | meta: 4 | description = "Wells Fargo Phish" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-18" 7 | strings: 8 | $title = "Wells Fargo" nocase 9 | $form_action = "action=\"./parse.php\"" 10 | $user = "name=\"j_username\"" nocase 11 | $pass = "name=\"j_password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-3.txt: -------------------------------------------------------------------------------- 1 | _________ ___. 
_________ .__ 2 | / _____/ __ __ \_ |__ \_ ___ \ _______ _____ __ _ __| | 3 | \_____ \ | | \ | __ \ / \ \/ \_ __ \\__ \ \ \/ \/ /| | 4 | / \| | / | \_\ \\ \____ | | \/ / __ \_ \ / | |__ 5 | /_______ /|____/ |___ / \______ / |__| (____ / \/\_/ |____/ 6 | \/ \/ \/ \/ -------------------------------------------------------------------------------- /crawler/yara-rules/bankamerica_phish.yar: -------------------------------------------------------------------------------- 1 | rule bank_america_phish 2 | { 3 | meta: 4 | description = "Bank of America Phishing" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-19" 7 | strings: 8 | $title = "Bank of America -" nocase 9 | $form_action = "action=\"login.php\"" 10 | $id = "name=\"onlineId1\"" nocase 11 | $pass = "name=\"passcode1\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/link_sharing_onedrive.yar: -------------------------------------------------------------------------------- 1 | rule link_sharing_onedrive 2 | { 3 | meta: 4 | description = "OneDrive Link Sharing Phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-17" 7 | strings: 8 | $modified = "new injection" 9 | $title = /link.{0,10}validation<\/title>/ nocase 10 | $form = "bmV4dC5waHA=" //next.php 11 | $user = "id=\"ai\"" 12 | $pass = "id=\"pr\"" 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/pony_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule pony_panel_login 2 | { 3 | meta: 4 | description = "Pony stealer panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-03" 7 | strings: 8 | $title = "authorization" nocase 9 | $form_action = "action=\"/panel/admin.php\"" nocase 10 | $lock = "lock_open.png" nocase 11 | $pass = "name=\"password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/attachments_onedrive_phish.yar: -------------------------------------------------------------------------------- 1 | rule attachments_onedrive_phish 2 | { 3 | meta: 4 | description = "OneDrive Attachments Phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = "attachments - onedrive" nocase 9 | $post_out = "loginout.php" nocase 10 | $post_365 = "login365.php" nocase 11 | $class = "class=\"login-form\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/microsoft_login_phish.yar: -------------------------------------------------------------------------------- 1 | rule microsoft_login_phish 2 | { 3 | meta: 4 | description = "Microsoft login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-10-19" 7 | strings: 8 | $form = "
Log-In" 12 | $g = "Username" 13 | $h = "Password" 14 | $ih = "Log In" 15 | condition: 16 | all of them 17 | } 18 | -------------------------------------------------------------------------------- /crawler/yara-rules/h3k_tinyfilemanager_login.yar: -------------------------------------------------------------------------------- 1 | rule h3k_tinyfilemanager_login { 2 | meta: 3 | description = "H3K Tiny File Manager login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2023-01-15" 6 | 7 | strings: 8 | $s1 = "Tiny File Manager" nocase 9 | $s2 = "form-signin" nocase 10 | $s3 = "fm_usr" nocase 11 | $s4 = "fm_pwd" nocase 12 | $s5 = ".fm-login-page" nocase 13 | 14 | condition: 15 | all of them 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/grandamisha_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule grandamisha_panel_login { 2 | meta: 3 | description = "Granda Misha panel login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2022-12-29" 6 | 7 | strings: 8 | $r1 = "misha" nocase 9 | $r2 = "granda misha" nocase 10 | $s1 = "placeholdler=\"Jabber ID\"" nocase 11 | $s2 = "name=\"password\"" nocase 12 | $s3 = "users_signin" nocase 13 | 14 | condition: 15 | $r1 and $r2 and 1 of ($s*) 16 | } -------------------------------------------------------------------------------- /crawler/yara-rules/wallet_connect_phish.yar: -------------------------------------------------------------------------------- 1 | rule wallet_connect_phish 2 | { 3 | meta: 4 | description = "Wallet Connect phishing page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-08" 7 | strings: 8 | $title = "intergations protocol" nocase 9 | $form_action = "action=\"#\"" nocase 10 | $hidden = "value=\"AAVE\"" nocase 11 | $phrase = "name=\"phrase\"" nocase 12 | $private = "name=\"pkey\"" nocase 13 | $json = "name=\"kjson\"" nocase 14 | condition: 15 | all of them 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/obfuscated_script.yar: -------------------------------------------------------------------------------- 1 | rule obfuscated_script 2 | { 3 | meta: 4 | description = "Looks for common functions and patterns to deobfuscate scripts" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $eval = "eval(" nocase 9 | $hex = "hex(" nocase 10 | $split = "split(" nocase 11 | $exec = "execute" nocase 12 | $char ="char(" nocase 13 | $from_hex = /([\d]{2,3}[^\d]{1,10}){200,}/ 14 | condition: 15 | ($hex or $split or $char or $from_hex) and ($eval or $exec) 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/acridrain_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule acridrain_stealer_panel_login { 2 | meta: 3 | description = "AcridRain Stealer panel login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2022-12-29" 6 | 7 | strings: 8 | $r1 = "Acrid -" nocase 9 | $r2 = "AcridRain Stealer" nocase 10 | $s1 = "/Account/Login" nocase 11 | $s2 = "name=\"Email\"" nocase 12 | $s3 = "name=\"Password\"" nocase 13 | 14 | condition: 15 | $r1 and $r2 and 1 of ($s*) 16 | } -------------------------------------------------------------------------------- /crawler/yara-rules/mars_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule mars_panel_login 2 | 
{ 3 | meta: 4 | description = "Mars stealer panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-03-28" 7 | resources = "https://isc.sans.edu/diary/Arkei+Variants%3A+From+Vidar+to+Mars+Stealer/28468" 8 | strings: 9 | $title = "dashboard" nocase 10 | $form_action = "action=\"login.php\"" nocase 11 | $login_btn = "name=\"do_login\"" nocase 12 | $pass = "name=\"password\"" nocase 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/utils/setup_kafka_topic.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from kafka.admin import KafkaAdminClient, NewTopic 3 | 4 | 5 | def check_topic(): 6 | admin_client = KafkaAdminClient( 7 | bootstrap_servers="kafka:9092", 8 | client_id='test' 9 | ) 10 | if "urls" not in admin_client.list_topics(): 11 | topic_list = [] 12 | topic_list.append(NewTopic(name="urls", num_partitions=10, replication_factor=1)) 13 | admin_client.create_topics(new_topics=topic_list, validate_only=False) 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/huntington_phish.yar: -------------------------------------------------------------------------------- 1 | rule huntington_bank_phish 2 | { 3 | meta: 4 | description = "Huntington Bank Phishing Kit" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-17" 7 | strings: 8 | $banner = "hgn.png" 9 | $title = "Huntington" 10 | $title_html = "Huntington" 11 | $form = "action=need1.php" 12 | $user = "name=\"ud\"" 13 | $pass = "name=\"pd\"" 14 | condition: 15 | ($title or $title_html) and $banner and $form and $user and $pass 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/mana5_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule mana5_panel_login 2 | { 3 | meta: 4 | description = "Mana Tools Panel 5.0" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-17" 7 | strings: 8 | $title = "login" nocase 9 | $banner = "lone wolf version 5.0" nocase 10 | $back_img = "background-image: url('1.jpg')" 11 | $html_title = "

Log-In

" 12 | $user = "name=\"username\"" nocase 13 | $pass = "name=\"password\"" nocase 14 | $button = "Log In" 15 | condition: 16 | all of them 17 | } 18 | -------------------------------------------------------------------------------- /crawler/storage/example_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import json 3 | import pprint 4 | from re import subn 5 | 6 | from utils import SubCrawlColors 7 | from .default_storage import DefaultStorage 8 | 9 | 10 | class ExampleStorage(DefaultStorage): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def load_scraped_domains(self): 20 | return [] 21 | 22 | def store_result(self, result_data): 23 | pass 24 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-1.txt: -------------------------------------------------------------------------------- 1 | ******** ** ****** ** 2 | **////// /** **////** /** 3 | /** ** **/** ** // ****** ****** *** ** /** 4 | /*********/** /**/****** /** //**//* //////** //** * /** /** 5 | ////////**/** /**/**///**/** /** / ******* /** ***/** /** 6 | /**/** /**/** /**//** ** /** **////** /****/**** /** 7 | ******** //******/****** //****** /*** //******** ***/ ///** *** 8 | //////// ////// ///// ////// /// //////// /// /// /// -------------------------------------------------------------------------------- /crawler/yara-rules/base64_shellcode_dos_header_pe.yar: -------------------------------------------------------------------------------- 1 | rule base64_shellcode_dos_header_pe 2 | { 3 | meta: 4 | description = "Detects base64 encoded PE files, often used with Powershell, that contains magic bytes that allow for the image_dos_header to contain shellcode.." 5 | author = "josh@m9cyber.com" 6 | date = "2023-01-23" 7 | strings: 8 | $mz_header = /(TVpFUu|uUFpVT|TVpSRQ|QRSpTV|TVpBUg|gUBpVT)/ 9 | $this_program = /(VGhpcyBwcm9ncmFt|tFmcn9mcwBycphGV)/ 10 | $null_bytes = "AAAAA" 11 | condition: 12 | $mz_header at 0 and $this_program and #null_bytes > 2 13 | } 14 | -------------------------------------------------------------------------------- /crawler/utils/banner.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
2 | import os 3 | import random 4 | import sys 5 | 6 | 7 | class SubCrawlBanner(): 8 | 9 | logo_path = "" 10 | tag_line = "" 11 | logos = [] 12 | 13 | def __init__(self, logopath, tagline): 14 | self.logo_path = logopath 15 | self.tag_line = tagline 16 | for logo in os.listdir(self.logo_path): 17 | self.logos.append(os.path.join(self.logo_path, logo)) 18 | 19 | def print_banner(self): 20 | logo = self.logos[random.randint(0, len(self.logos) - 1)] 21 | with open(logo) as logodata: 22 | print("\n" + logodata.read()) 23 | print(self.tag_line + "\n") 24 | -------------------------------------------------------------------------------- /crawler/yara-rules/html_webshell_login.yar: -------------------------------------------------------------------------------- 1 | rule protected_webshell 2 | { 3 | meta: 4 | description = "Protected Webshell Login" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "PHP" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = /action\s*=\s*\"\"/ 12 | $a2 = /method\s*=\s*\"post\"/ 13 | $a3 = /type\s*=\s*\"submit\"/ 14 | $a4 = /name\s*=\s*\"[a-z]{0,}_{0,}[a-z]{2,}\"/ 15 | 16 | $b1 = /type\s*=\s*\"input\"/ 17 | $b2 = /type\s*=\s*\"text\"/ 18 | 19 | $c1 = /value\s*=\s*\"(\s*>\s*){1,2}\"/ 20 | $c2 = /value\s*=\s*\"(\s?>\s?){1,2}\"/ 21 | 22 | condition: 23 | all of ($a*) and any of ($b*) and any of ($c*) and filesize < 1000 24 | } 25 | -------------------------------------------------------------------------------- /crawler/yara-rules/php_webshell_backend.yar: -------------------------------------------------------------------------------- 1 | rule php_webshell_backend : notifier 2 | { 3 | meta: 4 | description = "PHP webshell backend used by the attacker" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "PHP" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = "__construct" 12 | $a2 = "ord" 13 | $a3 = "chr" 14 | $a4 = "class" 15 | $a5 = "strpos" 16 | $a6 = "strlen" 17 | 18 | $b = "array" 19 | $c = "function" 20 | $d = "var" 21 | 22 | $e = /\$\w+\s*\=\s*(\$\w+->\w+\[\d+\]\.?)+;/ 23 | $f = /var\s*\$\w+\s*\=\s*['\"][\w\/\+\=\n\t]+/ 24 | 25 | condition: 26 | all of ($a*) and #b >= 5 and #c == 9 and #d >= 9 and #e >= 5 and $f and filesize < 1MB 27 | } 28 | -------------------------------------------------------------------------------- /crawler/processing/tlsh_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
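# TLSHProcessing (below) computes a TLSH locality-sensitive hash for every response of at
# least 50 bytes (the module returns an empty result for anything shorter).  Unlike SHA-256,
# two near-identical pages produce similar digests, so stored hashes can later be compared
# (for example with tlsh.diff()) to cluster re-used phishing kits and panel pages.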
2 | from .default_processing import DefaultProcessing 3 | import tlsh 4 | 5 | 6 | class TLSHProcessing(DefaultProcessing): 7 | 8 | cfg = None 9 | logger = None 10 | 11 | def __init__(self, config, logger): 12 | self.cfg = config 13 | self.logger = logger 14 | 15 | def process(self, url, content): 16 | tlsh_result = {} 17 | if len(content) < 50: 18 | return {} 19 | 20 | try: 21 | tlsh_result["tlsh"] = tlsh.hash(content) 22 | tlsh_result["url"] = url 23 | except Exception as e: 24 | self.logger.ERROR('[TLSH] ' + str(e)) 25 | pass 26 | return tlsh_result 27 | -------------------------------------------------------------------------------- /crawler/yara-rules/agenttesla_webpanel_login.yar: -------------------------------------------------------------------------------- 1 | rule agenttesla_panel_login 2 | { 3 | meta: 4 | description = "AgentTesla panel login page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-10" 7 | strings: 8 | $title = "web panel | login" nocase 9 | $form_action = "action=\"login.php\"" nocase 10 | $pass = "name=\"password\"" nocase 11 | $user = "name=\"username\"" nocase 12 | 13 | condition: 14 | all of them 15 | } 16 | 17 | rule agenttesla_panel_login_2 18 | { 19 | meta: 20 | description = "Origin (AgentTesla) Webpanel" 21 | author = "josh@m9cyber.com" 22 | date = "2022-02-21" 23 | strings: 24 | $title = "Login" 25 | $form = "action=\"login.php\"" 26 | $signin = "box-title m-b-20\">Sign In" 27 | $style = "margin: auto;margin-top:100px;}" 28 | condition: 29 | all of them 30 | } 31 | -------------------------------------------------------------------------------- /crawler/processing/jarm_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from urllib.parse import urlparse 3 | from jarm.scanner.scanner import Scanner 4 | from .default_processing import DefaultProcessing 5 | import requests 6 | 7 | 8 | class JARMProcessing(DefaultProcessing): 9 | 10 | cfg = None 11 | logger = None 12 | 13 | def __init__(self, config, logger): 14 | self.cfg = config 15 | self.logger = logger 16 | 17 | def process(self, url, resp): 18 | jarm_scan = {} 19 | try: 20 | domain = urlparse(url).netloc 21 | res = requests.get("https://" + domain) # Leads on purpose to an exception if connection is refused 22 | result = Scanner.scan(domain, 443) 23 | jarm_scan["fingerprint"] = result[0] 24 | jarm_scan["domain"] = result[1] 25 | jarm_scan["port"] = result[2] 26 | except Exception: 27 | pass 28 | return jarm_scan 29 | -------------------------------------------------------------------------------- /crawler/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | web: 4 | build: . 
5 | ports: 6 | - "8000:8000" 7 | volumes: 8 | - "/var/log/subcrawl:/var/log/subcrawl:rw" 9 | depends_on: 10 | - "kafka" 11 | 12 | zookeeper: 13 | image: confluentinc/cp-zookeeper:latest 14 | environment: 15 | ZOOKEEPER_CLIENT_PORT: 2181 16 | ZOOKEEPER_TICK_TIME: 2000 17 | expose: 18 | - 2181 19 | 20 | kafka: 21 | image: confluentinc/cp-kafka:latest 22 | depends_on: 23 | - zookeeper 24 | expose: 25 | - 29092 26 | - 9092 27 | environment: 28 | KAFKA_BROKER_ID: 1 29 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 30 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://kafka:29092 31 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 32 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 33 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 34 | -------------------------------------------------------------------------------- /crawler/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | import re 4 | import sys 5 | from urllib.parse import urlparse 6 | 7 | class SubCrawlHelpers: 8 | 9 | def get_sha256(data): 10 | hash_object = hashlib.sha256(data) 11 | return hash_object.hexdigest() 12 | 13 | def save_content(file_name, data): 14 | with open(file_name, "wb") as file: 15 | file.write(data) 16 | 17 | def defang_url(url): 18 | parsed_url = urlparse(url) 19 | last_dot = parsed_url.netloc.rindex('.') 20 | defanged = parsed_url.netloc[0:last_dot] + '[.]' + parsed_url.netloc[last_dot + 1:] 21 | return url.replace(parsed_url.netloc, defanged).replace('http', 'hxxp') 22 | 23 | def get_config(cfg, collection, key): 24 | try: 25 | return cfg[collection][key] 26 | except Exception as e: 27 | sys.exit("[ENGINE] Error loading configuration: " 28 | + collection + " : " + key) 29 | -------------------------------------------------------------------------------- /crawler/yara-rules/js_webshell_tracking_script.yar: -------------------------------------------------------------------------------- 1 | rule js_webshell_tracking_script : notifier 2 | { 3 | meta: 4 | description = "JavaScript which notifies the attacker when the webshell becomes active" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "JavaScript" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = "ndsj===undefined" 12 | $a2 = "ndsw===undefined" 13 | 14 | $b = "function" 15 | 16 | $c = "HttpClient" 17 | 18 | $d1 = "XMLHttpRequest" 19 | $d2 = "Math" 20 | $d3 = "undefined" 21 | 22 | $e1 = "onreadystatechange" 23 | $e2 = "responseText" 24 | $e3 = "random" 25 | $e4 = "ndsx" 26 | $e5 = "GET" 27 | $e6 = "open" 28 | $e7 = "send" 29 | 30 | $f1 = "parseInt" 31 | $f2 = /var\s*\w+\s*\=\s*\[(['\"][\w\.\?\/\:]+['\"][,\]\s]+)+/ 32 | $g = "0x" 33 | 34 | condition: 35 | any of ($a*) and #b > 5 and #c >= 2 and all of ($d*) and (all of ($e*) or (all of ($f*) and #g > 50)) and filesize < 1MB 36 | } 37 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | © Copyright 2021 HP Development Company, L.P. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /crawler/utils/sqlite_model.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | from peewee import * 4 | from playhouse.hybrid import hybrid_property 5 | import datetime 6 | 7 | db = SqliteDatabase('utils/subcrawl.db') 8 | 9 | 10 | class BaseModel(Model): 11 | class Meta: 12 | database = db 13 | 14 | 15 | class Domain(BaseModel): 16 | name = CharField(unique=True) 17 | description = TextField(null=True) 18 | 19 | 20 | class Url(BaseModel): 21 | domain = ForeignKeyField(Domain, backref='urls') 22 | url = CharField() 23 | status_code = IntegerField() 24 | title = CharField(null=True) 25 | sha256 = CharField() 26 | last_check = DateTimeField(default=datetime.datetime.utcnow) 27 | 28 | 29 | class Extension(BaseModel): 30 | key = CharField() 31 | value = TextField(null=True) 32 | url = ForeignKeyField(Url, backref='extensions') 33 | 34 | 35 | class Tag(BaseModel): 36 | tag = CharField(unique=True) 37 | description = TextField(null=True) 38 | 39 | 40 | class DomainTag(BaseModel): 41 | domain = ForeignKeyField(Domain, backref='domaintag') 42 | tag = ForeignKeyField(Tag, backref='domaintag') 43 | -------------------------------------------------------------------------------- /crawler/app/templates/domains.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 | 6 |
<!-- page heading (markup not preserved): -->
Domains
<!-- table header (markup not preserved): # | Name | Description | Urls -->
{% for domain in domains %}
<!-- table row (markup not preserved): {{- domain.id }} | {{- domain.name }} | {{- domain.description }} | {{- domain.urls.count() }} -->
{%- endfor %}
34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /crawler/service.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import io 3 | import logging 4 | import os 5 | import sys 6 | from datetime import datetime, timedelta 7 | 8 | import yaml 9 | from timeloop import Timeloop 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from utils import check_topic 12 | 13 | # check if kafka topic exists and create if needed 14 | check_topic() 15 | tl = Timeloop() 16 | 17 | 18 | @tl.job(interval=timedelta(seconds=10)) 19 | def start_crawling(): 20 | with open("config.yml", "r") as ymlfile: 21 | global_cfg = yaml.safe_load(ymlfile) 22 | 23 | if not global_cfg: 24 | sys.exit(0) 25 | 26 | processing_modules = list() 27 | for processing_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "processing_modules"): 28 | processing_modules.append(processing_module) 29 | 30 | storage_modules = list() 31 | for storage_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "storage_modules"): 32 | storage_modules.append(storage_module) 33 | 34 | try: 35 | os.system("/usr/local/bin/python3 subcrawl.py -k -p " + ",".join(processing_modules) + " -s " + ",".join(storage_modules)) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | tl.start(block=True) 41 | -------------------------------------------------------------------------------- /crawler/app/templates/urls.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 | 6 |
<!-- page heading (markup not preserved): -->
Urls
<!-- table header (markup not preserved): # | Url | Status Code | Hash | Scanned -->
{% for url in urls %}
<!-- table row (markup not preserved): {{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }} -->
{%- endfor %}
36 | {% endblock %} 37 | -------------------------------------------------------------------------------- /crawler/processing/yara_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | 4 | import yara 5 | from utils import SubCrawlColors, SubCrawlHelpers 6 | from .default_processing import DefaultProcessing 7 | 8 | 9 | class YARAProcessing(DefaultProcessing): 10 | 11 | cfg = None 12 | rules = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def process(self, url, content): 20 | if not self.rules: 21 | self.rules = yara.compile(filepath=SubCrawlHelpers.get_config( 22 | self.cfg, "crawler", "yara_rules")) 23 | 24 | yara_matches = {} 25 | http_resp = content.decode("latin-1") 26 | 27 | matches = self.rules.match(data=http_resp) 28 | if len(matches) > 0: 29 | self.logger.info(SubCrawlColors.CYAN + "[YARA] Matches - " + 30 | ' '.join(map(str, matches)) + 31 | " (" + url + " )" + SubCrawlColors.RESET) 32 | yara_matches["url"] = url 33 | yara_matches["hash"] = SubCrawlHelpers.get_sha256( 34 | http_resp.encode('utf-8')) 35 | for match in matches: 36 | yara_matches.setdefault("matches", []).append(str(match)) 37 | 38 | return yara_matches 39 | -------------------------------------------------------------------------------- /crawler/processing/clamav_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | from io import BytesIO 4 | 5 | import clamd 6 | 7 | from .default_processing import DefaultProcessing 8 | from utils import SubCrawlColors, SubCrawlHelpers 9 | 10 | # Installation ClamAV for this Module 11 | # sudo apt-get install clamav-daemon clamav-freshclam clamav-unofficial-sigs 12 | # sudo freshclam 13 | # sudo service clamav-daemon start 14 | 15 | 16 | class ClamAVProcessing(DefaultProcessing): 17 | 18 | cfg = None 19 | cd = None 20 | logger = None 21 | 22 | def __init__(self, config, logger): 23 | self.cfg = config 24 | self.logger = logger 25 | self.cd = clamd.ClamdUnixSocket() 26 | 27 | def process(self, url, content): 28 | scan_results = {} 29 | # self.cd = clamd.ClamdUnixSocket() 30 | # pong = self.cd.ping() # Will crash if not correctly installed. Handled in main crawler. 31 | buffer = BytesIO(content) 32 | scan_results = self.cd.instream(buffer) 33 | scan_results['url'] = url 34 | scan_results['hash'] = SubCrawlHelpers.get_sha256(content) 35 | 36 | try: 37 | if "OK" in scan_results['stream']: 38 | scan_results = {} 39 | else: 40 | clamav_status = str(scan_results['stream']).split(',') 41 | label = clamav_status[1].replace("'", '').replace(')', '').strip() 42 | scan_results['matches'] = label 43 | self.logger.info('[CLAMAV] Found - ' + label) 44 | except Exception as e: 45 | self.logger.error('[CLAMAV] ' + str(e)) 46 | scan_results = {} 47 | return scan_results 48 | -------------------------------------------------------------------------------- /crawler/app/templates/search_results.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
<!-- page heading (markup not preserved): -->
Search results
{% if error %}
<!-- error alert (markup not preserved) -->
{% endif %}
<!-- table header (markup not preserved): # | Url | Status Code | Hash | Scanned -->
{% for url in urls %}
<!-- table row (markup not preserved): {{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }} -->
{%- endfor %}
46 | 47 | {% endblock %} 48 | -------------------------------------------------------------------------------- /crawler/input/phishtank.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | import os 6 | import sys 7 | from datetime import datetime, timedelta 8 | from json import dumps, loads 9 | 10 | import requests 11 | from kafka import KafkaConsumer, KafkaProducer 12 | from timeloop import Timeloop 13 | 14 | producer = KafkaProducer(bootstrap_servers=['kafka:9092'], value_serializer=lambda x: dumps(x).encode('utf-8')) 15 | consumer = KafkaConsumer( 16 | 'urls', 17 | bootstrap_servers=['kafka:9092'], 18 | auto_offset_reset='earliest', 19 | enable_auto_commit=False, 20 | group_id='urls-dedup', 21 | consumer_timeout_ms=2000, 22 | auto_commit_interval_ms=1000, 23 | value_deserializer=lambda x: loads(x.decode('utf-8'))) 24 | 25 | PHISHTANK_API = "http://data.phishtank.com/data/online-valid.csv" 26 | tl = Timeloop() 27 | urls = set() 28 | 29 | 30 | # consume all urls from kafka and dedup 31 | def load_urls(): 32 | global urls 33 | try: 34 | for message in consumer: 35 | urls.add(message.value) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | @tl.job(interval=timedelta(seconds=300)) 41 | def phishtank(): 42 | global urls 43 | if len(urls) == 0: 44 | load_urls() 45 | 46 | try: 47 | r = requests.get(PHISHTANK_API, allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | csv_reader = csv.DictReader(csv_data) 50 | for row in csv_reader: 51 | url = row["url"] 52 | if url not in urls: 53 | producer.send('urls', value=url) 54 | urls.add(url) 55 | except Exception as e: 56 | print(e) 57 | pass # Could not download file. Try again in a few seconds. 
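# phishtank.py (above) and urlhaus.py (further down) follow the same input pattern: replay
# the existing Kafka 'urls' topic once to build an in-memory dedup set, poll the public feed
# every 300 seconds, and publish only previously unseen URLs back onto 'urls'.  service.py
# then periodically invokes subcrawl.py with the -k flag and the configured processing and
# storage modules.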
58 | 59 | 60 | tl.start(block=True) 61 | -------------------------------------------------------------------------------- /crawler/yara-rules/combined-rules.yar: -------------------------------------------------------------------------------- 1 | include "./open_webshell.yar" 2 | include "./html_webshell_login.yar" 3 | include "./js_webshell_tracking_script.yar" 4 | include "./php_webshell_backend.yar" 5 | include "./panels.yar" 6 | include "./huntington_phish.yar" 7 | include "./link_sharing_onedrive.yar" 8 | include "./onedrive_business_phish.yar" 9 | include "./base64_pe.yar" 10 | include "./chase_login_spox_phish.yar" 11 | include "./obfuscated_script.yar" 12 | include "./default_page_xampp_windows.yar" 13 | include "./microsoft_phish.yar" 14 | include "./sharepoint_online_phish.yar" 15 | include "./attachments_onedrive_phish.yar" 16 | include "./default_page_apache.yar" 17 | include "./standard_bank_phish.yar" 18 | include "./wallet_connect_phish.yar" 19 | include "./bapr_banking_phish.yar" 20 | include "./agenttesla_webpanel_login.yar" 21 | include "./mana5_panel_login.yar" 22 | include "./mars_panel_login.yar" 23 | include "./pony_panel_login.yar" 24 | include "./amadey_panel_login.yar" 25 | include "./bankamerica_phish.yar" 26 | include "./royalmail_phish.yar" 27 | include "./wellsfargo_phish.yar" 28 | include "./outlook_phish.yar" 29 | include "./sharepoint_dropbox_online_phish.yar" 30 | include "./office365_review_phish.yar" 31 | include "./office365_verify_pdf_phish.yar" 32 | include "./microsoft_login_phish.yar" 33 | include "./hex-encoded-pe-file.yar" 34 | include "./erbium_discord_panel_login.yar" 35 | include "./php_file_manager_login.yar" 36 | include "./collector_stealer_panel_login.yar" 37 | include "./titan_stealer_panel_login.yar" 38 | include "./modernloader_panel_login.yar" 39 | include "./aurora_stealer_panel_login.yar" 40 | include "./grandamisha_panel_login.yar" 41 | include "./acridrain_stealer_panel_login.yar" 42 | include "./unam_webpanel_login.yar" 43 | include "./h3k_tinyfilemanager_login.yar" 44 | include "./base64_shellcode_dos_header_pe.yar" 45 | -------------------------------------------------------------------------------- /crawler/misp-objects/opendir-url/definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "attributes": { 3 | "url": { 4 | "description": "Scanned URL from opendir", 5 | "misp-attribute": "url", 6 | "ui-priority": 1 7 | }, 8 | "sha256": { 9 | "description": "Secure Hash Algorithm 2 (256 bits)", 10 | "misp-attribute": "sha256", 11 | "ui-priority": 1 12 | }, 13 | "content": { 14 | "description": "Plaintext content of URL response", 15 | "disable_correlation": true, 16 | "misp-attribute": "attachment", 17 | "ui-priority": 1 18 | }, 19 | "title": { 20 | "description": "Title of URL response", 21 | "misp-attribute": "text", 22 | "ui-priority": 1 23 | }, 24 | "sdhash": { 25 | "description": "SDhash of URL content", 26 | "misp-attribute": "text", 27 | "ui-priority": 1 28 | }, 29 | "tlsh": { 30 | "description": "Trend Micro Locality Sensitive Hash of URL content", 31 | "misp-attribute": "text", 32 | "ui-priority": 1 33 | }, 34 | "yara": { 35 | "description": "Matching YARA rule", 36 | "misp-attribute": "text", 37 | "ui-priority": 1 38 | }, 39 | "status-code": { 40 | "description": "Status Code of URL response.", 41 | "disable_correlation": true, 42 | "misp-attribute": "text", 43 | "ui-priority": 0 44 | }, 45 | "header": { 46 | "description": "Headers of URL response.", 47 | 
"disable_correlation": true, 48 | "misp-attribute": "text", 49 | "multiple": true, 50 | "ui-priority": 0 51 | } 52 | }, 53 | "description": "A scanresult from an opendir url", 54 | "meta-category": "network", 55 | "name": "opendir-url", 56 | "requiredOneOf": [ 57 | "url", 58 | "sha256" 59 | ], 60 | "uuid": "7b4f16a7-7934-42e8-85ac-5e3415c0be5c", 61 | "version": 9 62 | } 63 | -------------------------------------------------------------------------------- /crawler/input/urlhaus.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | import os 6 | import sys 7 | from datetime import datetime, timedelta 8 | from json import dumps, loads 9 | 10 | import requests 11 | from kafka import KafkaConsumer, KafkaProducer 12 | from timeloop import Timeloop 13 | 14 | producer = KafkaProducer(bootstrap_servers=['kafka:9092'], value_serializer=lambda x: dumps(x).encode('utf-8')) 15 | consumer = KafkaConsumer( 16 | 'urls', 17 | bootstrap_servers=['kafka:9092'], 18 | auto_offset_reset='earliest', 19 | enable_auto_commit=False, 20 | group_id='urls-dedup', 21 | consumer_timeout_ms=2000, 22 | auto_commit_interval_ms=1000, 23 | value_deserializer=lambda x: loads(x.decode('utf-8'))) 24 | 25 | URLHAUS_API = "https://urlhaus.abuse.ch/downloads/csv_recent/" 26 | tl = Timeloop() 27 | urls = set() 28 | 29 | 30 | # consume all urls from kafka and dedup 31 | def load_urls(): 32 | global urls 33 | try: 34 | for message in consumer: 35 | urls.add(message.value) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | @tl.job(interval=timedelta(seconds=300)) 41 | def urlhaus(): 42 | global urls 43 | if len(urls) == 0: 44 | load_urls() 45 | 46 | try: 47 | r = requests.get(URLHAUS_API, allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | counter = 0 50 | while counter < 8: 51 | next(csv_data) 52 | counter += 1 53 | 54 | csv_reader = csv.DictReader(csv_data) 55 | for row in csv_reader: 56 | url = row["url"] 57 | if url not in urls: 58 | producer.send('urls', value=url) 59 | urls.add(url) 60 | except Exception as e: 61 | print(e) 62 | pass # Could not download file. Try again in a few seconds. 63 | 64 | 65 | tl.start(block=True) 66 | -------------------------------------------------------------------------------- /crawler/supervisor/supervisord.conf: -------------------------------------------------------------------------------- 1 | ; supervisor config file 2 | 3 | [unix_http_server] 4 | file=/dev/shm/supervisor.sock 5 | chmod=0700 ; sockef file mode (default 0700) 6 | 7 | [supervisord] 8 | ;nodaemon=true 9 | logfile=/var/log/supervisor/supervisord.log ; (main log file;default $CWD/supervisord.log) 10 | pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid) 11 | childlogdir=/var/log/supervisor ; ('AUTO' child log dir, default $TEMP) 12 | 13 | ; the below section must remain in the config file for RPC 14 | ; (supervisorctl/web interface) to work, additional interfaces may be 15 | ; added by defining them in separate rpcinterface: sections 16 | [rpcinterface:supervisor] 17 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 18 | 19 | [supervisorctl] 20 | serverurl=unix:///dev/shm/supervisor.sock 21 | 22 | ; The [include] section can just contain the "files" setting. This 23 | ; setting can list multiple files (separated by whitespace or 24 | ; newlines). It can also contain wildcards. 
The filenames are 25 | ; interpreted as relative to this file. Included files *cannot* 26 | ; include files themselves. 27 | 28 | [include] 29 | files = /etc/supervisor/conf.d/*.conf 30 | 31 | [program:urlhaus] 32 | command=/usr/local/bin/python3 urlhaus.py 33 | directory=/subcrawl/input 34 | autostart=true 35 | autorestart=true 36 | startretries=3 37 | stderr_logfile=/var/log/subcrawl/urlhaus.err.log 38 | stdout_logfile=/var/log/subcrawl/urlhaus.out.log 39 | user=root 40 | 41 | [program:phishtank] 42 | command=/usr/local/bin/python3 phishtank.py 43 | directory=/subcrawl/input 44 | autostart=true 45 | autorestart=true 46 | startretries=3 47 | stderr_logfile=/var/log/subcrawl/phishtank.err.log 48 | stdout_logfile=/var/log/subcrawl/phishtank.out.log 49 | user=root 50 | 51 | [program:subcrawl] 52 | priority=1 53 | command=/usr/local/bin/python3 service.py 54 | directory=/subcrawl 55 | autostart=true 56 | autorestart=true 57 | startretries=3 58 | stderr_logfile=/var/log/subcrawl/subcrawl.err.log 59 | stdout_logfile=/var/log/subcrawl/subcrawl.out.log 60 | user=root 61 | 62 | -------------------------------------------------------------------------------- /crawler/app/templates/dashboard.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Dashboard

7 |
8 |
9 |
10 |
11 |
12 | 13 |
14 |
15 |
16 |
17 |
18 |
19 | 20 |
21 |
22 |
23 | 24 |
25 |
26 |
27 | 28 | {% for tag in tags %} 29 | {{ tag | display_tagname}} ({{ tag.count }}) 30 | {%- endfor %} 31 |
32 |
33 |
34 | 35 |
36 |
37 |
38 |

39 | {% for hash in hashes %} 40 | {{ hash.sha256 }} ({{ hash.count }})
41 | {%- endfor %} 42 |

43 |
44 |
45 | 46 |
47 | 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /crawler/storage/console_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import json 3 | import pprint 4 | from re import subn 5 | 6 | from utils import SubCrawlColors 7 | from .default_storage import DefaultStorage 8 | 9 | 10 | class ConsoleStorage(DefaultStorage): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def load_scraped_domains(self): 20 | return [] 21 | 22 | def store_result(self, result_data): 23 | total_urls = 0 24 | 25 | print(SubCrawlColors.PURPLE + "\n" + "*" * 25 + 26 | " CONSOLE STORAGE - SUMMARY " + "*" * 26 + "\n" + 27 | SubCrawlColors.RESET) 28 | 29 | for domain in result_data: 30 | results = dict() 31 | 32 | total_urls += len(result_data[domain]) 33 | 34 | for url_content in result_data[domain]: 35 | for module in url_content["modules"]: 36 | if url_content["modules"][module]: 37 | if len(url_content["modules"][module]) > 0: 38 | results.setdefault(module, []).append(url_content["modules"][module]) 39 | 40 | if len(results) > 0: 41 | print(SubCrawlColors.CYAN + "<===== " + str(domain) + 42 | " =====>"+SubCrawlColors.RESET) 43 | 44 | for payload_module in results: 45 | if payload_module == "JARMProcessing": 46 | for result in results[payload_module]: 47 | print("\t[" + payload_module + "] " + 48 | result["fingerprint"] + " (" + 49 | "port: " + str(result["port"]) + ")" + SubCrawlColors.RESET) 50 | else: 51 | for result in results[payload_module]: 52 | print("\t[" + payload_module + "] " + 53 | str(result['matches']) + "( " + 54 | result['url'] + " )" + SubCrawlColors.RESET) 55 | print("\t\t[SHA256] " + result['hash']) 56 | print("") 57 | 58 | return True 59 | -------------------------------------------------------------------------------- /crawler/utils/logger.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
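# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# SubCrawlLogger (defined below) attaches a colorized console handler and a
# midnight-rotating file handler to a named logger. The engine in subcrawl.py
# builds its logger essentially like this; the file name and level shown here
# are examples only:
#
#   from utils import SubCrawlLogger, SubCrawlLoggerLevels
#   logger = SubCrawlLogger("subcrawl.log", "SubCrawl",
#                           SubCrawlLoggerLevels["INFO"].value).get_logger()
#   logger.info("engine started")
# -------------------------------------------------------------------------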
2 | # source: https://www.toptal.com/python/in-depth-python-logging 3 | 4 | import logging 5 | import sys 6 | import enum 7 | from logging.handlers import TimedRotatingFileHandler 8 | from utils.ansi_colors import SubCrawlColors 9 | 10 | 11 | class SubCrawlLogger(): 12 | 13 | formatter = None 14 | log_file = "" 15 | logger_name = "" 16 | log_level = logging.WARN 17 | 18 | def __init__(self, logfile, logger_name, log_level=logging.WARN): 19 | self.log_file = logfile 20 | self.logger_name = logger_name 21 | self.log_level = log_level 22 | self.formatter = CustomFormatter() 23 | 24 | def get_console_handler(self): 25 | console_handler = logging.StreamHandler(sys.stdout) 26 | console_handler.setFormatter(self.formatter) 27 | return console_handler 28 | 29 | def get_file_handler(self): 30 | file_handler = TimedRotatingFileHandler(self.log_file, when='midnight') 31 | file_handler.setFormatter(self.formatter) 32 | return file_handler 33 | 34 | def get_logger(self): 35 | logger = logging.getLogger(self.logger_name) 36 | logger.setLevel(self.log_level) 37 | logger.addHandler(self.get_file_handler()) 38 | logger.addHandler(self.get_console_handler()) 39 | logger.propagate = False 40 | return logger 41 | 42 | 43 | class SubCrawlLoggerLevels(enum.Enum): 44 | NOTSET = 0 45 | DEBUG = 10 46 | INFO = 20 47 | WARN = 30 48 | ERROR = 40 49 | CRITICAL = 50 50 | 51 | 52 | class CustomFormatter(logging.Formatter): 53 | format = "%(asctime)s — %(name)s — %(levelname)s — %(message)s" 54 | 55 | FORMATS = { 56 | logging.DEBUG: SubCrawlColors.GREEN + format + SubCrawlColors.RESET, 57 | logging.INFO: SubCrawlColors.BLUE + format + SubCrawlColors.RESET, 58 | logging.WARNING: SubCrawlColors.YELLOW + format + SubCrawlColors.RESET, 59 | logging.ERROR: SubCrawlColors.RED + format + SubCrawlColors.RESET, 60 | logging.CRITICAL: SubCrawlColors.RED + format + SubCrawlColors.RESET 61 | } 62 | 63 | def format(self, record): 64 | log_fmt = self.FORMATS.get(record.levelno) 65 | formatter = logging.Formatter(log_fmt) 66 | return formatter.format(record) 67 | -------------------------------------------------------------------------------- /crawler/processing/sdhash_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | ##### 3 | # Probably not the easiest module to install. Needs protobuf-2.5.0 and python3.6 and of course sdhash 4 | # 5 | # Protobuf installation: 6 | # > apt-get update 7 | # > apt-get -y install libssl-dev libevent-pthreads-2.1-6 libomp-dev g++ 8 | # > apt-get -y install autoconf automake libtool curl make g++ unzip 9 | # > wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.zip 10 | # > unzip protobuf-2.5.0.zip 11 | # > cd protobuf-2.5.0 12 | # > ./configure 13 | # > make 14 | # > sudo make install 15 | # 16 | # Python3.6 installation. 17 | # > apt-get install python3.6-dev 18 | # > sudo ldconfig 19 | # 20 | # SdHash installation: 21 | # Use binaries from folder minisdhash or compile itself. If you chose the later -> have fun. 
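# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# With the prerequisites above in place, the SWIG wrapper used below
# (minisdhash/sdbf_class.py) can also be exercised on its own: build an sdbf
# digest from a file path, serialize it with to_string(), and score two
# digests against each other with compare(). The file names are placeholders;
# note that process() further down skips responses smaller than 512 bytes.
#
#   from minisdhash import sdbf_class as sdhash
#   a = sdhash.sdbf("/tmp/sample_a.bin", 0)
#   b = sdhash.sdbf("/tmp/sample_b.bin", 0)
#   print(a.to_string())      # the sdbf digest as a string
#   print(a.compare(b, 0))    # similarity score, 0 = no block sampling
# -------------------------------------------------------------------------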
22 | # 23 | 24 | import os 25 | import random 26 | import string 27 | 28 | from .default_processing import DefaultProcessing 29 | from .minisdhash import sdbf_class as sdhash 30 | from utils import SubCrawlHelpers, SubCrawlColors 31 | 32 | 33 | class SDhashProcessing(DefaultProcessing): 34 | 35 | cfg = None 36 | logger = None 37 | 38 | def __init__(self, config, logger): 39 | self.cfg = config 40 | self.logger = logger 41 | 42 | def save_content(self, data): 43 | try: 44 | letters = string.ascii_lowercase 45 | filename = ''.join(random.choice(letters) for i in range(10)) 46 | with open(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + filename, "wb") as file: 47 | file.write(data) 48 | return filename 49 | except Exception as e: 50 | self.logger.error("[SDHASH] Error: " + str(e)) 51 | return "" 52 | 53 | def process(self, url, content): 54 | sd_result = {} 55 | if len(content) < 512: 56 | return {} 57 | 58 | try: 59 | file_name = self.save_content(content) 60 | if file_name: 61 | sd = sdhash.sdbf(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + file_name, 0) 62 | sd_result["sdhash"] = sd.to_string() 63 | sd_result["url"] = url 64 | os.remove(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + file_name,) 65 | except Exception as e: 66 | self.logger.error("[SDHASH] Error: " + str(e)) 67 | return sd_result 68 | -------------------------------------------------------------------------------- /crawler/config.yml: -------------------------------------------------------------------------------- 1 | crawler: 2 | batch_size: 250 3 | log_level: INFO 4 | scan_simple_domains: False 5 | host_max_crawl_depth: 2 6 | follow_redirects: False 7 | download_dir: samples/ 8 | tmp_dir: tmp/ 9 | save_payload_content: False 10 | yara_rules: yara-rules/combined-rules.yar 11 | logos_path: utils/logos/ 12 | tag_line: ~~ Harvesting the Open Web ~~ 13 | http_request_timeout: 10 14 | delay_execution_time: 0 15 | http_download_timeout: 60 16 | http_max_size: 26214400 17 | processing_modules: 18 | - ClamAVProcessing 19 | - JARMProcessing 20 | - TLSHProcessing 21 | - YARAProcessing 22 | storage_modules: 23 | - SqliteStorage 24 | opendir_title: 25 | - index of 26 | - directory listing for 27 | ext_exclude: 28 | - .js 29 | - .css 30 | - .eot 31 | - .woff 32 | - .woff2 33 | - .png 34 | - .jpg 35 | - .jpeg 36 | - .gif 37 | - .json 38 | - .scss 39 | - .md 40 | - tinymce.php 41 | - .mp4 42 | - .mp3 43 | - .mo 44 | - .svg 45 | - .po 46 | - .crt 47 | - .phar 48 | - .map 49 | - .xml 50 | - .pdf 51 | - .ico 52 | - .ttf 53 | - .go 54 | - .psd 55 | - .csv 56 | - .xap 57 | - .ts 58 | - .stub 59 | - .tpl 60 | - .h 61 | archive_magics: 62 | - zip archive data 63 | pe_magics: 64 | - pe32 65 | - ms-dos 66 | php_magics: 67 | - php script 68 | office_magics: 69 | - "application: microsoft" 70 | - microsoft ooxml 71 | - microsoft excel 72 | - microsoft word 73 | elf_magics: 74 | - "ELF 64" 75 | - "ELF 32" 76 | java_magics: 77 | - "Java archive data" 78 | headers: 79 | User-Agent: Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36 80 | DNT: "1" 81 | Pragma: no-cache 82 | Cache-Control: no-cache 83 | urlhaus_api: https://urlhaus.abuse.ch/downloads/csv_recent/ 84 | misp: 85 | misp_url: https://localhost 86 | misp_api_key: API_KEY_GOES_HERE 87 | domain_event: 0 88 | elasticsearch: 89 | host: localhost 90 | port: 9200 91 | index: subcrawl 92 | archive_response_content: False 93 | archive_log_location: "log/" 94 | external_intel: 95 | vt_api: 96 | 
urlhaus_api: 97 | bazaar_api: 98 | submit_urlhaus: False 99 | submit_bazaar: False 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #VS Code 2 | .vscode 3 | 4 | # SubCrawl specific 5 | subcrawl.log.* 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /crawler/app/templates/url_details.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Url Information

7 |
8 |
9 | 10 | 11 | 12 |
13 | 14 | 15 |
16 | 17 |
18 | 19 | 20 |
21 | 22 |
23 | 24 | 25 |
26 | 27 |
28 | 29 | 30 |
31 | 32 |
33 | 34 | 35 |
36 | 37 | {% for ext in extensions %} 38 |
39 | 40 | 41 |
42 | {%- endfor %} 43 | 44 | 45 | 46 | 47 | 48 | 49 | 63 | 64 |
65 | 66 | 81 | 82 | {% endblock %} 83 | -------------------------------------------------------------------------------- /crawler/app/templates/domain_details.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Domain Information

7 |
8 |
9 | 10 |
11 | 12 |
13 | 14 | 15 |
16 | 17 |
18 | 19 | 20 |
21 | 22 |
23 | 24 | {% for tag in tags %} 25 | {{ tag.tag }} 26 | {%- endfor %} 27 |

28 | 29 | 30 | 31 | 32 | 33 |
34 | 35 | 49 | 50 |

Urls

51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | {% for url in urls %} 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | {%- endfor %} 73 | 74 | 75 |
# | Url | Status Code | Hash | Scanned
{{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }}
76 |
77 | 78 | 93 | 94 | {% endblock %} 95 | -------------------------------------------------------------------------------- /crawler/processing/payload_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | import os 4 | import magic 5 | from utils import SubCrawlColors, SubCrawlHelpers 6 | 7 | from .default_processing import DefaultProcessing 8 | 9 | 10 | class PayloadProcessing(DefaultProcessing): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | if not os.path.exists(SubCrawlHelpers.get_config( 20 | self.cfg, "crawler", "download_dir")): 21 | os.makedirs(SubCrawlHelpers.get_config( 22 | self.cfg, "crawler", "download_dir")) 23 | 24 | def process(self, url, content): 25 | payload = {} 26 | content_match = True 27 | file_ext = "" 28 | 29 | shasum = SubCrawlHelpers.get_sha256(content) 30 | content_magic = magic.from_buffer(content).lower() 31 | matches = content_magic 32 | 33 | if any(partial in content_magic for partial in 34 | SubCrawlHelpers.get_config(self.cfg, "crawler", "pe_magics")): 35 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] PE file found " + 36 | url + " (" + shasum + ")" 37 | + SubCrawlColors.RESET) 38 | 39 | file_ext = ".bin" 40 | if "(dll)" in content_magic: 41 | file_ext = ".dll" + file_ext 42 | elif "x86-64" in content_magic: 43 | file_ext = ".64.exe" + file_ext 44 | else: 45 | file_ext = ".exe" + file_ext 46 | 47 | elif any(partial in content_magic for partial in 48 | SubCrawlHelpers.get_config(self.cfg, "crawler", 49 | "archive_magics")): 50 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] ZIP found at " + 51 | url + " (" + shasum + ")" + 52 | SubCrawlColors.RESET) 53 | file_ext = ".zip.bin" 54 | elif any(partial in content_magic for partial in 55 | SubCrawlHelpers.get_config(self.cfg, "crawler", 56 | "php_magics")): 57 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] PHP found at " + 58 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 59 | SubCrawlColors.RESET) 60 | file_ext = ".php.bin" 61 | elif any(partial in content_magic for partial in 62 | SubCrawlHelpers.get_config(self.cfg, "crawler", 63 | "office_magics")): 64 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] Doc found at " + 65 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 66 | SubCrawlColors.RESET) 67 | file_ext = ".office.bin" 68 | elif any(partial in content_magic for partial in 69 | SubCrawlHelpers.get_config(self.cfg, "crawler", 70 | "elf_magics")): 71 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] ELF found at " + 72 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 73 | SubCrawlColors.RESET) 74 | file_ext = ".elf.bin" 75 | elif any(partial in content_magic for partial in 76 | SubCrawlHelpers.get_config(self.cfg, "crawler", 77 | "java_magics")): 78 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] Java found at " + 79 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 80 | SubCrawlColors.RESET) 81 | else: 82 | content_match = False 83 | 84 | if content_match: 85 | payload = {"hash": shasum, "url": url, "matches": matches} 86 | 87 | if content_match and \ 88 | SubCrawlHelpers.get_config(self.cfg, "crawler", 89 | "save_payload_content"): 90 | try: 91 | SubCrawlHelpers.save_content( 92 | self.cfg['crawler']['download_dir'] + 93 | shasum + file_ext, content) 94 | self.logger.info(SubCrawlColors.CYAN + 95 | "[PAYLOAD] Saved file " + 96 | 
SubCrawlHelpers.defang_url(url) + 97 | SubCrawlColors.RESET) 98 | except Exception as e: 99 | self.logger.error("[PAYLOAD] " + str(e)) 100 | pass 101 | 102 | return payload 103 | -------------------------------------------------------------------------------- /crawler/storage/sqlite_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | from io import StringIO 6 | from urllib.parse import urlparse 7 | 8 | import requests 9 | from utils import Domain, DomainTag, Extension, Tag, Url, db, fn 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class SqliteStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | 19 | def __init__(self, config, logger): 20 | self.cfg = config 21 | self.logger = logger 22 | 23 | def load_scraped_domains(self): 24 | domains = Domain.select() 25 | return domains 26 | 27 | def store_result(self, result_data): 28 | # Load URLHaus tags 29 | url_info = dict() 30 | r = requests.get(SubCrawlHelpers.get_config(self.cfg, "crawler", "urlhaus_api"), allow_redirects=True) 31 | csv_data = io.StringIO(r.content.decode("utf-8")) 32 | counter = 0 33 | while counter < 8: 34 | next(csv_data) 35 | counter += 1 36 | 37 | csv_reader = csv.DictReader(csv_data) 38 | for row in csv_reader: 39 | domain = urlparse(row["url"]).netloc 40 | if domain not in url_info: 41 | url_info[domain] = set() 42 | url_info[domain].update(row["tags"].lower().split(",")) 43 | 44 | for domain in result_data: 45 | tags = [] 46 | if domain in url_info: 47 | tags = url_info[domain] 48 | 49 | if len(result_data[domain]) > 0: 50 | domains = Domain.select().where(Domain.name == domain) 51 | 52 | if len(domains) > 0: 53 | ref_domain = domains[0] 54 | else: 55 | ref_domain = Domain(name=domain) 56 | ref_domain.save() 57 | 58 | for tag in tags: 59 | db_tag = Tag.select().where(Tag.tag == tag) 60 | if len(db_tag) == 0: 61 | db_tag = Tag(tag=tag) 62 | db_tag.save() 63 | dt = DomainTag(domain=ref_domain, tag=db_tag) 64 | dt.save() 65 | 66 | for url_content in result_data[domain]: 67 | 68 | url = Url(domain=ref_domain, url=str(url_content["url"]), status_code=url_content["data"]["resp"]["status_code"], title=str(url_content["data"]["title"]), sha256=str(url_content["sha256"])) 69 | url.save() 70 | 71 | if "index of" in str(url_content["data"]["title"]).lower(): 72 | db_tag = Tag.select().where(Tag.tag == "opendir") 73 | if len(db_tag) == 0: 74 | db_tag = Tag(tag="opendir") 75 | db_tag.save() 76 | 77 | dt = DomainTag.select().where(DomainTag.domain == ref_domain, DomainTag.tag == db_tag) 78 | if len(dt) == 0: 79 | dt = DomainTag(domain=ref_domain, tag=db_tag) 80 | dt.save() 81 | 82 | for header in url_content["data"]["resp"]["headers"]: 83 | ext = Extension(key=str(header).lower(), value=url_content["data"]["resp"]["headers"][header], url=url) 84 | ext.save() 85 | 86 | try: 87 | for module in url_content["modules"]: 88 | if len(url_content["modules"][module]) > 0: 89 | if module == "JARMProcessing": 90 | ext = Extension(key="jarm", value=str(url_content["modules"][module]["fingerprint"]), url=url) 91 | ext.save() 92 | 93 | elif module == "SDhashProcessing": 94 | ext = Extension(key="sdhash", value=str(url_content["modules"][module]["sdhash"]), url=url) 95 | ext.save() 96 | 97 | elif module == "TLSHProcessing": 98 | ext = Extension(key="tlsh", value=str(url_content["modules"][module]["tlsh"]), url=url) 99 
| ext.save() 100 | 101 | elif module == "YARAProcessing": 102 | for rule in url_content["modules"][module]["rules"]: 103 | ext = Extension(key="yara", value=str(rule), url=url) 104 | ext.save() 105 | 106 | except Exception as e: 107 | self.logger.error('[SQLite] ' + str(e)) 108 | 109 | self.logger.info("[SQLite] Scan results stored: " + domain) 110 | -------------------------------------------------------------------------------- /crawler/storage/elastic_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | import zipfile 4 | import io 5 | from datetime import datetime 6 | from urllib.parse import urlparse 7 | from re import subn 8 | from elasticsearch import Elasticsearch, helpers 9 | 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class ElasticStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | es = None 19 | index = None 20 | archive_location = None 21 | archive_content = False 22 | max_fields = 0 23 | 24 | def __init__(self, config, logger): 25 | self.cfg = config 26 | self.logger = logger 27 | self.archive_location = SubCrawlHelpers.get_config(self.cfg,'elasticsearch', 'archive_log_location') 28 | self.archive_content = SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'archive_response_content') 29 | self.index = SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'index') 30 | 31 | try: 32 | self.es = Elasticsearch([{'host': SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'host'), 33 | 'port': SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'port'), 34 | 'index': self.index}]) 35 | self.es.ping() 36 | 37 | if not self.es.indices.exists(self.index): 38 | self.logger.debug('[ELASTIC] Index did not exist, creating: ' + self.index) 39 | self.es.indices.create(index=self.index) 40 | 41 | if self.archive_content: 42 | if not os.path.isdir(self.archive_location): 43 | os.mkdir(self.archive_location) 44 | self.logger.debug('[ELASTIC] Response content being saved, log created at: ' + self.archive_location) 45 | 46 | except Exception as e: 47 | self.logger.error('[ELASTIC] Problem connecting to Elastic: ' + str(e)) 48 | raise e 49 | 50 | def load_scraped_domains(self): 51 | return [] 52 | 53 | def normalize_field_name(self, field_name): 54 | return field_name.replace(' ','_').replace('-','_').lower() 55 | 56 | def store_content(self, content_buffer, file_name): 57 | 58 | try: 59 | tmp_buffer = io.BytesIO() 60 | 61 | with zipfile.ZipFile(tmp_buffer, mode='w',compression=zipfile.ZIP_DEFLATED) as zip_file: 62 | zip_file.writestr('http.response.payload', str.encode(content_buffer,'utf-8')) 63 | 64 | with open(self.archive_location + file_name,'wb') as tmp_zip: 65 | tmp_zip.write(tmp_buffer.getvalue()) 66 | 67 | except Exception as ex: 68 | self.logger.error('[ELASTIC] Problem adding data: ' + str(ex)) 69 | 70 | 71 | def store_result(self, result_data): 72 | data = {} 73 | doc_list = [] 74 | 75 | try: 76 | for domain in result_data: 77 | for url_content in result_data[domain]: 78 | field_cnt = 0 79 | parsed_url = urlparse(url_content['url']) 80 | 81 | data = { 82 | 'http.request.url': url_content['url'], 83 | 'http.request.scheme': parsed_url.scheme, 84 | 'http.request.netloc': parsed_url.netloc, 85 | 'http.request.path': parsed_url.path, 86 | 'http.request.params': parsed_url.params, 87 | 'http.request.query': parsed_url.query, 88 | 'http.request.fragment': parsed_url.fragment, 89 | 
'crawled_on': url_content['scraped_on'], 90 | 'http.response.body.content.sha256': url_content['sha256'], 91 | 'http.response.body.content_magic': url_content['content_type'], 92 | 'http.signature': url_content['signature'], 93 | 'http.response.title': url_content['data']['title'], 94 | 'http.response.status_code': url_content['data']['resp']['status_code'], 95 | } 96 | 97 | for header in url_content['data']['resp']['headers']: 98 | data['http.response.header.' + self.normalize_field_name(header)] = url_content['data']['resp']['headers'][header] 99 | 100 | for module in url_content['modules']: 101 | if len(url_content['modules'][module]) > 0: 102 | if module == 'YARAProcessing': 103 | data['yara_results'] = url_content['modules'][module]['matches'] 104 | 105 | if self.archive_content: 106 | tmp_dt = datetime.strptime(url_content['scraped_on'][:-7], '%Y-%m-%dT%H:%M:%S') 107 | self.store_content(url_content['data']['text'],str(int(tmp_dt.timestamp())) + '_' + url_content['sha256']) 108 | 109 | doc_list.append(data) 110 | 111 | helpers.bulk( 112 | self.es, 113 | doc_list, 114 | index=self.index 115 | ) 116 | 117 | self.logger.info('[ELASTIC] added ' + str(len(doc_list)) + ' items') 118 | 119 | except Exception as e: 120 | self.logger.error('[ELASTIC] Problem adding data: ' + str(e)) 121 | -------------------------------------------------------------------------------- /crawler/app/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 15 | 16 | 17 | 18 | SubCrawl 19 | 24 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 |
107 | 118 |
119 |
120 | 137 |
138 |
139 | 149 |
153 |
154 | {% block content%} 155 | 156 | {% endblock %} 157 |
158 |
159 |
160 | 183 | 184 |
185 |
Loading
186 |
187 | 188 | 189 | -------------------------------------------------------------------------------- /crawler/processing/minisdhash/sdbf_class.py: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by SWIG (http://www.swig.org). 2 | # Version 3.0.12 3 | # 4 | # Do not make changes to this file unless you know what you are doing--modify 5 | # the SWIG interface file instead. 6 | 7 | from sys import version_info as _swig_python_version_info 8 | if _swig_python_version_info >= (2, 7, 0): 9 | def swig_import_helper(): 10 | import importlib 11 | pkg = __name__.rpartition('.')[0] 12 | mname = '.'.join((pkg, '_sdbf_class')).lstrip('.') 13 | try: 14 | return importlib.import_module(mname) 15 | except ImportError: 16 | return importlib.import_module('_sdbf_class') 17 | _sdbf_class = swig_import_helper() 18 | del swig_import_helper 19 | elif _swig_python_version_info >= (2, 6, 0): 20 | def swig_import_helper(): 21 | from os.path import dirname 22 | import imp 23 | fp = None 24 | try: 25 | fp, pathname, description = imp.find_module('_sdbf_class', [dirname(__file__)]) 26 | except ImportError: 27 | import _sdbf_class 28 | return _sdbf_class 29 | try: 30 | _mod = imp.load_module('_sdbf_class', fp, pathname, description) 31 | finally: 32 | if fp is not None: 33 | fp.close() 34 | return _mod 35 | _sdbf_class = swig_import_helper() 36 | del swig_import_helper 37 | else: 38 | import _sdbf_class 39 | del _swig_python_version_info 40 | 41 | try: 42 | _swig_property = property 43 | except NameError: 44 | pass # Python < 2.2 doesn't have 'property'. 45 | 46 | try: 47 | import builtins as __builtin__ 48 | except ImportError: 49 | import __builtin__ 50 | 51 | def _swig_setattr_nondynamic(self, class_type, name, value, static=1): 52 | if (name == "thisown"): 53 | return self.this.own(value) 54 | if (name == "this"): 55 | if type(value).__name__ == 'SwigPyObject': 56 | self.__dict__[name] = value 57 | return 58 | method = class_type.__swig_setmethods__.get(name, None) 59 | if method: 60 | return method(self, value) 61 | if (not static): 62 | if _newclass: 63 | object.__setattr__(self, name, value) 64 | else: 65 | self.__dict__[name] = value 66 | else: 67 | raise AttributeError("You cannot add attributes to %s" % self) 68 | 69 | 70 | def _swig_setattr(self, class_type, name, value): 71 | return _swig_setattr_nondynamic(self, class_type, name, value, 0) 72 | 73 | 74 | def _swig_getattr(self, class_type, name): 75 | if (name == "thisown"): 76 | return self.this.own() 77 | method = class_type.__swig_getmethods__.get(name, None) 78 | if method: 79 | return method(self) 80 | raise AttributeError("'%s' object has no attribute '%s'" % (class_type.__name__, name)) 81 | 82 | 83 | def _swig_repr(self): 84 | try: 85 | strthis = "proxy of " + self.this.__repr__() 86 | except __builtin__.Exception: 87 | strthis = "" 88 | return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) 89 | 90 | try: 91 | _object = object 92 | _newclass = 1 93 | except __builtin__.Exception: 94 | class _object: 95 | pass 96 | _newclass = 0 97 | 98 | KB = _sdbf_class.KB 99 | 100 | def new_intp(): 101 | return _sdbf_class.new_intp() 102 | new_intp = _sdbf_class.new_intp 103 | 104 | def copy_intp(value): 105 | return _sdbf_class.copy_intp(value) 106 | copy_intp = _sdbf_class.copy_intp 107 | 108 | def delete_intp(obj): 109 | return _sdbf_class.delete_intp(obj) 110 | delete_intp = _sdbf_class.delete_intp 111 | 112 | def intp_assign(obj, value): 113 | return 
_sdbf_class.intp_assign(obj, value) 114 | intp_assign = _sdbf_class.intp_assign 115 | 116 | def intp_value(obj): 117 | return _sdbf_class.intp_value(obj) 118 | intp_value = _sdbf_class.intp_value 119 | class sdbf_conf(_object): 120 | __swig_setmethods__ = {} 121 | __setattr__ = lambda self, name, value: _swig_setattr(self, sdbf_conf, name, value) 122 | __swig_getmethods__ = {} 123 | __getattr__ = lambda self, name: _swig_getattr(self, sdbf_conf, name) 124 | __repr__ = _swig_repr 125 | 126 | def __init__(self, thread_cnt, warnings, max_elem_ct, max_elem_ct_dd): 127 | this = _sdbf_class.new_sdbf_conf(thread_cnt, warnings, max_elem_ct, max_elem_ct_dd) 128 | try: 129 | self.this.append(this) 130 | except __builtin__.Exception: 131 | self.this = this 132 | __swig_destroy__ = _sdbf_class.delete_sdbf_conf 133 | __del__ = lambda self: None 134 | sdbf_conf_swigregister = _sdbf_class.sdbf_conf_swigregister 135 | sdbf_conf_swigregister(sdbf_conf) 136 | 137 | class sdbf(_object): 138 | __swig_setmethods__ = {} 139 | __setattr__ = lambda self, name, value: _swig_setattr(self, sdbf, name, value) 140 | __swig_getmethods__ = {} 141 | __getattr__ = lambda self, name: _swig_getattr(self, sdbf, name) 142 | __repr__ = _swig_repr 143 | 144 | def __init__(self, *args): 145 | this = _sdbf_class.new_sdbf(*args) 146 | try: 147 | self.this.append(this) 148 | except __builtin__.Exception: 149 | self.this = this 150 | __swig_destroy__ = _sdbf_class.delete_sdbf 151 | __del__ = lambda self: None 152 | 153 | def name(self): 154 | return _sdbf_class.sdbf_name(self) 155 | 156 | def size(self): 157 | return _sdbf_class.sdbf_size(self) 158 | 159 | def input_size(self): 160 | return _sdbf_class.sdbf_input_size(self) 161 | 162 | def compare(self, other, sample): 163 | return _sdbf_class.sdbf_compare(self, other, sample) 164 | 165 | def to_string(self): 166 | return _sdbf_class.sdbf_to_string(self) 167 | 168 | def get_index_results(self): 169 | return _sdbf_class.sdbf_get_index_results(self) 170 | 171 | def clone_filter(self, position): 172 | return _sdbf_class.sdbf_clone_filter(self, position) 173 | 174 | def filter_count(self): 175 | return _sdbf_class.sdbf_filter_count(self) 176 | __swig_setmethods__["config"] = _sdbf_class.sdbf_config_set 177 | __swig_getmethods__["config"] = _sdbf_class.sdbf_config_get 178 | if _newclass: 179 | config = _swig_property(_sdbf_class.sdbf_config_get, _sdbf_class.sdbf_config_set) 180 | if _newclass: 181 | get_elem_count = staticmethod(_sdbf_class.sdbf_get_elem_count) 182 | else: 183 | get_elem_count = _sdbf_class.sdbf_get_elem_count 184 | sdbf_swigregister = _sdbf_class.sdbf_swigregister 185 | sdbf_swigregister(sdbf) 186 | cvar = _sdbf_class.cvar 187 | 188 | def sdbf_get_elem_count(mine, index): 189 | return _sdbf_class.sdbf_get_elem_count(mine, index) 190 | sdbf_get_elem_count = _sdbf_class.sdbf_get_elem_count 191 | 192 | # This file is compatible with both classic and new-style classes. 193 | 194 | 195 | -------------------------------------------------------------------------------- /crawler/storage/misp_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
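# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# The storage module below maps every crawled URL onto the custom
# 'opendir-url' MISP object template shipped in misp-objects/ (see
# definition.json earlier in this repository). The core of that mapping looks
# roughly like this; the attribute values are placeholders:
#
#   obj = MISPObject(name='opendir-url', strict=True,
#                    misp_objects_path_custom='./misp-objects')
#   obj.add_attribute('url', value='http://example.com/open/dir/')
#   obj.add_attribute('sha256', value='<sha256 of the response body>')
#   obj.add_attribute('title', value='Index of /open/dir')
#   misp.add_object(event, obj)   # attach the object to the per-domain event
# -------------------------------------------------------------------------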
2 | import csv 3 | import io 4 | import logging 5 | from io import StringIO 6 | from urllib.parse import urlparse 7 | 8 | import requests 9 | from pymisp import ExpandedPyMISP, MISPAttribute, MISPEvent, MISPObject 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class MISPStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | 19 | def __init__(self, config, logger): 20 | logging.getLogger("pymisp").setLevel(logging.CRITICAL) 21 | self.cfg = config 22 | self.logger = logger 23 | 24 | def load_scraped_domains(self): 25 | misp = ExpandedPyMISP(SubCrawlHelpers.get_config(self.cfg, "misp", "misp_url"), SubCrawlHelpers.get_config(self.cfg, "misp", "misp_api_key"), False) 26 | 27 | domains = set() 28 | domain_event = None 29 | if SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event") != 0: 30 | domain_event = misp.get_event(SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event"), pythonify=True) 31 | for att in domain_event.attributes: 32 | if att.type == "domain": 33 | domains.add(att.value) 34 | else: 35 | self.logger.warning('[MISP] No domain MISP event configured') 36 | 37 | return domains 38 | 39 | def store_result(self, result_data): 40 | misp = ExpandedPyMISP(SubCrawlHelpers.get_config(self.cfg, "misp", "misp_url"), SubCrawlHelpers.get_config(self.cfg, "misp", "misp_api_key"), False) 41 | 42 | domain_event = None 43 | if SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event") != 0: 44 | domain_event = misp.get_event(SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event"), pythonify=True) 45 | 46 | url_info = dict() 47 | r = requests.get(SubCrawlHelpers.get_config(self.cfg, "crawler", "urlhaus_api"), allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | counter = 0 50 | while counter < 8: 51 | next(csv_data) 52 | counter += 1 53 | 54 | csv_reader = csv.DictReader(csv_data) 55 | for row in csv_reader: 56 | domain = urlparse(row["url"]).netloc 57 | if domain not in url_info: 58 | url_info[domain] = set() 59 | url_info[domain].update(row["tags"].lower().split(",")) 60 | 61 | for domain in result_data: 62 | tags = [] 63 | if domain in url_info: 64 | tags = url_info[domain] 65 | 66 | if len(result_data[domain]) > 0: 67 | 68 | jarm_added = False 69 | event_data = misp.search_index(eventinfo=domain, pythonify=True) 70 | if len(event_data) > 0: 71 | event = event_data[0] 72 | else: 73 | event = MISPEvent() 74 | event.distribution = 1 75 | event.threat_level_id = 4 76 | event.analysis = 1 77 | event.info = domain 78 | 79 | for tag in tags: 80 | event.add_tag(tag) 81 | event.add_tag("tlp:green") 82 | 83 | event = misp.add_event(event, pythonify=True) 84 | 85 | server_created = False 86 | scripttech_created = False 87 | 88 | attribute = MISPAttribute() 89 | attribute.type = "domain" 90 | attribute.value = domain 91 | misp.add_attribute(event, attribute) 92 | if domain_event: 93 | dom_attribute = MISPAttribute() # Not beautiful but new attribute must be generated due to the UUID 94 | dom_attribute.type = "domain" 95 | dom_attribute.value = domain 96 | misp.add_attribute(domain_event, dom_attribute) 97 | 98 | for url_content in result_data[domain]: 99 | 100 | obj = MISPObject(name='opendir-url', strict=True, misp_objects_path_custom='./misp-objects') 101 | obj.add_attribute('url', value=str(url_content["url"])) 102 | obj.add_attribute('sha256', value=str(url_content["sha256"])) 103 | 104 | # obj.add_attribute("content", value=content_data[:20], data=content_data, 
expand='store_true') 105 | 106 | if "index of" in str(url_content["data"]["title"]).lower(): 107 | event.add_tag("opendir") 108 | misp.update_event(event) 109 | 110 | obj.add_attribute('title', value=str(url_content["data"]["title"])) 111 | obj.add_attribute('status-code', value=url_content["data"]["resp"]["status_code"]) 112 | 113 | for header in url_content["data"]["resp"]["headers"]: 114 | obj.add_attribute('header', comment=header, value=url_content["data"]["resp"]["headers"][header]) 115 | 116 | if not server_created: 117 | if "Server" in url_content["data"]["resp"]["headers"]: 118 | attribute = MISPAttribute() 119 | attribute.type = "other" 120 | attribute.comment = "Webserver" 121 | attribute.value = url_content["data"]["resp"]["headers"]["Server"] 122 | misp.add_attribute(event, attribute) 123 | server_created = True 124 | 125 | if not scripttech_created: 126 | if "X-Powered-By" in url_content["data"]["resp"]["headers"]: 127 | attribute = MISPAttribute() 128 | attribute.type = "other" 129 | attribute.comment = "Scripting Technology" 130 | attribute.value = url_content["data"]["resp"]["headers"]["X-Powered-By"] 131 | misp.add_attribute(event, attribute) 132 | scripttech_created = True 133 | 134 | try: 135 | for module in url_content["modules"]: 136 | if len(url_content["modules"][module]) > 0: 137 | if module == "JARMProcessing" and not jarm_added: 138 | jarm_obj = MISPObject(name='jarm', strict=True) 139 | jarm_obj.add_attribute("jarm", value=str(url_content["modules"][module]["fingerprint"])) 140 | misp.add_object(event, jarm_obj) 141 | jarm_added = True 142 | elif module == "SDhashProcessing": 143 | obj.add_attribute('sdhash', value=str(url_content["modules"][module]["sdhash"])) 144 | elif module == "TLSHProcessing": 145 | obj.add_attribute('tlsh', value=str(url_content["modules"][module]["tlsh"])) 146 | elif module == "YARAProcessing": 147 | for rule in url_content["modules"][module]["rules"]: 148 | obj.add_attribute('yara', value=str(rule)) 149 | 150 | except Exception as e: 151 | self.logger.error('[MISP] ' + str(e)) 152 | 153 | misp.add_object(event, obj) 154 | 155 | misp.publish(event) 156 | self.logger.info("[MISP] Event created: " + domain) 157 | 158 | if domain_event: 159 | misp.publish(domain_event) 160 | -------------------------------------------------------------------------------- /crawler/app/main.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | import falcon 4 | from jinja2 import Environment, FileSystemLoader 5 | from utils import db, Domain, Url, Extension, Tag, DomainTag, fn 6 | 7 | db.connect() 8 | if len(db.get_tables()) == 0: 9 | db.create_tables([Domain, Url, Extension, Tag, DomainTag]) 10 | 11 | colors = ["orange", "yellow", "olive", "green", "teal", "blue", "violet", "purple", "pink", "brown", "grey"] 12 | 13 | 14 | def display_tagname(value): 15 | try: 16 | return value.tag.tag 17 | except Exception as e: 18 | return "None" 19 | 20 | 21 | def load_template(name): 22 | file_loader = FileSystemLoader('app/templates') 23 | env = Environment(loader=file_loader) 24 | env.filters['display_tagname'] = display_tagname 25 | return env.get_template(name) 26 | 27 | 28 | class SearchResource(object): 29 | def on_get(self, req, resp): 30 | template = load_template('search_results.html') 31 | error = "" 32 | urls = list() 33 | 34 | if ":" not in req.params['search']: 35 | error = "Error: No valid search pattern!

Examples:
  • url:hp.com
  • sha256:da3b8d283051c5615f359e376c0d908e6d0539bceed19e6a5667a27d01bf9fef
  • yara:protected_webshell
  • server:nginx
" 36 | else: 37 | search_arr = req.params['search'].split(":") 38 | key = search_arr[0] 39 | value = "".join(search_arr[1:]) 40 | 41 | if key == "sha256": 42 | urls = Url.select().where(Url.sha256 == value) 43 | elif key == "title": 44 | urls = Url.select().where(Url.title.contains(value)) 45 | elif key == "url": 46 | urls = Url.select().where(Url.url.contains(value)) 47 | elif key == "tag": 48 | urls = (Url.select().join(Domain).join(DomainTag).join(Tag).where(Tag.tag == value)) 49 | else: 50 | urls = (Url.select().join(Extension).where((Extension.key == key) & (Extension.value.contains(value)))) 51 | 52 | resp.status = falcon.HTTP_200 53 | resp.content_type = 'text/html' 54 | resp.body = template.render(error=error, urls=urls) 55 | 56 | 57 | class DashboardResource(object): 58 | # TODO: Create useful charts as dashboard and show stats. 59 | 60 | def on_get(self, req, resp): 61 | template = load_template('dashboard.html') 62 | 63 | domains = Domain.select().count() 64 | urls = Url.select().count() 65 | 66 | tags = DomainTag.select(DomainTag.tag, fn.COUNT(DomainTag.tag).alias('count')).group_by(DomainTag.tag).order_by(fn.COUNT(DomainTag.tag).desc()).limit(5) 67 | hashes = Url.select(Url.sha256, fn.COUNT(Url.sha256).alias('count')).group_by(Url.sha256).order_by(fn.COUNT(Url.sha256).desc()).limit(5) 68 | 69 | i = 0 70 | for tag in tags: 71 | tag.color = colors[i % len(colors)] 72 | i += 1 73 | 74 | resp.status = falcon.HTTP_200 75 | resp.content_type = 'text/html' 76 | resp.body = template.render(dashboard_active='active', domains=domains, urls=urls, tags=tags, hashes=hashes) 77 | 78 | 79 | class DomainResource(object): 80 | def on_get(self, req, resp): 81 | template = load_template('domains.html') 82 | domains = Domain.select() 83 | 84 | resp.status = falcon.HTTP_200 85 | resp.content_type = 'text/html' 86 | resp.body = template.render(domains_active='active', domains=domains) 87 | 88 | 89 | class DomainDetailsResource(object): 90 | def on_delete(self, req, resp, did): 91 | domain = Domain.get(Domain.id == did) 92 | 93 | urls = Url.select().where(Url.domain == domain) 94 | for u in urls: 95 | ext_query = Extension.delete().where(Extension.url == u) 96 | ext_query.execute() 97 | 98 | query = Url.delete().where(Url.domain == domain) 99 | query.execute() 100 | 101 | query_domtag = DomainTag.delete().where(DomainTag.domain == domain) 102 | query_domtag.execute() 103 | 104 | domain.delete_instance() 105 | 106 | template = load_template('domains.html') 107 | domains = Domain.select() 108 | resp.status = falcon.HTTP_200 109 | resp.content_type = 'text/html' 110 | resp.body = template.render(domains_active='active', domains=domains) 111 | 112 | def on_get(self, req, resp, did): 113 | template = load_template('domain_details.html') 114 | domain = Domain.get(Domain.id == did) 115 | urls = Url.select().where(Url.domain == domain) 116 | tags = (Tag.select().join(DomainTag).join(Domain).where(Domain.id == did)) 117 | 118 | i = 0 119 | for tag in tags: 120 | tag.color = colors[i % len(colors)] 121 | i += 1 122 | 123 | resp.status = falcon.HTTP_200 124 | resp.content_type = 'text/html' 125 | resp.body = template.render(domain=domain, urls=urls, tags=tags) 126 | 127 | def on_post(self, req, resp, did): 128 | if "delete" in req.params: 129 | self.on_delete(req, resp, did) 130 | return 131 | template = load_template('domain_details.html') 132 | 133 | domain = Domain.get(Domain.id == did) 134 | domain.description = req.params['description'] 135 | domain.save() 136 | 137 | urls = Url.select().where(Url.domain 
== domain) 138 | tags = (Tag.select().join(DomainTag).join(Domain).where(Domain.id == did)) 139 | 140 | i = 0 141 | for tag in tags: 142 | tag.color = colors[i % len(colors)] 143 | i += 1 144 | 145 | resp.status = falcon.HTTP_200 146 | resp.content_type = 'text/html' 147 | resp.body = template.render(domain=domain, urls=urls, tags=tags) 148 | 149 | 150 | class UrlResource(object): 151 | def on_get(self, req, resp): 152 | template = load_template('urls.html') 153 | urls = Url.select() 154 | 155 | resp.status = falcon.HTTP_200 156 | resp.content_type = 'text/html' 157 | resp.body = template.render(urls_active='active', urls=urls) 158 | 159 | 160 | class UrlDetailsResource(object): 161 | def on_delete(self, req, resp, uid): 162 | url = Url.get(Url.id == uid) 163 | 164 | ext_query = Extension.delete().where(Extension.url == url) 165 | ext_query.execute() 166 | 167 | url.delete_instance() 168 | 169 | template = load_template('urls.html') 170 | urls = Url.select() 171 | resp.status = falcon.HTTP_200 172 | resp.content_type = 'text/html' 173 | resp.body = template.render(urls_active='active', urls=urls) 174 | 175 | def on_get(self, req, resp, uid): 176 | template = load_template('url_details.html') 177 | url = Url.get(Url.id == uid) 178 | extensions = Extension.select().where(Extension.url == url) 179 | 180 | resp.status = falcon.HTTP_200 181 | resp.content_type = 'text/html' 182 | resp.body = template.render(url=url, extensions=extensions) 183 | 184 | def on_post(self, req, resp, uid): 185 | if "delete" in req.params: 186 | self.on_delete(req, resp, uid) 187 | return 188 | template = load_template('url_details.html') 189 | 190 | url = Url.get(Url.id == uid) 191 | extensions = Extension.select().where(Extension.url == url) 192 | 193 | resp.status = falcon.HTTP_200 194 | resp.content_type = 'text/html' 195 | resp.body = template.render(url=url) 196 | 197 | 198 | # api initialization 199 | app = falcon.API() 200 | app.req_options.auto_parse_form_urlencoded = True 201 | dashboard = DashboardResource() 202 | domains = DomainResource() 203 | domain_details = DomainDetailsResource() 204 | urls = UrlResource() 205 | url_details = UrlDetailsResource() 206 | search = SearchResource() 207 | 208 | app.add_route('/', dashboard) 209 | app.add_route('/domain', domains) 210 | app.add_route('/domain/{did:int}', domain_details) 211 | 212 | app.add_route('/url', urls) 213 | app.add_route('/url/{uid:int}', url_details) 214 | 215 | app.add_route('/search', search) 216 | -------------------------------------------------------------------------------- /crawler/processing/external_intel_processing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import magic 4 | import requests 5 | import json 6 | from utils import SubCrawlColors, SubCrawlHelpers 7 | 8 | from .default_processing import DefaultProcessing 9 | 10 | 11 | class ExternalIntelProcessing(DefaultProcessing): 12 | 13 | cfg = None 14 | logger = None 15 | vt_api = None 16 | urlhaus_api = None 17 | bazaar_api = None 18 | submit_urlhaus = False 19 | submit_bazaar = False 20 | 21 | vt_api_url = "https://www.virustotal.com/api/v3/files/" 22 | urlhaus_api_url = "https://urlhaus-api.abuse.ch/v1/payload/" 23 | urlhaus_api_submit = "https://urlhaus.abuse.ch/api/" 24 | bazaar_api_url = "https://mb-api.abuse.ch/api/v1/" 25 | 26 | def __init__(self, config, logger): 27 | self.cfg = config 28 | self.logger = logger 29 | 30 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "vt_api"): 31 | 
self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] VirusTotal API Key not set' + 32 | SubCrawlColors.RESET) 33 | else: 34 | self.vt_api = SubCrawlHelpers.get_config( 35 | self.cfg, "external_intel", "vt_api") 36 | 37 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "urlhaus_api"): 38 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] URLHaus API Key not set' + 39 | SubCrawlColors.RESET) 40 | else: 41 | self.urlhaus_api = SubCrawlHelpers.get_config(self.cfg, "external_intel", "urlhaus_api") 42 | 43 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "bazaar_api"): 44 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Bazaar API Key not set' + 45 | SubCrawlColors.RESET) 46 | else: 47 | self.bazaar_api = SubCrawlHelpers.get_config(self.cfg, "external_intel", "bazaar_api") 48 | 49 | self.submit_urlhaus = SubCrawlHelpers.get_config(self.cfg, "external_intel", "submit_urlhaus") 50 | if not self.submit_urlhaus: 51 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Not uploading to URLHaus' + SubCrawlColors.RESET) 52 | 53 | self.submit_bazaar = SubCrawlHelpers.get_config(self.cfg, "external_intel", "submit_bazaar") 54 | if not self.submit_bazaar: 55 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Not uploading to Bazaar' + SubCrawlColors.RESET) 56 | 57 | 58 | def process(self, url, content): 59 | payload = {} 60 | content_match = True 61 | signature = None 62 | 63 | shasum = SubCrawlHelpers.get_sha256(content) 64 | content_magic = magic.from_buffer(content).lower() 65 | 66 | tags = [] 67 | if content_magic and any(partial in content_magic for partial in 68 | SubCrawlHelpers.get_config(self.cfg, "crawler", "pe_magics")): 69 | 70 | if "(dll)" in content_magic: 71 | tags.append("dll") 72 | else: 73 | tags.append("exe") 74 | 75 | if "x86-64" in content_magic: 76 | tags.append("x64") 77 | 78 | if "mono/.net" in content_magic: 79 | tags.append('.NET') 80 | tags.append('MSIL') 81 | 82 | if not self.urlhaus_api is None: 83 | signature = self.check_urlhaus(shasum, url, tags) 84 | 85 | if not self.bazaar_api is None: 86 | signature = self.check_bazaar(shasum, url, content, tags) 87 | 88 | if not self.vt_api is None: 89 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on VirusTotal:\t" + 90 | self.check_virustotal(shasum) + " \t\t(" + shasum + ")" + SubCrawlColors.RESET) 91 | elif content_magic and any(partial in content_magic for partial in 92 | SubCrawlHelpers.get_config(self.cfg, "crawler", "office_magics")): 93 | 94 | if "Microsoft Word" in content_magic or "Microsoft Office Word" in content_magic: 95 | tags.append("doc") 96 | elif "Microsoft Excel" in content_magic: 97 | tags.append('xls') 98 | elif "Rich Text Format" in content_magic: 99 | tags.append('rtf') 100 | elif "CDFV2 Encrypted" in content_magic: 101 | tags.append('encrypted') 102 | 103 | if not self.urlhaus_api is None: 104 | signature = self.check_urlhaus(shasum, url, tags) 105 | 106 | if not self.bazaar_api is None: 107 | signature = self.check_bazaar(shasum, url, content, tags) 108 | 109 | else: 110 | content_match = False 111 | 112 | if content_match: 113 | payload = {"hash": shasum, "url": url, "signature": signature} 114 | 115 | return payload 116 | 117 | def check_urlhaus(self, sha256, url, tags): 118 | status = SubCrawlColors.YELLOW + "NOT FOUND" + SubCrawlColors.CYAN 119 | signature = None 120 | sample_found = False 121 | post_data = {'sha256_hash': sha256} 122 | resp = requests.post(self.urlhaus_api_url, data = post_data) 123 | 124 | 
results = json.loads(resp.text) 125 | 126 | if results["query_status"] == "ok": 127 | status = "FOUND - " 128 | sample_found = True 129 | if not results['signature'] is None: 130 | status += results['signature'] 131 | signature = results['signature'] 132 | else: 133 | status += "No Signature" 134 | 135 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on URLHaus:\t" + status + "\t\t(" + sha256 + ")" + SubCrawlColors.RESET) 136 | 137 | if not sample_found and self.submit_urlhaus: 138 | self.logger.info(SubCrawlColors.PURPLE + "[ExternalIntel] Submitting file to URLHaus:\t" + url + SubCrawlColors.RESET) 139 | jsonDataURLHaus = { 140 | 'token' : self.urlhaus_api, 141 | 'anonymous' : '0', 142 | 'submission' : [ 143 | { 144 | 'url': url, 145 | 'threat': 'malware_download', 146 | 'tags': 147 | tags 148 | } 149 | ] 150 | } 151 | 152 | headers = { 153 | "Content-Type" : "application/json" 154 | } 155 | r = requests.post(self.urlhaus_api_submit, json=jsonDataURLHaus, timeout=15, headers=headers) 156 | if "inserted" in r.content.decode("utf-8"): 157 | self.logger.info(SubCrawlColors.GREEN + "[ExternalIntel] URL Submitted on URLHaus :)" + SubCrawlColors.RESET) 158 | else: 159 | self.logger.error(SubCrawlColors.RED + "[ExternalIntel] Problem Submitting URL on URLHaus :(\t" + r.content.decode("utf-8").replace("\n","") + SubCrawlColors.RESET) 160 | return signature 161 | 162 | def check_bazaar(self, sha256, url, content, tags): 163 | status = SubCrawlColors.YELLOW + "NOT FOUND" + SubCrawlColors.CYAN 164 | signature = None 165 | sample_found = False 166 | post_data = {'query':'get_info','hash': sha256} 167 | resp = requests.post(self.bazaar_api_url, data = post_data) 168 | results = json.loads(resp.text) 169 | 170 | if results["query_status"] == "ok": 171 | sig = "no sig" 172 | sample_found = True 173 | for sample in results['data']: 174 | if not sample['signature'] is None: 175 | sig = sample['signature'] 176 | signature = sample['signature'] 177 | else: 178 | sig = "No Signature" 179 | status = "FOUND - " + sig 180 | 181 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on Bazaar:\t" + status + "\t\t(" + sha256 + ")" + SubCrawlColors.RESET) 182 | 183 | if not sample_found and self.submit_bazaar: 184 | self.logger.info(SubCrawlColors.PURPLE + "[ExternalIntel] Submitting file to Bazaar:\t" + url + SubCrawlColors.RESET) 185 | 186 | jsonDataBazaar = { 187 | 'anonymous' : '0', 188 | 'delivery_method' : 'web_download', 189 | 'tags' : 190 | tags, 191 | 'context': { 192 | 'comment' : 'Found at ' + SubCrawlHelpers.defang_url(url) + ' by #subcrawl', 193 | } 194 | } 195 | 196 | files = { 197 | 'json_data' : (None,json.dumps(jsonDataBazaar), 'application/json'), 198 | 'file' : content 199 | } 200 | headers = {'API-KEY' : self.bazaar_api } 201 | 202 | r = requests.post(self.bazaar_api_url, files=files, verify=False, headers=headers) 203 | 204 | if "inserted" in r.content.decode("utf-8"): 205 | self.logger.info(SubCrawlColors.GREEN + "[ExternalIntel] Payload Submitted on Bazaar :)" + SubCrawlColors.RESET) 206 | else: 207 | self.logger.error(SubCrawlColors.RED + "[ExternalIntel] Problem Submitting Payload on Bazaar :(\t" + r.content.decode("utf-8").replace("\n","") + SubCrawlColors.RESET) 208 | return signature 209 | 210 | def check_virustotal(self,sha256): 211 | result = "NOT FOUND" 212 | headers = {'x-apikey':self.vt_api} 213 | resp = requests.get(self.vt_api_url + sha256, headers = headers) 214 | 215 | results = json.loads(resp.text) 216 | 217 | if not "error" in results: 
218 | result = "FOUND" 219 | 220 | return result 221 | -------------------------------------------------------------------------------- /crawler/subcrawl.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import argparse 3 | import base64 4 | import datetime 5 | import hashlib 6 | import inspect 7 | import io 8 | import json 9 | import os 10 | import re 11 | import sys 12 | import time 13 | from concurrent.futures import ProcessPoolExecutor 14 | from io import BytesIO 15 | from multiprocessing import Pool, cpu_count 16 | from urllib.parse import urljoin, urlparse 17 | 18 | import magic 19 | import requests 20 | import yaml 21 | from bs4 import BeautifulSoup 22 | from mergedeep import Strategy, merge 23 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 24 | 25 | from processing import * 26 | from storage import * 27 | from utils import (SubCrawlBanner, SubCrawlColors, SubCrawlHelpers, 28 | SubCrawlLogger, SubCrawlLoggerLevels) 29 | 30 | try: 31 | from kafka import KafkaConsumer 32 | consumer = KafkaConsumer( 33 | 'urls', 34 | bootstrap_servers=['kafka:9092'], 35 | auto_offset_reset='earliest', 36 | enable_auto_commit=True, 37 | group_id='urls-crawler', 38 | auto_commit_interval_ms=1000, 39 | consumer_timeout_ms=2000, 40 | value_deserializer=lambda x: json.loads(x.decode('utf-8'))) 41 | except: 42 | consumer = None 43 | 44 | # region global variables and configs 45 | 46 | # ignore TLS cert errors 47 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 48 | 49 | process_pool = None 50 | 51 | logger = None 52 | global_cfg = None # used in the main process 53 | process_cfg = None # used in the scraper processes 54 | process_processing_modules = None # used in the scraper process 55 | 56 | init_pages = [] # initial found pages by splitting the url 57 | crawl_pages = [] # found pages by scraping the initial urls 58 | 59 | storage_modules = [] 60 | processing_modules = [] 61 | 62 | # endregion 63 | 64 | 65 | def initialize(): 66 | global logger, global_cfg, process_pool 67 | 68 | with open("config.yml", "r") as ymlfile: 69 | global_cfg = yaml.safe_load(ymlfile) 70 | 71 | if not global_cfg: 72 | print('[!] 
Error loading configuration file, engine could not start') 73 | sys.exit(0) 74 | 75 | logger = SubCrawlLogger("subcrawl.log", "SubCrawl", 76 | SubCrawlLoggerLevels[SubCrawlHelpers.get_config( 77 | global_cfg, 'crawler', 78 | 'log_level').upper()].value).get_logger() 79 | 80 | 81 | def main(argv): 82 | 83 | banner = SubCrawlBanner(SubCrawlHelpers.get_config( 84 | global_cfg, "crawler", "logos_path"), 85 | SubCrawlHelpers.get_config(global_cfg, 86 | "crawler", "tag_line")) 87 | banner.print_banner() 88 | 89 | options = setup_args(argv) 90 | 91 | start_time = datetime.datetime.now() 92 | 93 | # region process storage/payload modules 94 | 95 | str_storage_modules = list() 96 | if options.storage_modules: 97 | for storage_module in options.storage_modules.split(","): 98 | str_storage_modules.append(storage_module) 99 | else: 100 | for storage_module in SubCrawlHelpers.get_config(global_cfg, "crawler", 101 | "storage_modules"): 102 | str_storage_modules.append(storage_module) 103 | 104 | for storage_module in str_storage_modules: 105 | try: 106 | dynamic_class = str2Class(storage_module.strip()) 107 | storage_modules.append(dynamic_class(global_cfg, logger)) 108 | logger.info("[ENGINE] Loaded storage module: " + storage_module) 109 | except Exception as e: 110 | logger.error("[ENGINE] Error loading storage module: " + storage_module) 111 | 112 | str_processing_modules = list() 113 | if options.processing_modules: 114 | for processing_module in options.processing_modules.split(","): 115 | str_processing_modules.append(processing_module) 116 | else: 117 | for processing_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "processing_modules"): 118 | str_processing_modules.append(str(processing_module)) 119 | 120 | for processing_module in str_processing_modules: 121 | try: 122 | dynamic_class = str2Class(processing_module.strip()) 123 | processing_modules.append(dynamic_class(global_cfg, logger)) 124 | logger.info("[ENGINE] Loaded processing module: " + processing_module) 125 | except Exception as e: 126 | logger.error("[ENGINE] Error loading processing module: " + processing_module + ": " + str(e)) 127 | 128 | # endregion 129 | 130 | cpus = cpu_count() 131 | if cpus > 1: 132 | cpus = cpus - 1 133 | process_pool = ProcessPoolExecutor(cpus) 134 | 135 | scrape_urls = set() 136 | scraped_domains = set() 137 | for s_module in storage_modules: 138 | scraped_domains.update(s_module.load_scraped_domains()) 139 | 140 | logger.info("[ENGINE] Parsing input sources...") 141 | 142 | # region gather input URLs 143 | if options.kafka and consumer: 144 | logger.info("[ENGINE] Using Kafka queue for URL processing...") 145 | for message in consumer: 146 | url = message.value 147 | if SubCrawlHelpers.is_valid_url(url): 148 | parsed = urlparse(url) 149 | if parsed.netloc not in scraped_domains: 150 | parsed_url = url 151 | if not url.endswith("/"): 152 | parsed_url = remove_url_resource(url) 153 | if parsed_url: 154 | scrape_urls.add(parsed_url) 155 | scraped_domains.add(parsed.netloc) 156 | else: 157 | logger.debug("[~] Domain already added to the scanning queue: " 158 | + SubCrawlHelpers.defang_url(str(parsed.netloc))) 159 | else: 160 | logger.info("[ENGINE] Using file input for URL processing...") 161 | try: 162 | with open(options.file_path, 'r') as f: 163 | for url in f: 164 | try: 165 | url = url.strip() 166 | parsed = urlparse(url) 167 | if parsed.netloc not in scraped_domains: 168 | parsed_url = url 169 | if not url.endswith('exe') and not url.endswith("/"): 170 | parsed_url = 
remove_url_resource(url) 171 | if parsed_url: 172 | scrape_urls.add(parsed_url) 173 | scraped_domains.add(parsed.netloc) 174 | else: 175 | logger.debug("[ENGINE] Domain already added to the scanning queue: " 176 | + str(parsed.netloc)) 177 | except Exception as e: 178 | logger.error("[ENGINE] Error reading input file for URL processing: " + str(e)) 179 | except Exception as e: 180 | logger.error("[ENGINE] Error reading input file for URL processing: " + str(e)) 181 | sys.exit(-1) 182 | 183 | logger.info("[ENGINE] Found " + str(len(scrape_urls)) + " hosts to scrape") 184 | 185 | # endregion 186 | 187 | # region generate new URLs 188 | 189 | domain_urls = dict() 190 | distinct_urls = list() 191 | for start_url in scrape_urls: 192 | # This will add the full URL if it ends with an extension, then passes it along for parsing 193 | if start_url.endswith('.exe'): 194 | logger.debug("[ENGINGE] Adding EXE URL directly: " + SubCrawlHelpers.defang_url(start_url)) 195 | if start_url not in distinct_urls: 196 | distinct_urls.append(start_url) 197 | domain_urls.setdefault(parsed.netloc, []).append(start_url) 198 | start_url = remove_url_resource(start_url) 199 | 200 | parsed = urlparse(start_url) 201 | base = parsed.scheme + "://" + parsed.netloc 202 | paths = parsed.path[:-1].split('/') # remove the trailing '/' to avoid an empty path 203 | tmp_url = base 204 | 205 | if not SubCrawlHelpers.get_config(global_cfg, "crawler", "scan_simple_domains") and len(paths) == 1 and paths[0] == "": 206 | continue # don't scan simple domains. 207 | 208 | for path in paths: 209 | try: 210 | tmp_url = urljoin(tmp_url, path) + "/" 211 | tmp_url_parsed = urlparse(tmp_url) 212 | 213 | logger.debug("Generated new URL: " + SubCrawlHelpers.defang_url(tmp_url)) 214 | 215 | if tmp_url not in distinct_urls: 216 | distinct_urls.append(tmp_url) 217 | domain_urls.setdefault(parsed.netloc, []).append(tmp_url) 218 | except Exception as e: 219 | logger.debug("[ENGINE] error parsing generated url: " + str(e)) 220 | 221 | # endregion 222 | 223 | logger.info("[ENGINE] Done parsing URLs, ready to begin scraping " + str(len(domain_urls)) + " hosts and " + str(len(distinct_urls)) + " URLs... 
starting in " + str(SubCrawlHelpers.get_config(global_cfg, "crawler", "delay_execution_time")) + " seconds!") 224 | time.sleep(int(SubCrawlHelpers.get_config(global_cfg, "crawler", 225 | "delay_execution_time"))) 226 | 227 | # region crawl 228 | 229 | # used to convert url dict per domain into list of lists 230 | list_of_domains = list() 231 | for domain in domain_urls: 232 | url_list = list() 233 | for url in domain_urls[domain]: 234 | url_list.append(url) 235 | list_of_domains.append((url_list, global_cfg, processing_modules)) 236 | 237 | # batch defines amount of domains to scan before calling storage modules 238 | for batch_urls in chunks(list_of_domains, 239 | SubCrawlHelpers.get_config(global_cfg, "crawler", 240 | "batch_size")): 241 | scrape_data = [] # result data of url scraping 242 | final_crawl_pages = set() 243 | result_dicts = process_pool.map(scrape_manager, batch_urls) 244 | 245 | original = dict() 246 | for result in result_dicts: 247 | merge(original, result, strategy=Strategy.ADDITIVE) 248 | 249 | scrape_data = original["scrape_data"] if "scrape_data" in original \ 250 | else dict() 251 | crawl_pages = set(original["crawl_pages"]) if "crawl_pages" in \ 252 | original else set() 253 | final_crawl_pages.update(crawl_pages) 254 | 255 | for s_module in storage_modules: 256 | s_module.store_result(scrape_data) 257 | 258 | elapsed = datetime.datetime.now() - start_time 259 | logger.info("Execution time (D:H:M:S): %02d:%02d:%02d:%02d" % (elapsed.days, elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60)) 260 | 261 | # endregion 262 | 263 | 264 | def scrape_manager(data): 265 | domain_urls, cfg, processing_modules = data 266 | global process_cfg 267 | global init_pages 268 | global process_processing_modules 269 | 270 | process_cfg = cfg 271 | init_pages = domain_urls 272 | process_processing_modules = processing_modules 273 | 274 | logger.debug("[ENGINE] Starting down path... 
" + SubCrawlHelpers.defang_url(domain_urls[0])) 275 | 276 | result_dicts = list() 277 | for url in domain_urls: 278 | s_data = [] 279 | scrape_result = scrape(url, s_data) 280 | result_dicts.append(scrape_result) 281 | 282 | original = dict() 283 | for result in result_dicts: 284 | if "scrape_data" in result: 285 | result["scrape_data"] = json.loads(result["scrape_data"]) 286 | merge(original, result, strategy=Strategy.ADDITIVE) 287 | 288 | return original 289 | 290 | 291 | def scrape(start_url, s_data): 292 | try: 293 | scrape_domain = dict() 294 | request_start = datetime.datetime.now() 295 | logger.debug("[ENGINE] Scanning URL: " + SubCrawlHelpers.defang_url(start_url)) 296 | resp = requests.get(start_url, timeout=SubCrawlHelpers.get_config( 297 | process_cfg, "crawler", "http_request_timeout"), 298 | headers=SubCrawlHelpers.get_config(process_cfg, "crawler", 299 | "headers"), 300 | verify=False, allow_redirects=SubCrawlHelpers.get_config(process_cfg, "crawler", 301 | "follow_redirects"),) 302 | 303 | if resp.status_code == 200: 304 | response_size_ok = True 305 | size = 0 306 | maxsize = SubCrawlHelpers.get_config(process_cfg, "crawler", 307 | "http_max_size") 308 | ctt = BytesIO() 309 | 310 | for chunk in resp.iter_content(2048): 311 | size += len(chunk) 312 | ctt.write(chunk) 313 | current_time = datetime.datetime.now() 314 | if size > maxsize or \ 315 | (current_time - request_start).total_seconds() > \ 316 | SubCrawlHelpers.get_config(process_cfg, "crawler", 317 | "http_download_timeout"): 318 | resp.close() 319 | response_size_ok = False 320 | logger.debug("[ENGINE] Response too large or download timeout: " + start_url) 321 | break 322 | 323 | if response_size_ok: 324 | content = ctt.getvalue() 325 | signature = "" 326 | title = None 327 | bs = None 328 | content_magic = "NONE" 329 | try: 330 | bs = BeautifulSoup(str(content), "html.parser") 331 | title = bs.find('title') 332 | except: 333 | bs = None 334 | content_magic = magic.from_buffer(content).lower() 335 | module_results = {} 336 | if title is not None and bs is not None\ 337 | and any(partial in title.get_text().lower() for partial in \ 338 | SubCrawlHelpers.get_config(process_cfg, "crawler", "opendir_title")): 339 | 340 | for link in bs.find_all('a'): 341 | if link.has_attr('href'): 342 | href = link.attrs['href'] 343 | if href is not None and not href.startswith("?"): 344 | next_page = urljoin(start_url, href) 345 | 346 | if next_page not in crawl_pages and next_page not in init_pages \ 347 | and not next_page.lower().endswith(tuple(SubCrawlHelpers.get_config(process_cfg, "crawler", "ext_exclude"))): 348 | logger.debug("[ENGINE] Discovered: " + SubCrawlHelpers.defang_url(next_page)) 349 | crawl_pages.append(next_page) 350 | scrape(next_page, s_data) 351 | else: 352 | for p_module in process_processing_modules: 353 | mod_res = p_module.process(start_url, content) 354 | if mod_res: 355 | module_results[type(p_module).__name__] = mod_res 356 | 357 | title = bs.select_one('title') 358 | if title: 359 | title = title.string 360 | 361 | try: 362 | text = base64.b64encode(content).decode('utf-8', errors='ignore') 363 | except Exception as e: 364 | logger.error("[ENGINE] " + str(e)) 365 | 366 | scrape_entry = { 367 | 'scraped_on': datetime.datetime.now().isoformat(), 368 | 'sha256': SubCrawlHelpers.get_sha256(content), 369 | 'url': start_url, 370 | 'content_type': content_magic, 371 | 'signature': signature, 372 | 'data': { 373 | 'text': text, 374 | 'title': title, 375 | 'resp': { 376 | 'headers': dict(resp.headers) if resp else 
'', 377 | 'status_code': resp.status_code if resp else '', 378 | }, 379 | }, 380 | "modules": {} 381 | } 382 | 383 | scrape_entry["modules"] = module_results 384 | s_data.append(scrape_entry) 385 | parsed = urlparse(start_url) 386 | scrape_domain = {parsed.netloc: s_data} 387 | 388 | except Exception as e: 389 | logger.debug("[ENGINE] " + str(e)) 390 | 391 | return {"crawl_pages": crawl_pages, "scrape_data": json.dumps(scrape_domain)} 392 | 393 | 394 | def remove_url_resource(unparsed_url): 395 | try: 396 | parsed_url = urlparse(unparsed_url) 397 | last_slash = parsed_url.path.rindex('/') 398 | return unparsed_url.replace(parsed_url.path[last_slash +1:], "") 399 | except Exception as e: 400 | logger.error("[URL_PARSER] Error with URL " + unparsed_url + str(e)) 401 | return None 402 | 403 | 404 | def chunks(lst, n): 405 | """Yield successive n-sized chunks from lst.""" 406 | for i in range(0, len(lst), n): 407 | yield lst[i:i + n] 408 | 409 | 410 | def unique_content(content): 411 | unique_dict = dict() 412 | for key in content: 413 | unique_dict[key] = set(content[key]) 414 | return unique_dict 415 | 416 | 417 | def str2Class(str): 418 | return getattr(sys.modules[__name__], str) 419 | 420 | 421 | def print_classes(): 422 | clsmembers_storage = inspect.getmembers(sys.modules["storage"], inspect.isclass) 423 | clsmembers_processing = inspect.getmembers(sys.modules["processing"], inspect.isclass) 424 | 425 | print("\n Available processing modules: ") 426 | for mod in clsmembers_processing: 427 | print(" - " + mod[0]) 428 | 429 | print("\n Available storage modules: ") 430 | for mod in clsmembers_storage: 431 | print(" - " + mod[0]) 432 | 433 | 434 | def setup_args(argv): 435 | parser = argparse.ArgumentParser(description="") 436 | 437 | parser.add_argument('-f', '--file', action="store", dest="file_path", help="Path of input URL file") 438 | 439 | parser.add_argument('-k', '--kafka', action="store_true", dest="kafka", help="Use Kafka Queue as input") 440 | 441 | parser.add_argument('-p', '--processing', action="store", dest="processing_modules", help="Processing modules to be executed comma separated.") 442 | 443 | parser.add_argument('-s', '--storage', action="store", dest="storage_modules", help="Storage modules to be executed comma separated.") 444 | 445 | if len(argv) == 0: 446 | parser.print_help() 447 | print_classes() 448 | sys.exit(0) 449 | 450 | return parser.parse_args() 451 | 452 | 453 | initialize() 454 | 455 | if __name__ == '__main__': 456 | main(sys.argv[1:]) 457 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SubCrawl 2 | 3 | SubCrawl is a framework developed by [Patrick Schläpfer](https://twitter.com/stoerchl), [Josh Stroschein](https://twitter.com/jstrosch) and [Alex Holland](https://twitter.com/cryptogramfan) of HP Inc’s [Threat Research](https://threatresearch.ext.hp.com/blog/) team. SubCrawl is designed to find, scan and analyze open directories. The framework is modular, consisting of four components: input modules, processing modules, output modules and the core crawling engine. URLs are the primary input values, which the framework parses and adds to a queuing system before crawling them. The parsing of the URLs is an important first step, as this takes a submitted URL and generates additional URLs to be crawled by removing sub-directories, one at a time until none remain. 
For example, a submitted URL of http://example.com/a/b/payload.exe also yields http://example.com/a/b/, http://example.com/a/ and http://example.com/ for crawling. This process ensures a more complete scan attempt of a web server and can lead to the discovery of additional content. Notably, SubCrawl does not use a brute-force method for discovering URLs. All the content scanned comes from the input URLs, the process of parsing the URL and discovery during crawling. When an open directory is discovered, the crawling engine extracts links from the directory for evaluation. The crawling engine determines if the link is another directory or if it is a file. Directories are added to the crawling queue, while files undergo additional analysis by the processing modules. Results are generated and stored for each scanned URL, such as the SHA256 and fuzzy hashes of the content, if an open directory was found, or matches against YARA rules. Finally, the result data is processed according to one or more output modules, of which there are currently four. The first provides integration with MISP, the second simply prints the data to the console, the third stores the data in an SQLite database, and the fourth indexes it in Elasticsearch. Since the framework is modular, it is not only easy to configure which input, processing and output modules are desired, but also straightforward to develop new modules. 4 | 5 | ![Framework Architecture](images/architecture.png) 6 | _Figure 1 - SubCrawl architecture_ 7 | 8 | SubCrawl supports two different modes of operation. First, SubCrawl can be started in a run-once mode. In this mode, the user supplies the URLs to be scanned in a file where each input value is separated by a line break. The second mode of operation is service mode. In this mode, SubCrawl runs in the background and relies on the input modules to supply the URLs to be scanned. Figure 1 shows an overview of SubCrawl’s architecture. The components that are used in both modes of operation are blue, run-once mode components are yellow, and service mode components are green. 9 | 10 | ## Requirements 11 | 12 | Depending on the chosen run mode, different preconditions must be met. 13 | 14 | ### Run-Once Mode Requirements 15 | 16 | SubCrawl is written in Python 3. In addition, there are several packages that are required before running SubCrawl. To install all of them, run the following commands from the *crawler* directory: 17 | 18 | ``` 19 | $ sudo apt install build-essential 20 | $ pip3 install -r requirements.txt 21 | ``` 22 | 23 | ### Service Mode Requirements 24 | 25 | If SubCrawl is started in service mode, this is done using Docker, so the installation of Docker and Docker Compose is required. Good installation instructions for this can be found directly on the Docker.com website. 26 | - [Installing Docker Engine](https://docs.docker.com/engine/install/ubuntu/) 27 | - [Installing Docker Compose](https://docs.docker.com/compose/install/) 28 | 29 | ## Getting Help 30 | 31 | SubCrawl has built-in help through the _-h/--help_ argument or by simply executing the script without any arguments.
32 | 33 | ``` 34 | ******** ** ****** ** 35 | **////// /** **////** /** 36 | /** ** **/** ** // ****** ****** *** ** /** 37 | /*********/** /**/****** /** //**//* //////** //** * /** /** 38 | ////////**/** /**/**///**/** /** / ******* /** ***/** /** 39 | /**/** /**/** /**//** ** /** **////** /****/**** /** 40 | ******** //******/****** //****** /*** //******** ***/ ///** *** 41 | //////// ////// ///// ////// /// //////// /// /// /// 42 | ~~ Harvesting the Open Web ~~ 43 | 44 | usage: subcrawl.py [-h] [-f FILE_PATH] [-k] [-p PROCESSING_MODULES] [-s STORAGE_MODULES] 45 | 46 | optional arguments: 47 | -h, --help show this help message and exit 48 | -f FILE_PATH, --file FILE_PATH 49 | Path of input URL file 50 | -k, --kafka Use Kafka Queue as input 51 | -p PROCESSING_MODULES, --processing PROCESSING_MODULES 52 | Processing modules to be executed comma separated. 53 | -s STORAGE_MODULES, --storage STORAGE_MODULES 54 | Storage modules to be executed comma separated. 55 | 56 | Available processing modules: 57 | - ExternalIntelProcessing 58 | - ClamAVProcessing 59 | - JARMProcessing 60 | - PayloadProcessing 61 | - TLSHProcessing 62 | - YARAProcessing 63 | 64 | Available storage modules: 65 | - ElasticStorage 66 | - ConsoleStorage 67 | - MISPStorage 68 | - SqliteStorage 69 | ``` 70 | 71 | ## Run-Once Mode 72 | 73 | This mode is suitable if you want to quickly scan a manageable number of domains. For this purpose, the URLs to be scanned must be saved in a file, which then serves as input for the crawler. The following is an example of executing in run-once mode; note that the _-f_ argument is used with a path to a file. 74 | 75 | ``` 76 | python3 subcrawl.py -f urls.txt -p YARAProcessing,PayloadProcessing -s ConsoleStorage 77 | ``` 78 | 79 | ## Service Mode 80 | 81 | With the service mode, a larger number of domains can be scanned and the results saved. Based on the selected storage module, the data can then be analyzed and evaluated in more detail. To make running the service mode as easy as possible for the user, we built all the functionalities into a Docker image. In service mode, the domains to be scanned are obtained via input modules. By default, new malware and phishing URLs are downloaded from [URLhaus](https://urlhaus.abuse.ch/) and [PhishTank](https://www.phishtank.com/) and queued for scanning. The desired processing and storage modules can be entered directly in the `config.yml`. By default, the following processing modules are activated, utilizing the SQLite storage: 82 | - ClamAVProcessing 83 | - JARMProcessing 84 | - TLSHProcessing 85 | - YARAProcessing 86 | 87 | In addition to the SQLite storage module, a simple web UI was developed that allows viewing and managing the scanned domains and URLs. 88 | 89 | ![Web UI for SQLite storage module](images/webui.png) 90 | 91 | However, if this UI is not sufficient for the subsequent evaluation of the data, the MISP storage module can be activated alternatively or additionally. The corresponding settings must be made in `config.yml` under the `MISP` section. 92 | 93 | The following two commands are enough to clone the Git repository, create the Docker containers and start them directly. Afterwards, the web UI can be reached at the address `https://localhost:8000/`. Please note that once the containers have started, the input modules will begin to add URLs to the processing queue and the engine will begin crawling hosts.
94 | 95 | 96 | ``` 97 | git clone https://github.com/hpthreatresearch/subcrawl.git 98 | 99 | docker-compose up --build 100 | ``` 101 | 102 | ## SubCrawl Modules 103 | 104 | ### Input Modules 105 | 106 | Input modules are only used in service mode. If SubCrawl is started in run-once mode, a file containing the URLs to scan must be supplied. The following two input modules have been implemented. 107 | 108 | #### URLhaus 109 | 110 | [URLhaus](https://urlhaus.abuse.ch/) is a prominent web service tracking malicious URLs. The web service also provides exports containing newly detected URLs. Those malware URLs serve as perfect input to our crawler, as we mainly want to analyze malicious domains. Recently submitted URLs are retrieved, and the search results are not refined through the API request (i.e. through tags or other available parameters). The HTTP request made in this [input module](crawler/input/urlhaus.py) to the URLHaus API can be modified to further refine the results obtained. 111 | 112 | #### PhishTank 113 | 114 | [PhishTank](https://www.phishtank.com/) is a website that collects phishing URLs. Users can submit newly found phishing pages. An export with active phishing URLs can be generated and downloaded from this web service via API, so this is also an ideal collection for our crawler. 115 | 116 | ### Processing Modules 117 | 118 | SubCrawl comes with several processing modules. The processing modules all follow similar behavior in how they provide results back to the core engine. If matches are found, results are returned to the core engine and later provided to the storage modules. Below is a list of processing modules. 119 | 120 | #### External Intelligence (Abuse.ch, VirusTotal) 121 | 122 | The [ExternalIntel](https://github.com/jstrosch/subcrawl/blob/main/crawler/processing/external_intel_processing.py) processing module is used to check for the presence of a URL on URLHaus, or of a payload (via its SHA256 hash) on MalwareBazaar. If the value exists, the module will parse the response and print the family tag associated with it. Optionally, this module can be used to submit samples and URLs to each respective Abuse.ch service. This module depends on the appropriate API key being configured in the external_intel section of the [primary configuration](https://github.com/jstrosch/subcrawl/blob/main/crawler/config.yml). 123 | 124 | ![external intelligence processing module output](images/external_intel.png) 125 | 126 | #### SDHash 127 | 128 | The [SDHash](https://github.com/sdhash/sdhash) processing module is used to calculate a similarity hash of the HTTP response. The content must be at least 512 bytes to be able to successfully calculate a hash. This is probably the most complicated processing module to install, as it requires Protobuf and, depending on the target host, it must be recompiled. Therefore this processing module is deactivated by default. An already compiled version can be found in crawler/processing/minisdhash/ which requires protobuf-2.5.0 and python3.6. Those binaries were compiled on an Ubuntu 18.04.5 LTS x64.
Follow the installation instructions below: 129 | 130 | ``` 131 | # Protobuf installation 132 | > apt-get update 133 | > apt-get -y install libssl-dev libevent-pthreads-2.1-6 libomp-dev g++ 134 | > apt-get -y install autoconf automake libtool curl make g++ unzip 135 | > wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.zip 136 | > unzip protobuf-2.5.0.zip 137 | > cd protobuf-2.5.0 138 | > ./configure 139 | > make 140 | > sudo make install 141 | 142 | # Python3.6 installation 143 | > apt-get install python3.6-dev 144 | > sudo ldconfig 145 | 146 | # SDHash installation 147 | > git clone https://github.com/sdhash/sdhash.git 148 | > cd sdhash 149 | > make 150 | > make install 151 | > ldconfig 152 | ``` 153 | 154 | 155 | #### JARM 156 | 157 | [JARM](https://github.com/salesforce/jarm) is a tool developed by Salesforce that fingerprints TLS connections. The JARM processing module performs a scan of the domain and returns a JARM hash with the domain to the core engine. Depending on the configuration of a web server, the TLS handshake has different properties. By calculating a hash of the attributes of this handshake, these differences can be used to track web server configurations. 158 | 159 | #### TLSH 160 | 161 | The [TLSH](https://github.com/trendmicro/tlsh) processing module is similar to the SDHash processing module and is also used to calculate a similarity hash. The advantage of TLSH is that the installation is much simpler and the minimum input size is smaller, at 50 bytes. As most webshell logins are rather small and were the focus of our research, we activated this processing module by default. 162 | 163 | #### YARA 164 | 165 | The YARA processing module is used to scan HTTP response content with YARA rules. To invoke this processing module, provide the value *YARAProcessing* as a processing module argument. For example, the following command will load the YARA processing module and produce output to the console via the ConsoleStorage storage module. 166 | 167 | ``` 168 | python3 subcrawl.py -p YARAProcessing -s ConsoleStorage 169 | ``` 170 | 171 | Currently, the YARA processing module is used to identify webshell logins and various other interesting content. YARA rules included with this project: 172 | 173 | * protected_webshell: Identifies login pages of password-protected webshells 174 | * js_webshell_tracking_script: Identifies backdoored plugins/themes that use JavaScript 175 | to notify the attacker when the webshell becomes active 176 | * open_webshell: Identifies open webshells (i.e. webshells that are not protected via login) 177 | * php_webshell_backend: Identifies PHP webshell backends used by the attacker 178 | 179 | Sample output: 180 | ![Yara processing output](images/yara-output.png) 181 | 182 | To add additional YARA rules, you can add .YAR files to the *yara-rules* folder, and then include the rule file by adding an *include* statement to *combined-rules.yar*. 183 | 184 | #### ClamAV 185 | 186 | The ClamAV processing module is used to scan HTTP response content with ClamAV. If a match is found, it is provided to the various output modules. To invoke this processing module, provide the value *ClamAVProcessing* as a processing module argument. For example, the following command will load the ClamAV processing module and produce output to the console via the ConsoleStorage storage module.
187 | 188 | ``` 189 | python3 subcrawl.py -p ClamAVProcessing -s ConsoleStorage 190 | ``` 191 | 192 | Sample output: 193 | ![ClamAV Processing Module](images/clamav-output.png) 194 | 195 | To utilize this module, ClamAV must be installed. From a terminal, install ClamAV using the APT package manager: 196 | 197 | ``` 198 | $ sudo apt-get install clamav-daemon clamav-freshclam clamav-unofficial-sigs 199 | ``` 200 | Once installed, the ClamAV update service should already be running. However, if you want to manually update using *freshclam*, ensure that the service is stopped: 201 | ``` 202 | sudo systemctl stop clamav-freshclam.service 203 | ``` 204 | And then run *freshclam* manually: 205 | ``` 206 | $ sudo freshclam 207 | ``` 208 | Finally, check the status of the ClamAV service: 209 | ``` 210 | $ sudo systemctl status clamav-daemon.service 211 | ``` 212 | If the service is not running, you can use *systemctl* to start it: 213 | ``` 214 | $ sudo systemctl start clamav-daemon.service 215 | ``` 216 | 217 | #### Payload 218 | 219 | The Payload processing module is used to identify HTTP response content using the *libmagic* library. Additionally, SubCrawl can be configured to save content of interest, such as PE files or archives. To invoke this processing module, provide the value *PayloadProcessing* as a processing module argument. For example, the following command will load the Payload processing module and produce output to the console: 220 | 221 | ``` 222 | python3 subcrawl.py -p PayloadProcessing -s ConsoleStorage 223 | ``` 224 | 225 | There are no additional dependencies for this module. 226 | 227 | Sample output: 228 | ![Payload processing output](images/payload-output.png) 229 | 230 | 231 | ### Storage Modules 232 | 233 | Storage modules are called by the SubCrawl engine after all URLs from the queue have been scanned. They were designed with two objectives in mind: first, to provide the results from scanning immediately after finishing the scan queue, and second, to enable long-term storage and analysis. Therefore we not only implemented a ConsoleStorage module but also an integration for MISP and an SQLite storage module. 234 | 235 | #### Console 236 | 237 | To quickly analyse results directly after scanning URLs, a well-formatted output is printed to the console. This output is best suited for when SubCrawl is used in run-once mode. While this approach works well for scanning single domains or generating quick output, it is unwieldy for long-term research and analysis. 238 | 239 | ![Console Storage UI](images/console-storage.png) 240 | 241 | #### Elastic 242 | 243 | Integration with an Elastic cluster is also available. Each URL, along with its data, will be indexed as an event; this includes output from other modules such as YARA. A default dashboard has also been added to help get started using this module. Updates to the _elasticsearch_ section of the configuration will need to be made, including: 244 | 245 | * Elasticsearch host (default localhost) 246 | * Port to reach Elasticsearch on (default 9200) 247 | * Index name (default subcrawl) 248 | * Archive response content - this saves the HTTP response body to disk (default False) 249 | * Archive log location - location to save response content (default log/) 250 | 251 | To use this output module, provide the value *ElasticStorage* with the _-s_ argument. 252 | 253 | #### SQLite 254 | 255 | Since the installation and configuration of MISP can be time-consuming, we implemented another module which stores the data in an SQLite database.
To present the data to the user as simply and clearly as possible, we also developed a simple web GUI. Using this web application, the scanned domains and URLs can be viewed and searched with all their attributes. Since this is only an early version, no complex comparison features have been implemented yet. 256 | 257 | ![SQLite UI](images/sqlite-storage.png) 258 | 259 | #### MISP 260 | 261 | [MISP](https://www.misp-project.org/) is an open-source threat intelligence platform with a flexible data model and API to store and analyze threat data. SubCrawl stores crawled data in MISP events, publishing one event per domain and adding any identified open directories as attributes. MISP also allows users to define tags for events and attributes. This is helpful for event comparison and link analyses. Since this was one of our primary research goals, we enriched the data from URLHaus when exporting SubCrawl’s output to MISP. URLHaus annotates its data using tags which can be used to identify a malware family or threat actor associated with a URL. For each open directory URL, the module queries locally-stored URLHaus data and adds URLHaus tags to the MISP event if they match. To avoid having a collection of unrelated attributes for each MISP event, we created a new MISP object for scanned URLs, called opendir-url. This ensures that related attributes are kept together, making it easier to get an overview of the data. 262 | 263 | ![MISP UI](images/misp-overview.png) 264 | 265 | ## Building your own Modules 266 | 267 | Templates for processing and storage modules are provided as part of the framework. 268 | 269 | ### Processing Modules 270 | 271 | Processing modules can be found under `crawler->processing`, and a sample module file, `example_processing.py`, can be found in this directory. The template provides the necessary inheritance and imports to ensure execution by the framework. The _init_ function provides for module initialization and receives an instance of the logger and the global configuration. The logger is used to provide logging information from the processing modules, as well as throughout the framework. 272 | 273 | The _process_ function is implemented to process each HTTP response. To this end, it receives the URL and the raw response content. This is where the work of the module is implemented. This function should return a dictionary with the following fields: 274 | 275 | - hash: the sha256 of the content 276 | - url: the URL the content was retrieved from 277 | - matches: any matching results from the module, for example libmagic or YARA results. 278 | 279 | A unique class name must be defined and is used to identify this module when including it via the _-p_ argument or as a default processing module in the configuration file. 280 | 281 | Finally, add an import statement in [`__init__.py`](crawler/processing/__init__.py), using your class name: 282 | 283 | ``` 284 | from ._processing import Processing 285 | ``` 286 | 287 | ### Storage Modules 288 | 289 | Storage modules can be found under `crawler->storage`, and a sample module file, `example_storage.py`, can be found in this directory. Similar to the processing modules, the _init_ function provides for module initialization and receives an instance of the logger and the global configuration. The _store_results_ function receives structured data from the engine at intervals defined by the batch size in the configuration file. Minimal sketches of both module types are shown below.
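To make the processing module template described above more concrete, the following is a minimal, hypothetical sketch of a processing module that only reports ZIP archives. The module file name, the class name and the `DefaultProcessing` base class are assumptions made for illustration; `example_processing.py` remains the authoritative template.

```
# example_zip_processing.py - hypothetical file name, for illustration only
import magic  # python-magic, already used by the bundled processing modules

from utils import SubCrawlHelpers  # framework helper used here for hashing

# Assumption: the class exported by default_processing.py is named
# DefaultProcessing; check example_processing.py for the exact base class.
from .default_processing import DefaultProcessing


class ZipProcessing(DefaultProcessing):

    def __init__(self, config, logger):
        # The engine instantiates each module with the global configuration
        # and the shared logger (see subcrawl.py above).
        self.cfg = config
        self.logger = logger

    def process(self, url, content):
        # Called once per HTTP response with the URL and the raw bytes.
        # Returning an empty value tells the engine there was no match.
        content_magic = magic.from_buffer(content).lower()
        if "zip archive" not in content_magic:
            return {}

        return {
            "hash": SubCrawlHelpers.get_sha256(content),
            "url": url,
            "matches": [content_magic]
        }
```

With the class imported in `__init__.py`, such a module could then be selected with `-p ZipProcessing` or listed under the default processing modules in `config.yml`.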
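Similarly, here is a minimal, hypothetical sketch of a storage module that appends results to a JSON Lines file. Note that the engine shown in `subcrawl.py` above calls `load_scraped_domains()` once at start-up and `store_result()` after each batch; the file name, class name and `DefaultStorage` base class are again assumptions for illustration, with `example_storage.py` being the authoritative template.

```
# example_jsonl_storage.py - hypothetical file name, for illustration only
import json

# Assumption: the class exported by default_storage.py is named
# DefaultStorage; check example_storage.py for the exact base class.
from .default_storage import DefaultStorage


class JsonLinesStorage(DefaultStorage):

    def __init__(self, config, logger):
        self.cfg = config
        self.logger = logger

    def load_scraped_domains(self):
        # The engine uses this to skip hosts that were already crawled;
        # return an iterable of domain names (empty here).
        return []

    def store_result(self, scrape_data):
        # scrape_data is a dict keyed by domain, each value being a list of
        # scrape entries (url, sha256, content_type, module results, ...)
        # as assembled in subcrawl.py. Each entry is written as one JSON line.
        with open("scrape_results.jsonl", "a") as out:
            for domain, entries in scrape_data.items():
                for entry in entries:
                    out.write(json.dumps({
                        "domain": domain,
                        "url": entry.get("url"),
                        "sha256": entry.get("sha256"),
                        "modules": entry.get("modules")
                    }) + "\n")
```

Such a module could then be selected with `-s JsonLinesStorage` after importing the class in the storage package's `__init__.py`.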
290 | 291 | A unique class name must be defined and is used to load the module when including it via the _-s_ argument or as a default processing module in the configuration file. 292 | 293 | ## Presentations and Other Resources 294 | 295 | 2021: 296 | 297 | - [BlackHat Arsenal USA](https://www.blackhat.com/us-21/arsenal/schedule/index.html#introducing-subcrawl-a-framework-for-the-analysis-and-clustering-of-hacking-tools-found-using-open-directories-24081) 298 | - [VirusBulletin Localhost - Upcoming](https://vblocalhost.com/presentations/introducing-subcrawl-a-framework-for-the-analysis-and-clustering-of-hacking-tools-found-using-open-directories) 299 | 300 | ## License 301 | SubCrawl is licensed under the MIT license 302 | -------------------------------------------------------------------------------- /crawler/storage/kibana-dashboard/overview-dashboard.ndjson: -------------------------------------------------------------------------------- 1 | {"attributes":{"fieldAttrs":"{\"crawled_on\":{\"count\":1},\"http.request.url\":{\"count\":1}}","fields":"[]","runtimeFieldMap":"{}","timeFieldName":"crawled_on","title":"subcrawl*","typeMeta":"{}"},"coreMigrationVersion":"7.17.0","id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","migrationVersion":{"index-pattern":"7.11.0"},"references":[],"type":"index-pattern","updated_at":"2021-12-09T21:28:47.683Z","version":"Wzc5MCwyXQ=="} 2 | {"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[]}"},"optionsJSON":"{\"useMargins\":true,\"syncColors\":false,\"hidePanelTitles\":false}","panelsJSON":"[{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":0,\"w\":48,\"h\":16,\"i\":\"ef96c601-3c40-4166-91a6-0041d38b29f2\"},\"panelIndex\":\"ef96c601-3c40-4166-91a6-0041d38b29f2\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsXY\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-0085efda-2a22-462d-8f0f-2ea2e0b401d5\"}],\"state\":{\"visualization\":{\"legend\":{\"isVisible\":true,\"position\":\"right\"},\"valueLabels\":\"hide\",\"fittingFunction\":\"None\",\"yLeftExtent\":{\"mode\":\"full\"},\"yRightExtent\":{\"mode\":\"full\"},\"axisTitlesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"tickLabelsVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"labelsOrientation\":{\"x\":0,\"yLeft\":0,\"yRight\":0},\"gridlinesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"preferredSeriesType\":\"line\",\"layers\":[{\"layerId\":\"0085efda-2a22-462d-8f0f-2ea2e0b401d5\",\"accessors\":[\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\"],\"position\":\"top\",\"seriesType\":\"line\",\"showGridlines\":false,\"layerType\":\"data\",\"xAccessor\":\"4047c20f-bfcc-4a39-8735-24a54ceab82b\"}]},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"0085efda-2a22-462d-8f0f-2ea2e0b401d5\":{\"columns\":{\"4047c20f-bfcc-4a39-8735-24a54ceab82b\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1d\"}},\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"4047c20f-bfcc-4a39-8735-24a54ceab82b\",\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":16,\"w\":26,\"h\":19,\"i\":\"d34b0422-8934-45c9-8b4f-a0697f8788e3\"},\"panelIndex\":\"d34b0422-8934-45c9-8b4f-a0697f8788e3\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsPie\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-12c41522-f747-410f-883f-a393b947cd19\"}],\"state\":{\"visualization\":{\"shape\":\"donut\",\"layers\":[{\"layerId\":\"12c41522-f747-410f-883f-a393b947cd19\",\"groups\":[\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\"],\"metric\":\"20a68eb5-b559-4475-8c81-f3bf76094a61\",\"numberDisplay\":\"percent\",\"categoryDisplay\":\"default\",\"legendDisplay\":\"default\",\"nestedLegend\":false,\"layerType\":\"data\"}]},\"query\":{\"query\":\"NOT http.response.body.content_magic : (\\\"empty\\\" or \\\"html*\\\" or \\\"ascii*\\\")\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"12c41522-f747-410f-883f-a393b947cd19\":{\"columns\":{\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":10,\"orderBy\":{\"type\":\"column\",\"columnId\":\"20a68eb5-b559-4475-8c81-f3bf76094a61\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"20a68eb5-b559-4475-8c81-f3bf76094a61\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\",\"20a68eb5-b559-4475-8c81-f3bf76094a61\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Overview of Activity\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":26,\"y\":16,\"w\":8,\"h\":10,\"i\":\"f275f2db-bfc8-4e39-b0ed-4b141168374c\"},\"panelIndex\":\"f275f2db-bfc8-4e39-b0ed-4b141168374c\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-36dd60e9-fb14-44c7-b298-ae035186dda3\"}],\"state\":{\"visualization\":{\"layerId\":\"36dd60e9-fb14-44c7-b298-ae035186dda3\",\"accessor\":\"192b6a30-eaca-4fa1-adae-81812804b9e9\",\"layerType\":\"data\"},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"36dd60e9-fb14-44c7-b298-ae035186dda3\":{\"columns\":{\"192b6a30-eaca-4fa1-adae-81812804b9e9\":{\"label\":\"Unique 
Hosts\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"192b6a30-eaca-4fa1-adae-81812804b9e9\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":34,\"y\":16,\"w\":7,\"h\":10,\"i\":\"3de30073-057a-453d-91df-0327dcde3840\"},\"panelIndex\":\"3de30073-057a-453d-91df-0327dcde3840\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40\"}],\"state\":{\"visualization\":{\"layerId\":\"b9253695-4d27-42ac-a159-fdc690673b40\",\"accessor\":\"246b5da5-1e3f-4920-be1f-580e536698a2\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : pe32*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"b9253695-4d27-42ac-a159-fdc690673b40\":{\"columns\":{\"246b5da5-1e3f-4920-be1f-580e536698a2\":{\"label\":\"PE\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"246b5da5-1e3f-4920-be1f-580e536698a2\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":41,\"y\":16,\"w\":7,\"h\":10,\"i\":\"0548058f-744b-46cc-876e-f3ebd94d47fc\"},\"panelIndex\":\"0548058f-744b-46cc-876e-f3ebd94d47fc\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\"}],\"state\":{\"visualization\":{\"layerId\":\"68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\",\"accessor\":\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\",\"layerType\":\"data\"},\"query\":{\"query\":\"yara_results : \\\"protected_webshell\\\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\":{\"columns\":{\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\":{\"label\":\"Protected 
Webshells\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":26,\"y\":26,\"w\":8,\"h\":9,\"i\":\"79fe7c6f-146a-4acd-a06d-96bcf36d75c6\"},\"panelIndex\":\"79fe7c6f-146a-4acd-a06d-96bcf36d75c6\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-98b138d5-6ef9-44ce-9bae-0dd42795a47d\"}],\"state\":{\"visualization\":{\"layerId\":\"98b138d5-6ef9-44ce-9bae-0dd42795a47d\",\"accessor\":\"4635bde2-23fc-438e-8919-73cc0c2a0d32\",\"layerType\":\"data\"},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"98b138d5-6ef9-44ce-9bae-0dd42795a47d\":{\"columns\":{\"4635bde2-23fc-438e-8919-73cc0c2a0d32\":{\"label\":\"URLs\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"4635bde2-23fc-438e-8919-73cc0c2a0d32\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":34,\"y\":26,\"w\":7,\"h\":9,\"i\":\"0d48db17-ea78-484c-922c-dd26997c7dbc\"},\"panelIndex\":\"0d48db17-ea78-484c-922c-dd26997c7dbc\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40\"}],\"state\":{\"visualization\":{\"layerId\":\"b9253695-4d27-42ac-a159-fdc690673b40\",\"accessor\":\"246b5da5-1e3f-4920-be1f-580e536698a2\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : 
zip*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"b9253695-4d27-42ac-a159-fdc690673b40\":{\"columns\":{\"246b5da5-1e3f-4920-be1f-580e536698a2\":{\"label\":\"ZIPs\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"246b5da5-1e3f-4920-be1f-580e536698a2\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":41,\"y\":26,\"w\":7,\"h\":9,\"i\":\"7ae2c21a-737b-49dc-94cd-5324621f8549\"},\"panelIndex\":\"7ae2c21a-737b-49dc-94cd-5324621f8549\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-04e472c4-38dd-4b8f-afdd-ef598fddda94\"}],\"state\":{\"visualization\":{\"layerId\":\"04e472c4-38dd-4b8f-afdd-ef598fddda94\",\"accessor\":\"50fcc77d-0b49-4246-9838-ede31c217e3f\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : composite document * or http.response.body.content_magic.keyword : *word* or http.response.body.content_magic.keyword : *excel*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"04e472c4-38dd-4b8f-afdd-ef598fddda94\":{\"columns\":{\"50fcc77d-0b49-4246-9838-ede31c217e3f\":{\"label\":\"Office Docs\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"50fcc77d-0b49-4246-9838-ede31c217e3f\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":35,\"w\":48,\"h\":19,\"i\":\"ccb17e1b-56b5-4a20-9974-978e7c33e7f0\"},\"panelIndex\":\"ccb17e1b-56b5-4a20-9974-978e7c33e7f0\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\"},{\"isTransposed\":false,\"columnId\":\"ad41a585-a15a-45a4-a419-9b3d962962c3\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":174.66666666666666},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : zip*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\":{\"label\":\"Top values of 
http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"ad41a585-a15a-45a4-a419-9b3d962962c3\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\",\"ad41a585-a15a-45a4-a419-9b3d962962c3\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"ZIPs\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":89,\"w\":48,\"h\":19,\"i\":\"93d0dc66-32d1-4019-ab1a-2432f29637b6\"},\"panelIndex\":\"93d0dc66-32d1-4019-ab1a-2432f29637b6\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"c71beaa7-8c97-474d-acf1-25e7a887179f\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":174.66666666666666},{\"isTransposed\":false,\"columnId\":\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\"},{\"isTransposed\":false,\"columnId\":\"ad41a585-a15a-45a4-a419-9b3d962962c3\"},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\",\"hidden\":true}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : 
php*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"c71beaa7-8c97-474d-acf1-25e7a887179f\":{\"label\":\"Top values of http.response.body.content.sha256.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content.sha256.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":3,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"ad41a585-a15a-45a4-a419-9b3d962962c3\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\",\"ad41a585-a15a-45a4-a419-9b3d962962c3\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"c71beaa7-8c97-474d-acf1-25e7a887179f\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"PHP\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":54,\"w\":48,\"h\":19,\"i\":\"a32a8543-4c96-46dc-bfee-3aed03c2d97d\"},\"panelIndex\":\"a32a8543-4c96-46dc-bfee-3aed03c2d97d\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"e98609fe-29e7-4788-aab1-7f70c38c5c49\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":132.66666666666666},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"6d3db337-d347-4a26-8a9e-ca229172b7f3\",\"width\":484},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : pe32*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"e98609fe-29e7-4788-aab1-7f70c38c5c49\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":3,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6d3db337-d347-4a26-8a9e-ca229172b7f3\":{\"label\":\"Top values of 
http.request.url.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"6d3db337-d347-4a26-8a9e-ca229172b7f3\",\"e98609fe-29e7-4788-aab1-7f70c38c5c49\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"PEs\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":73,\"w\":24,\"h\":16,\"i\":\"42f79d55-ad71-463c-bcac-fd044e24454e\"},\"panelIndex\":\"42f79d55-ad71-463c-bcac-fd044e24454e\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-e4c26798-d7f1-48cc-9824-b5c0e2c3c940\"}],\"state\":{\"visualization\":{\"layerId\":\"e4c26798-d7f1-48cc-9824-b5c0e2c3c940\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"e985240d-4bec-47ae-a8fe-21c45ec993e9\"},{\"isTransposed\":false,\"columnId\":\"0ba9110d-3535-402d-a5c4-39e1a0549a89\"},{\"isTransposed\":false,\"columnId\":\"14cf9b50-057e-4d19-8d77-56126e7449be\"},{\"isTransposed\":false,\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\",\"hidden\":true}],\"sorting\":{\"columnId\":\"14cf9b50-057e-4d19-8d77-56126e7449be\",\"direction\":\"desc\"}},\"query\":{\"query\":\"yara_results : \\\"protected_webshell\\\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"e4c26798-d7f1-48cc-9824-b5c0e2c3c940\":{\"columns\":{\"e985240d-4bec-47ae-a8fe-21c45ec993e9\":{\"label\":\"URL\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false},\"customLabel\":true},\"0ba9110d-3535-402d-a5c4-39e1a0549a89\":{\"label\":\"SHA256\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content.sha256.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false},\"customLabel\":true},\"14cf9b50-057e-4d19-8d77-56126e7449be\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"14cf9b50-057e-4d19-8d77-56126e7449be\",\"e985240d-4bec-47ae-a8fe-21c45ec993e9\",\"0ba9110d-3535-402d-a5c4-39e1a0549a89\",\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Protected Webshells\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":24,\"y\":73,\"w\":24,\"h\":16,\"i\":\"3d1b22c5-a23a-49a2-b862-8b79177aa6c8\"},\"panelIndex\":\"3d1b22c5-a23a-49a2-b862-8b79177aa6c8\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-96ee832f-8e02-453c-83c8-c4cec031d5dc\"}],\"state\":{\"visualization\":{\"layerId\":\"96ee832f-8e02-453c-83c8-c4cec031d5dc\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\",\"width\":330.41666666666663},{\"isTransposed\":false,\"columnId\":\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\",\"width\":349.75},{\"isTransposed\":false,\"columnId\":\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\"},{\"isTransposed\":false,\"columnId\":\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"},{\"columnId\":\"fd8b632c-b733-4538-b2c0-53907b9e7e32\",\"isTransposed\":false}],\"sorting\":{\"columnId\":\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\",\"direction\":\"desc\"}},\"query\":{\"query\":\" http.response.body.content_magic.keyword : composite document* or http.response.body.content_magic.keyword : *word* or http.response.body.content_magic.keyword : *excel*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"96ee832f-8e02-453c-83c8-c4cec031d5dc\":{\"columns\":{\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":30,\"orderBy\":{\"type\":\"alphabetical\",\"fallback\":false},\"orderDirection\":\"asc\",\"otherBucket\":true,\"missingBucket\":false}},\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":30,\"orderBy\":{\"type\":\"alphabetical\",\"fallback\":false},\"orderDirection\":\"asc\",\"otherBucket\":true,\"missingBucket\":false}},\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"},\"fd8b632c-b733-4538-b2c0-53907b9e7e32\":{\"label\":\"Top values of 
http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}}},\"columnOrder\":[\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\",\"fd8b632c-b733-4538-b2c0-53907b9e7e32\",\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\",\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\",\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"],\"incompleteColumns\":{}}}}}}},\"hidePanelTitles\":false,\"enhancements\":{}},\"title\":\"Office Docs\"}]","timeRestore":false,"title":"Subcrawl - Main","version":1},"coreMigrationVersion":"7.17.0","id":"eaccd8b0-5382-11ec-9a97-899d8810a3c2","migrationVersion":{"dashboard":"7.17.0"},"references":[{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ef96c601-3c40-4166-91a6-0041d38b29f2:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ef96c601-3c40-4166-91a6-0041d38b29f2:indexpattern-datasource-layer-0085efda-2a22-462d-8f0f-2ea2e0b401d5","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"d34b0422-8934-45c9-8b4f-a0697f8788e3:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"d34b0422-8934-45c9-8b4f-a0697f8788e3:indexpattern-datasource-layer-12c41522-f747-410f-883f-a393b947cd19","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"f275f2db-bfc8-4e39-b0ed-4b141168374c:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"f275f2db-bfc8-4e39-b0ed-4b141168374c:indexpattern-datasource-layer-36dd60e9-fb14-44c7-b298-ae035186dda3","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3de30073-057a-453d-91df-0327dcde3840:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3de30073-057a-453d-91df-0327dcde3840:indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0548058f-744b-46cc-876e-f3ebd94d47fc:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0548058f-744b-46cc-876e-f3ebd94d47fc:indexpattern-datasource-layer-68fdd6c3-da83-40cd-8b00-bbf44d29b4ee","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"79fe7c6f-146a-4acd-a06d-96bcf36d75c6:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"79fe7c6f-146a-4acd-a06d-96bcf36d75c6:indexpattern-datasource-layer-98b138d5-6ef9-44ce-9bae-0dd42795a47d","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0d48db17-ea78-484c-922c-dd26997c7dbc:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0d48db17-ea78-484c-922c-dd26997c7dbc:indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"7ae2c21a-737b-49dc-94cd-5324621f8549:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"7ae2c21a-737b-49dc-94cd-5324621f8549:index
pattern-datasource-layer-04e472c4-38dd-4b8f-afdd-ef598fddda94","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ccb17e1b-56b5-4a20-9974-978e7c33e7f0:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ccb17e1b-56b5-4a20-9974-978e7c33e7f0:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"93d0dc66-32d1-4019-ab1a-2432f29637b6:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"93d0dc66-32d1-4019-ab1a-2432f29637b6:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"a32a8543-4c96-46dc-bfee-3aed03c2d97d:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"a32a8543-4c96-46dc-bfee-3aed03c2d97d:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"42f79d55-ad71-463c-bcac-fd044e24454e:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"42f79d55-ad71-463c-bcac-fd044e24454e:indexpattern-datasource-layer-e4c26798-d7f1-48cc-9824-b5c0e2c3c940","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3d1b22c5-a23a-49a2-b862-8b79177aa6c8:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3d1b22c5-a23a-49a2-b862-8b79177aa6c8:indexpattern-datasource-layer-96ee832f-8e02-453c-83c8-c4cec031d5dc","type":"index-pattern"}],"type":"dashboard","updated_at":"2022-02-21T21:03:10.544Z","version":"WzMwNDI5LDJd"} 3 | {"excludedObjects":[],"excludedObjectsCount":0,"exportedCount":2,"missingRefCount":0,"missingReferences":[]} --------------------------------------------------------------------------------
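Note: the file above is a standard Kibana saved-objects export (the SubCrawl index pattern plus the "Subcrawl - Main" dashboard and its Lens panels). It can be loaded either through the Kibana UI (Stack Management > Saved Objects > Import) or via the Saved Objects import API. Below is a minimal sketch of the API route, assuming a local Kibana on port 5601 with no authentication and the export file at its repository path; adjust the URL, credentials and path for your deployment.

    import requests

    # Hypothetical local Kibana endpoint; change host/port and add auth as needed.
    KIBANA_URL = "http://localhost:5601"
    EXPORT_FILE = "crawler/storage/kibana-dashboard/overview-dashboard.ndjson"

    with open(EXPORT_FILE, "rb") as fh:
        resp = requests.post(
            f"{KIBANA_URL}/api/saved_objects/_import",
            headers={"kbn-xsrf": "true"},   # required by Kibana for write API calls
            params={"overwrite": "true"},   # replace existing objects with the same IDs
            files={"file": (EXPORT_FILE.rsplit("/", 1)[-1], fh, "application/ndjson")},
        )
    resp.raise_for_status()
    print(resp.json())   # reports successCount and any missing references

The referenced index pattern (id 2ffe4bf0-5124-11ec-9a97-899d8810a3c2) is included in the export, so the dashboard should resolve its references after import once the SubCrawl Elasticsearch index contains data.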