├── images
│   ├── webui.png
│   ├── yara-output.png
│   ├── architecture.png
│   ├── clamav-output.png
│   ├── external_intel.png
│   ├── misp-overview.png
│   ├── payload-output.png
│   ├── sqlite-storage.png
│   ├── console-storage.png
│   └── architecture-prev.png
├── crawler
│   ├── utils
│   │   ├── subcrawl.db
│   │   ├── ansi_colors.py
│   │   ├── __init__.py
│   │   ├── logos
│   │   │   ├── subcrawl-2.txt
│   │   │   ├── subcrawl-3.txt
│   │   │   └── subcrawl-1.txt
│   │   ├── setup_kafka_topic.py
│   │   ├── banner.py
│   │   ├── helpers.py
│   │   ├── sqlite_model.py
│   │   └── logger.py
│   ├── processing
│   │   ├── minisdhash
│   │   │   ├── sdhash
│   │   │   ├── libsdbf.a
│   │   │   └── sdbf_class.py
│   │   ├── default_processing.py
│   │   ├── example_processing.py
│   │   ├── __init__.py
│   │   ├── tlsh_processing.py
│   │   ├── jarm_processing.py
│   │   ├── yara_processing.py
│   │   ├── clamav_processing.py
│   │   ├── sdhash_processing.py
│   │   ├── payload_processing.py
│   │   └── external_intel_processing.py
│   ├── run.sh
│   ├── storage
│   │   ├── __init__.py
│   │   ├── default_storage.py
│   │   ├── example_storage.py
│   │   ├── console_storage.py
│   │   ├── sqlite_storage.py
│   │   ├── elastic_storage.py
│   │   ├── misp_storage.py
│   │   └── kibana-dashboard
│   │       └── overview-dashboard.ndjson
│   ├── yara-rules
│   │   ├── open_webshell.yar
│   │   ├── php_file_manager_login.yar
│   │   ├── erbium_discord_panel_login.yar
│   │   ├── default_page_xampp_windows.yar
│   │   ├── default_page_apache.yar
│   │   ├── outlook_phish.yar
│   │   ├── titan_stealer_panel_login.yar
│   │   ├── royalmail_phish.yar
│   │   ├── sharepoint_online_phish.yar
│   │   ├── chase_login_spox_phish.yar
│   │   ├── collector_stealer_panel_login.yar
│   │   ├── bapr_banking_phish.yar
│   │   ├── hex-encoded-pe-file.yar
│   │   ├── microsoft_phish.yar
│   │   ├── aurora_stealer_panel_login.yar
│   │   ├── modernloader_panel_login.yar
│   │   ├── office365_review_phish.yar
│   │   ├── webpanel_origin_login.yar
│   │   ├── base64_pe.yar
│   │   ├── amadey_panel_login.yar
│   │   ├── office365_verify_pdf_phish.yar
│   │   ├── wellsfargo_phish.yar
│   │   ├── bankamerica_phish.yar
│   │   ├── link_sharing_onedrive.yar
│   │   ├── pony_panel_login.yar
│   │   ├── attachments_onedrive_phish.yar
│   │   ├── microsoft_login_phish.yar
│   │   ├── unam_webpanel_login.yar
│   │   ├── sharepoint_dropbox_online_phish.yar
│   │   ├── standard_bank_phish.yar
│   │   ├── onedrive_business_phish.yar
│   │   ├── panels.yar
│   │   ├── h3k_tinyfilemanager_login.yar
│   │   ├── grandamisha_panel_login.yar
│   │   ├── wallet_connect_phish.yar
│   │   ├── obfuscated_script.yar
│   │   ├── acridrain_stealer_panel_login.yar
│   │   ├── mars_panel_login.yar
│   │   ├── huntington_phish.yar
│   │   ├── mana5_panel_login.yar
│   │   ├── base64_shellcode_dos_header_pe.yar
│   │   ├── html_webshell_login.yar
│   │   ├── php_webshell_backend.yar
│   │   ├── agenttesla_webpanel_login.yar
│   │   ├── js_webshell_tracking_script.yar
│   │   └── combined-rules.yar
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── docker-compose.yml
│   ├── app
│   │   ├── templates
│   │   │   ├── domains.html
│   │   │   ├── urls.html
│   │   │   ├── search_results.html
│   │   │   ├── dashboard.html
│   │   │   ├── url_details.html
│   │   │   ├── domain_details.html
│   │   │   └── base.html
│   │   └── main.py
│   ├── service.py
│   ├── input
│   │   ├── phishtank.py
│   │   └── urlhaus.py
│   ├── misp-objects
│   │   └── opendir-url
│   │       └── definition.json
│   ├── supervisor
│   │   └── supervisord.conf
│   ├── config.yml
│   └── subcrawl.py
├── conferences
│   └── 2021
│       └── blackhat_us_arsenal
│           └── BH-Arsenal-2021.pdf
├── License.md
├── .gitignore
└── README.md
/images/webui.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/webui.png
--------------------------------------------------------------------------------
/images/yara-output.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/yara-output.png -------------------------------------------------------------------------------- /crawler/utils/subcrawl.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/utils/subcrawl.db -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/architecture.png -------------------------------------------------------------------------------- /images/clamav-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/clamav-output.png -------------------------------------------------------------------------------- /images/external_intel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/external_intel.png -------------------------------------------------------------------------------- /images/misp-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/misp-overview.png -------------------------------------------------------------------------------- /images/payload-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/payload-output.png -------------------------------------------------------------------------------- /images/sqlite-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/sqlite-storage.png -------------------------------------------------------------------------------- /images/console-storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/console-storage.png -------------------------------------------------------------------------------- /images/architecture-prev.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/images/architecture-prev.png -------------------------------------------------------------------------------- /crawler/processing/minisdhash/sdhash: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/processing/minisdhash/sdhash -------------------------------------------------------------------------------- /crawler/processing/minisdhash/libsdbf.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/crawler/processing/minisdhash/libsdbf.a -------------------------------------------------------------------------------- /crawler/run.sh: -------------------------------------------------------------------------------- 1 | service clamav-daemon start 2 | service supervisor start 3 | gunicorn app.main:app -b 0.0.0.0:8000 --reload --workers 4 -------------------------------------------------------------------------------- 
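run.sh above starts the three services the container relies on: the ClamAV daemon used by clamav_processing.py, supervisord (which launches the URLhaus/PhishTank input feeds and the crawler service), and a gunicorn server exposing the web UI on port 8000. A minimal sketch of how the running stack might be sanity-checked; the URL, timeout and socket default below are assumptions rather than values taken from the project configuration:
# Illustrative check only; not part of the repository.
import clamd
import requests

def check_services(web_url="http://127.0.0.1:8000/"):
    resp = requests.get(web_url, timeout=5)   # gunicorn-served web UI started by run.sh
    print("web UI:", resp.status_code)
    cd = clamd.ClamdUnixSocket()              # clamav-daemon's default UNIX socket
    print("clamd:", cd.ping())                # 'PONG' when the daemon is reachable

if __name__ == "__main__":
    check_services()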
/conferences/2021/blackhat_us_arsenal/BH-Arsenal-2021.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstrosch/subcrawl/HEAD/conferences/2021/blackhat_us_arsenal/BH-Arsenal-2021.pdf -------------------------------------------------------------------------------- /crawler/storage/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .console_storage import ConsoleStorage 3 | from .misp_storage import MISPStorage 4 | from .sqlite_storage import SqliteStorage 5 | from .elastic_storage import ElasticStorage 6 | -------------------------------------------------------------------------------- /crawler/utils/ansi_colors.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | 3 | class SubCrawlColors: 4 | BLUE = '\033[34m' 5 | GREEN = '\033[32m' 6 | PURPLE = '\033[35m' 7 | YELLOW = '\033[33m' 8 | RED = '\033[31m' 9 | CYAN = '\033[36m' 10 | RESET = '\033[0m' 11 | CLS = '\033[2J' 12 | -------------------------------------------------------------------------------- /crawler/yara-rules/open_webshell.yar: -------------------------------------------------------------------------------- 1 | rule open_webshell 2 | { 3 | meta: 4 | description = "Open Webshell Detection" 5 | author = "patrick.schlapfer@hp.com" 6 | date = "2021-04-19" 7 | 8 | strings: 9 | $a = "file manager" 10 | $b = "uname" 11 | 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/php_file_manager_login.yar: -------------------------------------------------------------------------------- 1 | rule php_file_manager_login { 2 | 3 | meta: 4 | date = "2022-11-29" 5 | 6 | strings: 7 | $s1 = "File Manager" 8 | $s2 = "content=\"Web based File Manager" 9 | $s3 = "class=\"form-signin\"" 10 | $s4 = "File Manager</h1>" 11 | 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/processing/default_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | 3 | class DefaultProcessing: 4 | 5 | cfg = None 6 | logger = None 7 | 8 | def __init__(self, config, logger): 9 | self.cfg = config 10 | self.logger = logger 11 | 12 | def process(self, url, resp): 13 | pass 14 | -------------------------------------------------------------------------------- /crawler/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
2 | from utils.logger import SubCrawlLogger, SubCrawlLoggerLevels 3 | from utils.banner import SubCrawlBanner 4 | from utils.sqlite_model import * 5 | from utils.setup_kafka_topic import check_topic 6 | from utils.ansi_colors import SubCrawlColors 7 | from utils.helpers import SubCrawlHelpers 8 | -------------------------------------------------------------------------------- /crawler/yara-rules/erbium_discord_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule erbium_discord_panel_login { 2 | meta: 3 | date = "2022-11-28" 4 | 5 | strings: 6 | $x1 = "https://erbium_support.t.me" 7 | $x2 = "<title>Discord" 8 | $s1 = "id=\"username\"" 9 | $s2 = "id=\"password\"" 10 | 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/default_page_xampp_windows.yar: -------------------------------------------------------------------------------- 1 | rule default_page_xampp_windows 2 | { 3 | meta: 4 | description = "Default page for XAMPP" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $title = "Welcome to XAMPP" nocase 9 | $platform = "welcome to xampp for windows" nocase 10 | condition: 11 | all of them 12 | } 13 | -------------------------------------------------------------------------------- /crawler/yara-rules/default_page_apache.yar: -------------------------------------------------------------------------------- 1 | rule default_page_apache 2 | { 3 | meta: 4 | description = "Default page for Apache2" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = /apache2.{,10}default page/ nocase 9 | $apache = "apache2" nocase 10 | $default = "default page" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/outlook_phish.yar: -------------------------------------------------------------------------------- 1 | rule outlook_phish 2 | { 3 | meta: 4 | description = "Outlook login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-06-29" 7 | strings: 8 | $form = "class=\"boxtext\"" nocase 9 | $title = "microsoft | login" nocase 10 | $pass = "id=\"pr\"" 11 | $header = "OUTLOOK" 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/titan_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule titan_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | 5 | strings: 6 | $s1 = "Titan Stealer" nocase 7 | $s2 = "class=\"auth__form\"" nocase 8 | $s3 = "Sign in" nocase 9 | $s4 = "id=\"floatingPassword\"" nocase 10 | 11 | condition: 12 | all of them 13 | } -------------------------------------------------------------------------------- /crawler/yara-rules/royalmail_phish.yar: -------------------------------------------------------------------------------- 1 | rule royal_mail_phish 2 | { 3 | meta: 4 | description = "Royal Mail phish" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-18" 7 | strings: 8 | $title = "royal mail group ltd" nocase 9 | $form_action = "action=\"login.php\"" 10 | $pass = "name=\"pass\"" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/sharepoint_online_phish.yar:
-------------------------------------------------------------------------------- 1 | rule sharepoint_online_phish 2 | { 3 | meta: 4 | description = "Sharepoint Online Multiple Logins" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = "share point online" nocase 9 | $user = "id=\"email\"" 10 | $post_url = "next.php" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/chase_login_spox_phish.yar: -------------------------------------------------------------------------------- 1 | rule chase_login_spox_phish 2 | { 3 | meta: 4 | description = "Chase Bank Login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $title = "Online enrollement" 9 | $form = "action=\"regex.php\"" 10 | $user = "name=\"id\"" 11 | $pass = "name=\"password\"" 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/collector_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule collector_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | 5 | strings: 6 | $s1 = "login" nocase 7 | $s2 = "Collector Stealer panel" nocase 8 | $s3 = "action=\"/index.php?auth\"" nocase 9 | $s4 = "id=\"sendlogin\"" nocase 10 | 11 | condition: 12 | all of them 13 | } -------------------------------------------------------------------------------- /crawler/requirements.txt: -------------------------------------------------------------------------------- 1 | Jinja2 == 3.0.1 2 | PyYAML == 5.4.1 3 | beautifulsoup4 == 4.9.3 4 | clamd == 1.0.2 5 | falcon == 2.0.0 6 | mergedeep == 1.3.4 7 | peewee == 3.14.0 8 | py_tlsh == 4.5.0 9 | pyjarm == 0.0.5 10 | pymisp == 2.4.140 11 | python_magic == 0.4.22 12 | requests == 2.25.0 13 | timeloop == 1.0.2 14 | yara_python == 4.0.5 15 | gunicorn == 20.0.4 16 | kafka-python == 2.0.2 17 | elasticsearch == 7.15.1 18 | -------------------------------------------------------------------------------- /crawler/storage/default_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
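# DefaultStorage (below) is the do-nothing base class that the concrete storage modules
# in this package (console, SQLite, Elasticsearch, MISP) extend.  load_scraped_domains()
# reports which domains a backend has already stored, presumably so the engine can skip
# re-crawling them, and store_result() receives the collected crawl and processing results
# to persist.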
2 | 3 | class DefaultStorage: 4 | 5 | cfg = None 6 | logger = None 7 | 8 | def __init__(self, config, logger): 9 | self.cfg = config 10 | self.logger = logger 11 | 12 | def load_scraped_domains(self): 13 | return [] 14 | 15 | def store_result(self, result_data): 16 | return True 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/bapr_banking_phish.yar: -------------------------------------------------------------------------------- 1 | rule bapr_phish_phish 2 | { 3 | meta: 4 | description = "BAPR Online banking phishing page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-09" 7 | strings: 8 | $title = "personal internet banking" nocase 9 | $form = "name=\"login.loginform\"" nocase 10 | $pass = "id=\"passcrypt\"" nocase 11 | condition: 12 | all of them 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/hex-encoded-pe-file.yar: -------------------------------------------------------------------------------- 1 | rule hexencoded_pe_file { 2 | meta: 3 | desc = "Detects hex-encoded pe file" 4 | author = "@jstrosch" 5 | date = "2022 Oct 24" 6 | 7 | strings: 8 | $mz = { 34 44 35 41 } //4D 5A -> MZ 9 | $pe = { 35 30 34 35 30 30 30 30 } // 50 45 00 00 -> PE00 10 | 11 | condition: 12 | $mz at 0 and $pe in (@mz[1]..0x200) 13 | } 14 | -------------------------------------------------------------------------------- /crawler/processing/example_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .default_processing import DefaultProcessing 3 | 4 | 5 | class ExampleProcessing(DefaultProcessing): 6 | 7 | cfg = None 8 | logger = None 9 | 10 | def __init__(self, config, logger): 11 | self.cfg = config 12 | self.logger = logger 13 | 14 | def process(self, url, resp): 15 | pass 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/microsoft_phish.yar: -------------------------------------------------------------------------------- 1 | rule microsoft_phish 2 | { 3 | meta: 4 | description = "Microsoft login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-01" 7 | strings: 8 | $form = "office/login.php" nocase 9 | $title = "sign in to your microsoft account" nocase 10 | $user = "id=\"user\"" 11 | $redirect = "pass.php" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/aurora_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule aurora_stealer_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | author = "@jstrosch" 5 | 6 | strings: 7 | $s1 = "Auth" nocase 8 | $s2 = "AURORA STEALER" nocase 9 | $s3 = "placeholder=\"YOU PASSWORD\"" nocase 10 | $s4 = "id=\"email-2ee9\"" nocase 11 | 12 | condition: 13 | all of them 14 | } -------------------------------------------------------------------------------- /crawler/yara-rules/modernloader_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule modernloader_panel_login { 2 | meta: 3 | date = "2022-11-30" 4 | author = "@jstrosch" 5 | 6 | strings: 7 | $s1 = "Panel - Login" nocase 8 | $s2 = "class=\"login__form\"" nocase 9 | $s3 = "url = \"control.php\"" nocase 10 | $s4 = "Welcome" nocase 11 | 12 | condition: 13 | all of them 14 | } 
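Both hex-encoded-pe-file.yar above and base64_pe.yar further down target executables that have been re-encoded as text rather than raw PE bytes: a hex dump of a PE begins with the characters "4D5A" (whose ASCII bytes are 34 44 35 41, the $mz pattern), and a base64-encoded PE begins with "TVqQ". A short standard-library illustration of why those strings work (not part of the repository):
import base64

dos_header = b"MZ\x90\x00"                         # typical first bytes of a PE file
print(dos_header.hex().upper())                    # 4D5A9000 -> text matched by hex-encoded-pe-file.yar
print("4D5A".encode("ascii").hex(" "))             # 34 44 35 41 -> the rule's $mz byte sequence
print(base64.b64encode(dos_header).decode())       # TVqQAA== -> prefix matched by base64_pe.yar
print(base64.b64encode(b"This program").decode())  # VGhpcyBwcm9ncmFt -> base64_pe.yar's $this_program string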
-------------------------------------------------------------------------------- /crawler/yara-rules/office365_review_phish.yar: -------------------------------------------------------------------------------- 1 | rule office365_review__phish 2 | { 3 | meta: 4 | description = "Office 365 Review Document phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-07-12" 7 | strings: 8 | $form = "post.php" nocase 9 | $title = "Office 365" 10 | $user = "id=\"email\"" 11 | $placeholder = "Office 365 Email" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/webpanel_origin_login.yar: -------------------------------------------------------------------------------- 1 | rule webpanel_origin_login 2 | { 3 | meta: 4 | description = "Origin (AgentTesla) Webpanel" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-21" 7 | strings: 8 | $title = "Login" 9 | $form = "action=\"login.php\"" 10 | $signin = "box-title m-b-20\">Sign In" 11 | $style = "margin: auto;margin-top:100px;}" 12 | condition: 13 | all of them 14 | } -------------------------------------------------------------------------------- /crawler/processing/__init__.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from .payload_processing import PayloadProcessing 3 | from .external_intel_processing import ExternalIntelProcessing 4 | from .yara_processing import YARAProcessing 5 | from .clamav_processing import ClamAVProcessing 6 | from .jarm_processing import JARMProcessing 7 | from .tlsh_processing import TLSHProcessing 8 | #from .sdhash_processing import SDhashProcessing 9 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-2.txt: -------------------------------------------------------------------------------- 1 | ________ ______ _________ ______ 2 | __ ___/____ _____ /_ __ ____/______________ ____ _____ / 3 | _____ \ _ / / /__ __ \_ / __ ___/_ __ `/__ | /| / /__ / 4 | ____/ / / /_/ / _ /_/ // /___ _ / / /_/ / __ |/ |/ / _ / 5 | /____/ \__,_/ /_.___/ \____/ /_/ \__,_/ ____/|__/ /_/ 6 | -------------------------------------------------------------------------------- /crawler/yara-rules/base64_pe.yar: -------------------------------------------------------------------------------- 1 | rule base64_pe 2 | { 3 | meta: 4 | description = "Detects base64 encoded PE files, often used with Powershell." 
5 | author = "josh@m9cyber.com" 6 | date = "2022-02-25" 7 | strings: 8 | $mz_header = /(TVqQ|QqVT)/ 9 | $this_program = /(VGhpcyBwcm9ncmFt|tFmcn9mcwBycphGV)/ 10 | $null_bytes = "AAAAA" 11 | condition: 12 | $mz_header at 0 and $this_program and #null_bytes > 2 13 | } 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/amadey_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule amadey_panel_login 2 | { 3 | meta: 4 | description = "Amadey panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-08" 7 | strings: 8 | $title = "authorization" nocase 9 | $form_action = "action=\"Login.php\"" 10 | $bg_img = "images\\bg_1.png" nocase 11 | $pass = "name=\"password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/office365_verify_pdf_phish.yar: -------------------------------------------------------------------------------- 1 | rule office365_verify_pdf_phish 2 | { 3 | meta: 4 | description = "Office365/OneDrive Verify Yourself PDF phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-07-25" 7 | strings: 8 | $title = "Files - OneDrive" 9 | $form = "action=\"link.php\"" 10 | $user = "id=\"txtTOAAEmail\"" 11 | $verify = "Verify Yourself" 12 | 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/wellsfargo_phish.yar: -------------------------------------------------------------------------------- 1 | rule wells_fargo_phish 2 | { 3 | meta: 4 | description = "Wells Fargo Phish" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-18" 7 | strings: 8 | $title = "Wells Fargo" nocase 9 | $form_action = "action=\"./parse.php\"" 10 | $user = "name=\"j_username\"" nocase 11 | $pass = "name=\"j_password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-3.txt: -------------------------------------------------------------------------------- 1 | _________ ___. 
_________ .__ 2 | / _____/ __ __ \_ |__ \_ ___ \ _______ _____ __ _ __| | 3 | \_____ \ | | \ | __ \ / \ \/ \_ __ \\__ \ \ \/ \/ /| | 4 | / \| | / | \_\ \\ \____ | | \/ / __ \_ \ / | |__ 5 | /_______ /|____/ |___ / \______ / |__| (____ / \/\_/ |____/ 6 | \/ \/ \/ \/ -------------------------------------------------------------------------------- /crawler/yara-rules/bankamerica_phish.yar: -------------------------------------------------------------------------------- 1 | rule bank_america_phish 2 | { 3 | meta: 4 | description = "Bank of America Phishing" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-19" 7 | strings: 8 | $title = "Bank of America -" nocase 9 | $form_action = "action=\"login.php\"" 10 | $id = "name=\"onlineId1\"" nocase 11 | $pass = "name=\"passcode1\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/link_sharing_onedrive.yar: -------------------------------------------------------------------------------- 1 | rule link_sharing_onedrive 2 | { 3 | meta: 4 | description = "OneDrive Link Sharing Phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-17" 7 | strings: 8 | $modified = "new injection" 9 | $title = /link.{0,10}validation<\/title>/ nocase 10 | $form = "bmV4dC5waHA=" //next.php 11 | $user = "id=\"ai\"" 12 | $pass = "id=\"pr\"" 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/yara-rules/pony_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule pony_panel_login 2 | { 3 | meta: 4 | description = "Pony stealer panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-04-03" 7 | strings: 8 | $title = "authorization" nocase 9 | $form_action = "action=\"/panel/admin.php\"" nocase 10 | $lock = "lock_open.png" nocase 11 | $pass = "name=\"password\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/attachments_onedrive_phish.yar: -------------------------------------------------------------------------------- 1 | rule attachments_onedrive_phish 2 | { 3 | meta: 4 | description = "OneDrive Attachments Phish" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-02" 7 | strings: 8 | $title = "attachments - onedrive" nocase 9 | $post_out = "loginout.php" nocase 10 | $post_365 = "login365.php" nocase 11 | $class = "class=\"login-form\"" nocase 12 | condition: 13 | all of them 14 | } 15 | -------------------------------------------------------------------------------- /crawler/yara-rules/microsoft_login_phish.yar: -------------------------------------------------------------------------------- 1 | rule microsoft_login_phish 2 | { 3 | meta: 4 | description = "Microsoft login" 5 | author = "josh@m9cyber.com" 6 | date = "2022-10-19" 7 | strings: 8 | $form = "
Log-In" 12 | $g = "Username" 13 | $h = "Password" 14 | $ih = "Log In" 15 | condition: 16 | all of them 17 | } 18 | -------------------------------------------------------------------------------- /crawler/yara-rules/h3k_tinyfilemanager_login.yar: -------------------------------------------------------------------------------- 1 | rule h3k_tinyfilemanager_login { 2 | meta: 3 | description = "H3K Tiny File Manager login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2023-01-15" 6 | 7 | strings: 8 | $s1 = "Tiny File Manager" nocase 9 | $s2 = "form-signin" nocase 10 | $s3 = "fm_usr" nocase 11 | $s4 = "fm_pwd" nocase 12 | $s5 = ".fm-login-page" nocase 13 | 14 | condition: 15 | all of them 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/grandamisha_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule grandamisha_panel_login { 2 | meta: 3 | description = "Granda Misha panel login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2022-12-29" 6 | 7 | strings: 8 | $r1 = "misha" nocase 9 | $r2 = "granda misha" nocase 10 | $s1 = "placeholdler=\"Jabber ID\"" nocase 11 | $s2 = "name=\"password\"" nocase 12 | $s3 = "users_signin" nocase 13 | 14 | condition: 15 | $r1 and $r2 and 1 of ($s*) 16 | } -------------------------------------------------------------------------------- /crawler/yara-rules/wallet_connect_phish.yar: -------------------------------------------------------------------------------- 1 | rule wallet_connect_phish 2 | { 3 | meta: 4 | description = "Wallet Connect phishing page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-08" 7 | strings: 8 | $title = "intergations protocol" nocase 9 | $form_action = "action=\"#\"" nocase 10 | $hidden = "value=\"AAVE\"" nocase 11 | $phrase = "name=\"phrase\"" nocase 12 | $private = "name=\"pkey\"" nocase 13 | $json = "name=\"kjson\"" nocase 14 | condition: 15 | all of them 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/obfuscated_script.yar: -------------------------------------------------------------------------------- 1 | rule obfuscated_script 2 | { 3 | meta: 4 | description = "Looks for common functions and patterns to deobfuscate scripts" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-27" 7 | strings: 8 | $eval = "eval(" nocase 9 | $hex = "hex(" nocase 10 | $split = "split(" nocase 11 | $exec = "execute" nocase 12 | $char ="char(" nocase 13 | $from_hex = /([\d]{2,3}[^\d]{1,10}){200,}/ 14 | condition: 15 | ($hex or $split or $char or $from_hex) and ($eval or $exec) 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/acridrain_stealer_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule acridrain_stealer_panel_login { 2 | meta: 3 | description = "AcridRain Stealer panel login" 4 | author = "Josh Stroschein josh@m9cyber.com" 5 | date = "2022-12-29" 6 | 7 | strings: 8 | $r1 = "Acrid -" nocase 9 | $r2 = "AcridRain Stealer" nocase 10 | $s1 = "/Account/Login" nocase 11 | $s2 = "name=\"Email\"" nocase 12 | $s3 = "name=\"Password\"" nocase 13 | 14 | condition: 15 | $r1 and $r2 and 1 of ($s*) 16 | } -------------------------------------------------------------------------------- /crawler/yara-rules/mars_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule mars_panel_login 2 | 
{ 3 | meta: 4 | description = "Mars stealer panel login" 5 | author = "Josh Stroschein josh@m9cyber.com" 6 | date = "2022-03-28" 7 | resources = "https://isc.sans.edu/diary/Arkei+Variants%3A+From+Vidar+to+Mars+Stealer/28468" 8 | strings: 9 | $title = "dashboard" nocase 10 | $form_action = "action=\"login.php\"" nocase 11 | $login_btn = "name=\"do_login\"" nocase 12 | $pass = "name=\"password\"" nocase 13 | condition: 14 | all of them 15 | } 16 | -------------------------------------------------------------------------------- /crawler/utils/setup_kafka_topic.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from kafka.admin import KafkaAdminClient, NewTopic 3 | 4 | 5 | def check_topic(): 6 | admin_client = KafkaAdminClient( 7 | bootstrap_servers="kafka:9092", 8 | client_id='test' 9 | ) 10 | if "urls" not in admin_client.list_topics(): 11 | topic_list = [] 12 | topic_list.append(NewTopic(name="urls", num_partitions=10, replication_factor=1)) 13 | admin_client.create_topics(new_topics=topic_list, validate_only=False) 14 | -------------------------------------------------------------------------------- /crawler/yara-rules/huntington_phish.yar: -------------------------------------------------------------------------------- 1 | rule huntington_bank_phish 2 | { 3 | meta: 4 | description = "Huntington Bank Phishing Kit" 5 | author = "josh@m9cyber.com" 6 | date = "2022-02-17" 7 | strings: 8 | $banner = "hgn.png" 9 | $title = "Huntington" 10 | $title_html = "Huntington" 11 | $form = "action=need1.php" 12 | $user = "name=\"ud\"" 13 | $pass = "name=\"pd\"" 14 | condition: 15 | ($title or $title_html) and $banner and $form and $user and $pass 16 | } 17 | -------------------------------------------------------------------------------- /crawler/yara-rules/mana5_panel_login.yar: -------------------------------------------------------------------------------- 1 | rule mana5_panel_login 2 | { 3 | meta: 4 | description = "Mana Tools Panel 5.0" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-17" 7 | strings: 8 | $title = "login" nocase 9 | $banner = "lone wolf version 5.0" nocase 10 | $back_img = "background-image: url('1.jpg')" 11 | $html_title = "

Log-In

" 12 | $user = "name=\"username\"" nocase 13 | $pass = "name=\"password\"" nocase 14 | $button = "Log In" 15 | condition: 16 | all of them 17 | } 18 | -------------------------------------------------------------------------------- /crawler/storage/example_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import json 3 | import pprint 4 | from re import subn 5 | 6 | from utils import SubCrawlColors 7 | from .default_storage import DefaultStorage 8 | 9 | 10 | class ExampleStorage(DefaultStorage): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def load_scraped_domains(self): 20 | return [] 21 | 22 | def store_result(self, result_data): 23 | pass 24 | -------------------------------------------------------------------------------- /crawler/utils/logos/subcrawl-1.txt: -------------------------------------------------------------------------------- 1 | ******** ** ****** ** 2 | **////// /** **////** /** 3 | /** ** **/** ** // ****** ****** *** ** /** 4 | /*********/** /**/****** /** //**//* //////** //** * /** /** 5 | ////////**/** /**/**///**/** /** / ******* /** ***/** /** 6 | /**/** /**/** /**//** ** /** **////** /****/**** /** 7 | ******** //******/****** //****** /*** //******** ***/ ///** *** 8 | //////// ////// ///// ////// /// //////// /// /// /// -------------------------------------------------------------------------------- /crawler/yara-rules/base64_shellcode_dos_header_pe.yar: -------------------------------------------------------------------------------- 1 | rule base64_shellcode_dos_header_pe 2 | { 3 | meta: 4 | description = "Detects base64 encoded PE files, often used with Powershell, that contains magic bytes that allow for the image_dos_header to contain shellcode.." 5 | author = "josh@m9cyber.com" 6 | date = "2023-01-23" 7 | strings: 8 | $mz_header = /(TVpFUu|uUFpVT|TVpSRQ|QRSpTV|TVpBUg|gUBpVT)/ 9 | $this_program = /(VGhpcyBwcm9ncmFt|tFmcn9mcwBycphGV)/ 10 | $null_bytes = "AAAAA" 11 | condition: 12 | $mz_header at 0 and $this_program and #null_bytes > 2 13 | } 14 | -------------------------------------------------------------------------------- /crawler/utils/banner.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
2 | import os 3 | import random 4 | import sys 5 | 6 | 7 | class SubCrawlBanner(): 8 | 9 | logo_path = "" 10 | tag_line = "" 11 | logos = [] 12 | 13 | def __init__(self, logopath, tagline): 14 | self.logo_path = logopath 15 | self.tag_line = tagline 16 | for logo in os.listdir(self.logo_path): 17 | self.logos.append(os.path.join(self.logo_path, logo)) 18 | 19 | def print_banner(self): 20 | logo = self.logos[random.randint(0, len(self.logos) - 1)] 21 | with open(logo) as logodata: 22 | print("\n" + logodata.read()) 23 | print(self.tag_line + "\n") 24 | -------------------------------------------------------------------------------- /crawler/yara-rules/html_webshell_login.yar: -------------------------------------------------------------------------------- 1 | rule protected_webshell 2 | { 3 | meta: 4 | description = "Protected Webshell Login" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "PHP" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = /action\s*=\s*\"\"/ 12 | $a2 = /method\s*=\s*\"post\"/ 13 | $a3 = /type\s*=\s*\"submit\"/ 14 | $a4 = /name\s*=\s*\"[a-z]{0,}_{0,}[a-z]{2,}\"/ 15 | 16 | $b1 = /type\s*=\s*\"input\"/ 17 | $b2 = /type\s*=\s*\"text\"/ 18 | 19 | $c1 = /value\s*=\s*\"(\s*>\s*){1,2}\"/ 20 | $c2 = /value\s*=\s*\"(\s?>\s?){1,2}\"/ 21 | 22 | condition: 23 | all of ($a*) and any of ($b*) and any of ($c*) and filesize < 1000 24 | } 25 | -------------------------------------------------------------------------------- /crawler/yara-rules/php_webshell_backend.yar: -------------------------------------------------------------------------------- 1 | rule php_webshell_backend : notifier 2 | { 3 | meta: 4 | description = "PHP webshell backend used by the attacker" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "PHP" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = "__construct" 12 | $a2 = "ord" 13 | $a3 = "chr" 14 | $a4 = "class" 15 | $a5 = "strpos" 16 | $a6 = "strlen" 17 | 18 | $b = "array" 19 | $c = "function" 20 | $d = "var" 21 | 22 | $e = /\$\w+\s*\=\s*(\$\w+->\w+\[\d+\]\.?)+;/ 23 | $f = /var\s*\$\w+\s*\=\s*['\"][\w\/\+\=\n\t]+/ 24 | 25 | condition: 26 | all of ($a*) and #b >= 5 and #c == 9 and #d >= 9 and #e >= 5 and $f and filesize < 1MB 27 | } 28 | -------------------------------------------------------------------------------- /crawler/processing/tlsh_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
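# TLSHProcessing (below) computes a TLSH locality-sensitive hash for every response of at
# least 50 bytes (the module returns an empty result for anything shorter).  Unlike SHA-256,
# two near-identical pages produce similar digests, so stored hashes can later be compared
# (for example with tlsh.diff()) to cluster re-used phishing kits and panel pages.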
2 | from .default_processing import DefaultProcessing 3 | import tlsh 4 | 5 | 6 | class TLSHProcessing(DefaultProcessing): 7 | 8 | cfg = None 9 | logger = None 10 | 11 | def __init__(self, config, logger): 12 | self.cfg = config 13 | self.logger = logger 14 | 15 | def process(self, url, content): 16 | tlsh_result = {} 17 | if len(content) < 50: 18 | return {} 19 | 20 | try: 21 | tlsh_result["tlsh"] = tlsh.hash(content) 22 | tlsh_result["url"] = url 23 | except Exception as e: 24 | self.logger.ERROR('[TLSH] ' + str(e)) 25 | pass 26 | return tlsh_result 27 | -------------------------------------------------------------------------------- /crawler/yara-rules/agenttesla_webpanel_login.yar: -------------------------------------------------------------------------------- 1 | rule agenttesla_panel_login 2 | { 3 | meta: 4 | description = "AgentTesla panel login page" 5 | author = "josh@m9cyber.com" 6 | date = "2022-03-10" 7 | strings: 8 | $title = "web panel | login" nocase 9 | $form_action = "action=\"login.php\"" nocase 10 | $pass = "name=\"password\"" nocase 11 | $user = "name=\"username\"" nocase 12 | 13 | condition: 14 | all of them 15 | } 16 | 17 | rule agenttesla_panel_login_2 18 | { 19 | meta: 20 | description = "Origin (AgentTesla) Webpanel" 21 | author = "josh@m9cyber.com" 22 | date = "2022-02-21" 23 | strings: 24 | $title = "Login" 25 | $form = "action=\"login.php\"" 26 | $signin = "box-title m-b-20\">Sign In" 27 | $style = "margin: auto;margin-top:100px;}" 28 | condition: 29 | all of them 30 | } 31 | -------------------------------------------------------------------------------- /crawler/processing/jarm_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | from urllib.parse import urlparse 3 | from jarm.scanner.scanner import Scanner 4 | from .default_processing import DefaultProcessing 5 | import requests 6 | 7 | 8 | class JARMProcessing(DefaultProcessing): 9 | 10 | cfg = None 11 | logger = None 12 | 13 | def __init__(self, config, logger): 14 | self.cfg = config 15 | self.logger = logger 16 | 17 | def process(self, url, resp): 18 | jarm_scan = {} 19 | try: 20 | domain = urlparse(url).netloc 21 | res = requests.get("https://" + domain) # Leads on purpose to an exception if connection is refused 22 | result = Scanner.scan(domain, 443) 23 | jarm_scan["fingerprint"] = result[0] 24 | jarm_scan["domain"] = result[1] 25 | jarm_scan["port"] = result[2] 26 | except Exception: 27 | pass 28 | return jarm_scan 29 | -------------------------------------------------------------------------------- /crawler/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | web: 4 | build: . 
5 | ports: 6 | - "8000:8000" 7 | volumes: 8 | - "/var/log/subcrawl:/var/log/subcrawl:rw" 9 | depends_on: 10 | - "kafka" 11 | 12 | zookeeper: 13 | image: confluentinc/cp-zookeeper:latest 14 | environment: 15 | ZOOKEEPER_CLIENT_PORT: 2181 16 | ZOOKEEPER_TICK_TIME: 2000 17 | expose: 18 | - 2181 19 | 20 | kafka: 21 | image: confluentinc/cp-kafka:latest 22 | depends_on: 23 | - zookeeper 24 | expose: 25 | - 29092 26 | - 9092 27 | environment: 28 | KAFKA_BROKER_ID: 1 29 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 30 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://kafka:29092 31 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 32 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 33 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 34 | -------------------------------------------------------------------------------- /crawler/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | import re 4 | import sys 5 | from urllib.parse import urlparse 6 | 7 | class SubCrawlHelpers: 8 | 9 | def get_sha256(data): 10 | hash_object = hashlib.sha256(data) 11 | return hash_object.hexdigest() 12 | 13 | def save_content(file_name, data): 14 | with open(file_name, "wb") as file: 15 | file.write(data) 16 | 17 | def defang_url(url): 18 | parsed_url = urlparse(url) 19 | last_dot = parsed_url.netloc.rindex('.') 20 | defanged = parsed_url.netloc[0:last_dot] + '[.]' + parsed_url.netloc[last_dot + 1:] 21 | return url.replace(parsed_url.netloc, defanged).replace('http', 'hxxp') 22 | 23 | def get_config(cfg, collection, key): 24 | try: 25 | return cfg[collection][key] 26 | except Exception as e: 27 | sys.exit("[ENGINE] Error loading configuration: " 28 | + collection + " : " + key) 29 | -------------------------------------------------------------------------------- /crawler/yara-rules/js_webshell_tracking_script.yar: -------------------------------------------------------------------------------- 1 | rule js_webshell_tracking_script : notifier 2 | { 3 | meta: 4 | description = "JavaScript which notifies the attacker when the webshell becomes active" 5 | author = "HP Threat Research @HPSecurity" 6 | filetype = "JavaScript" 7 | maltype = "notifier" 8 | date = "2021-06-08" 9 | 10 | strings: 11 | $a1 = "ndsj===undefined" 12 | $a2 = "ndsw===undefined" 13 | 14 | $b = "function" 15 | 16 | $c = "HttpClient" 17 | 18 | $d1 = "XMLHttpRequest" 19 | $d2 = "Math" 20 | $d3 = "undefined" 21 | 22 | $e1 = "onreadystatechange" 23 | $e2 = "responseText" 24 | $e3 = "random" 25 | $e4 = "ndsx" 26 | $e5 = "GET" 27 | $e6 = "open" 28 | $e7 = "send" 29 | 30 | $f1 = "parseInt" 31 | $f2 = /var\s*\w+\s*\=\s*\[(['\"][\w\.\?\/\:]+['\"][,\]\s]+)+/ 32 | $g = "0x" 33 | 34 | condition: 35 | any of ($a*) and #b > 5 and #c >= 2 and all of ($d*) and (all of ($e*) or (all of ($f*) and #g > 50)) and filesize < 1MB 36 | } 37 | -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | © Copyright 2021 HP Development Company, L.P. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /crawler/utils/sqlite_model.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | from peewee import * 4 | from playhouse.hybrid import hybrid_property 5 | import datetime 6 | 7 | db = SqliteDatabase('utils/subcrawl.db') 8 | 9 | 10 | class BaseModel(Model): 11 | class Meta: 12 | database = db 13 | 14 | 15 | class Domain(BaseModel): 16 | name = CharField(unique=True) 17 | description = TextField(null=True) 18 | 19 | 20 | class Url(BaseModel): 21 | domain = ForeignKeyField(Domain, backref='urls') 22 | url = CharField() 23 | status_code = IntegerField() 24 | title = CharField(null=True) 25 | sha256 = CharField() 26 | last_check = DateTimeField(default=datetime.datetime.utcnow) 27 | 28 | 29 | class Extension(BaseModel): 30 | key = CharField() 31 | value = TextField(null=True) 32 | url = ForeignKeyField(Url, backref='extensions') 33 | 34 | 35 | class Tag(BaseModel): 36 | tag = CharField(unique=True) 37 | description = TextField(null=True) 38 | 39 | 40 | class DomainTag(BaseModel): 41 | domain = ForeignKeyField(Domain, backref='domaintag') 42 | tag = ForeignKeyField(Tag, backref='domaintag') 43 | -------------------------------------------------------------------------------- /crawler/app/templates/domains.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 | 6 |
<!-- page heading (markup not preserved): -->
Domains
<!-- table header (markup not preserved): # | Name | Description | Urls -->
{% for domain in domains %}
<!-- table row (markup not preserved): {{- domain.id }} | {{- domain.name }} | {{- domain.description }} | {{- domain.urls.count() }} -->
{%- endfor %}
34 | {% endblock %} 35 | -------------------------------------------------------------------------------- /crawler/service.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import io 3 | import logging 4 | import os 5 | import sys 6 | from datetime import datetime, timedelta 7 | 8 | import yaml 9 | from timeloop import Timeloop 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from utils import check_topic 12 | 13 | # check if kafka topic exists and create if needed 14 | check_topic() 15 | tl = Timeloop() 16 | 17 | 18 | @tl.job(interval=timedelta(seconds=10)) 19 | def start_crawling(): 20 | with open("config.yml", "r") as ymlfile: 21 | global_cfg = yaml.safe_load(ymlfile) 22 | 23 | if not global_cfg: 24 | sys.exit(0) 25 | 26 | processing_modules = list() 27 | for processing_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "processing_modules"): 28 | processing_modules.append(processing_module) 29 | 30 | storage_modules = list() 31 | for storage_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "storage_modules"): 32 | storage_modules.append(storage_module) 33 | 34 | try: 35 | os.system("/usr/local/bin/python3 subcrawl.py -k -p " + ",".join(processing_modules) + " -s " + ",".join(storage_modules)) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | tl.start(block=True) 41 | -------------------------------------------------------------------------------- /crawler/app/templates/urls.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 | 6 |
<!-- page heading (markup not preserved): -->
Urls
<!-- table header (markup not preserved): # | Url | Status Code | Hash | Scanned -->
{% for url in urls %}
<!-- table row (markup not preserved): {{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }} -->
{%- endfor %}
36 | {% endblock %} 37 | -------------------------------------------------------------------------------- /crawler/processing/yara_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | 4 | import yara 5 | from utils import SubCrawlColors, SubCrawlHelpers 6 | from .default_processing import DefaultProcessing 7 | 8 | 9 | class YARAProcessing(DefaultProcessing): 10 | 11 | cfg = None 12 | rules = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def process(self, url, content): 20 | if not self.rules: 21 | self.rules = yara.compile(filepath=SubCrawlHelpers.get_config( 22 | self.cfg, "crawler", "yara_rules")) 23 | 24 | yara_matches = {} 25 | http_resp = content.decode("latin-1") 26 | 27 | matches = self.rules.match(data=http_resp) 28 | if len(matches) > 0: 29 | self.logger.info(SubCrawlColors.CYAN + "[YARA] Matches - " + 30 | ' '.join(map(str, matches)) + 31 | " (" + url + " )" + SubCrawlColors.RESET) 32 | yara_matches["url"] = url 33 | yara_matches["hash"] = SubCrawlHelpers.get_sha256( 34 | http_resp.encode('utf-8')) 35 | for match in matches: 36 | yara_matches.setdefault("matches", []).append(str(match)) 37 | 38 | return yara_matches 39 | -------------------------------------------------------------------------------- /crawler/processing/clamav_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | from io import BytesIO 4 | 5 | import clamd 6 | 7 | from .default_processing import DefaultProcessing 8 | from utils import SubCrawlColors, SubCrawlHelpers 9 | 10 | # Installation ClamAV for this Module 11 | # sudo apt-get install clamav-daemon clamav-freshclam clamav-unofficial-sigs 12 | # sudo freshclam 13 | # sudo service clamav-daemon start 14 | 15 | 16 | class ClamAVProcessing(DefaultProcessing): 17 | 18 | cfg = None 19 | cd = None 20 | logger = None 21 | 22 | def __init__(self, config, logger): 23 | self.cfg = config 24 | self.logger = logger 25 | self.cd = clamd.ClamdUnixSocket() 26 | 27 | def process(self, url, content): 28 | scan_results = {} 29 | # self.cd = clamd.ClamdUnixSocket() 30 | # pong = self.cd.ping() # Will crash if not correctly installed. Handled in main crawler. 31 | buffer = BytesIO(content) 32 | scan_results = self.cd.instream(buffer) 33 | scan_results['url'] = url 34 | scan_results['hash'] = SubCrawlHelpers.get_sha256(content) 35 | 36 | try: 37 | if "OK" in scan_results['stream']: 38 | scan_results = {} 39 | else: 40 | clamav_status = str(scan_results['stream']).split(',') 41 | label = clamav_status[1].replace("'", '').replace(')', '').strip() 42 | scan_results['matches'] = label 43 | self.logger.info('[CLAMAV] Found - ' + label) 44 | except Exception as e: 45 | self.logger.error('[CLAMAV] ' + str(e)) 46 | scan_results = {} 47 | return scan_results 48 | -------------------------------------------------------------------------------- /crawler/app/templates/search_results.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
<!-- page heading (markup not preserved): -->
Search results
{% if error %}
<!-- error alert (markup not preserved) -->
{% endif %}
<!-- table header (markup not preserved): # | Url | Status Code | Hash | Scanned -->
{% for url in urls %}
<!-- table row (markup not preserved): {{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }} -->
{%- endfor %}
46 | 47 | {% endblock %} 48 | -------------------------------------------------------------------------------- /crawler/input/phishtank.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | import os 6 | import sys 7 | from datetime import datetime, timedelta 8 | from json import dumps, loads 9 | 10 | import requests 11 | from kafka import KafkaConsumer, KafkaProducer 12 | from timeloop import Timeloop 13 | 14 | producer = KafkaProducer(bootstrap_servers=['kafka:9092'], value_serializer=lambda x: dumps(x).encode('utf-8')) 15 | consumer = KafkaConsumer( 16 | 'urls', 17 | bootstrap_servers=['kafka:9092'], 18 | auto_offset_reset='earliest', 19 | enable_auto_commit=False, 20 | group_id='urls-dedup', 21 | consumer_timeout_ms=2000, 22 | auto_commit_interval_ms=1000, 23 | value_deserializer=lambda x: loads(x.decode('utf-8'))) 24 | 25 | PHISHTANK_API = "http://data.phishtank.com/data/online-valid.csv" 26 | tl = Timeloop() 27 | urls = set() 28 | 29 | 30 | # consume all urls from kafka and dedup 31 | def load_urls(): 32 | global urls 33 | try: 34 | for message in consumer: 35 | urls.add(message.value) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | @tl.job(interval=timedelta(seconds=300)) 41 | def phishtank(): 42 | global urls 43 | if len(urls) == 0: 44 | load_urls() 45 | 46 | try: 47 | r = requests.get(PHISHTANK_API, allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | csv_reader = csv.DictReader(csv_data) 50 | for row in csv_reader: 51 | url = row["url"] 52 | if url not in urls: 53 | producer.send('urls', value=url) 54 | urls.add(url) 55 | except Exception as e: 56 | print(e) 57 | pass # Could not download file. Try again in a few seconds. 
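# phishtank.py (above) and urlhaus.py (further down) follow the same input pattern: replay
# the existing Kafka 'urls' topic once to build an in-memory dedup set, poll the public feed
# every 300 seconds, and publish only previously unseen URLs back onto 'urls'.  service.py
# then periodically invokes subcrawl.py with the -k flag and the configured processing and
# storage modules.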
58 | 59 | 60 | tl.start(block=True) 61 | -------------------------------------------------------------------------------- /crawler/yara-rules/combined-rules.yar: -------------------------------------------------------------------------------- 1 | include "./open_webshell.yar" 2 | include "./html_webshell_login.yar" 3 | include "./js_webshell_tracking_script.yar" 4 | include "./php_webshell_backend.yar" 5 | include "./panels.yar" 6 | include "./huntington_phish.yar" 7 | include "./link_sharing_onedrive.yar" 8 | include "./onedrive_business_phish.yar" 9 | include "./base64_pe.yar" 10 | include "./chase_login_spox_phish.yar" 11 | include "./obfuscated_script.yar" 12 | include "./default_page_xampp_windows.yar" 13 | include "./microsoft_phish.yar" 14 | include "./sharepoint_online_phish.yar" 15 | include "./attachments_onedrive_phish.yar" 16 | include "./default_page_apache.yar" 17 | include "./standard_bank_phish.yar" 18 | include "./wallet_connect_phish.yar" 19 | include "./bapr_banking_phish.yar" 20 | include "./agenttesla_webpanel_login.yar" 21 | include "./mana5_panel_login.yar" 22 | include "./mars_panel_login.yar" 23 | include "./pony_panel_login.yar" 24 | include "./amadey_panel_login.yar" 25 | include "./bankamerica_phish.yar" 26 | include "./royalmail_phish.yar" 27 | include "./wellsfargo_phish.yar" 28 | include "./outlook_phish.yar" 29 | include "./sharepoint_dropbox_online_phish.yar" 30 | include "./office365_review_phish.yar" 31 | include "./office365_verify_pdf_phish.yar" 32 | include "./microsoft_login_phish.yar" 33 | include "./hex-encoded-pe-file.yar" 34 | include "./erbium_discord_panel_login.yar" 35 | include "./php_file_manager_login.yar" 36 | include "./collector_stealer_panel_login.yar" 37 | include "./titan_stealer_panel_login.yar" 38 | include "./modernloader_panel_login.yar" 39 | include "./aurora_stealer_panel_login.yar" 40 | include "./grandamisha_panel_login.yar" 41 | include "./acridrain_stealer_panel_login.yar" 42 | include "./unam_webpanel_login.yar" 43 | include "./h3k_tinyfilemanager_login.yar" 44 | include "./base64_shellcode_dos_header_pe.yar" 45 | -------------------------------------------------------------------------------- /crawler/misp-objects/opendir-url/definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "attributes": { 3 | "url": { 4 | "description": "Scanned URL from opendir", 5 | "misp-attribute": "url", 6 | "ui-priority": 1 7 | }, 8 | "sha256": { 9 | "description": "Secure Hash Algorithm 2 (256 bits)", 10 | "misp-attribute": "sha256", 11 | "ui-priority": 1 12 | }, 13 | "content": { 14 | "description": "Plaintext content of URL response", 15 | "disable_correlation": true, 16 | "misp-attribute": "attachment", 17 | "ui-priority": 1 18 | }, 19 | "title": { 20 | "description": "Title of URL response", 21 | "misp-attribute": "text", 22 | "ui-priority": 1 23 | }, 24 | "sdhash": { 25 | "description": "SDhash of URL content", 26 | "misp-attribute": "text", 27 | "ui-priority": 1 28 | }, 29 | "tlsh": { 30 | "description": "Trend Micro Locality Sensitive Hash of URL content", 31 | "misp-attribute": "text", 32 | "ui-priority": 1 33 | }, 34 | "yara": { 35 | "description": "Matching YARA rule", 36 | "misp-attribute": "text", 37 | "ui-priority": 1 38 | }, 39 | "status-code": { 40 | "description": "Status Code of URL response.", 41 | "disable_correlation": true, 42 | "misp-attribute": "text", 43 | "ui-priority": 0 44 | }, 45 | "header": { 46 | "description": "Headers of URL response.", 47 | 
"disable_correlation": true, 48 | "misp-attribute": "text", 49 | "multiple": true, 50 | "ui-priority": 0 51 | } 52 | }, 53 | "description": "A scanresult from an opendir url", 54 | "meta-category": "network", 55 | "name": "opendir-url", 56 | "requiredOneOf": [ 57 | "url", 58 | "sha256" 59 | ], 60 | "uuid": "7b4f16a7-7934-42e8-85ac-5e3415c0be5c", 61 | "version": 9 62 | } 63 | -------------------------------------------------------------------------------- /crawler/input/urlhaus.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | import os 6 | import sys 7 | from datetime import datetime, timedelta 8 | from json import dumps, loads 9 | 10 | import requests 11 | from kafka import KafkaConsumer, KafkaProducer 12 | from timeloop import Timeloop 13 | 14 | producer = KafkaProducer(bootstrap_servers=['kafka:9092'], value_serializer=lambda x: dumps(x).encode('utf-8')) 15 | consumer = KafkaConsumer( 16 | 'urls', 17 | bootstrap_servers=['kafka:9092'], 18 | auto_offset_reset='earliest', 19 | enable_auto_commit=False, 20 | group_id='urls-dedup', 21 | consumer_timeout_ms=2000, 22 | auto_commit_interval_ms=1000, 23 | value_deserializer=lambda x: loads(x.decode('utf-8'))) 24 | 25 | URLHAUS_API = "https://urlhaus.abuse.ch/downloads/csv_recent/" 26 | tl = Timeloop() 27 | urls = set() 28 | 29 | 30 | # consume all urls from kafka and dedup 31 | def load_urls(): 32 | global urls 33 | try: 34 | for message in consumer: 35 | urls.add(message.value) 36 | except Exception as e: 37 | print(e) 38 | 39 | 40 | @tl.job(interval=timedelta(seconds=300)) 41 | def urlhaus(): 42 | global urls 43 | if len(urls) == 0: 44 | load_urls() 45 | 46 | try: 47 | r = requests.get(URLHAUS_API, allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | counter = 0 50 | while counter < 8: 51 | next(csv_data) 52 | counter += 1 53 | 54 | csv_reader = csv.DictReader(csv_data) 55 | for row in csv_reader: 56 | url = row["url"] 57 | if url not in urls: 58 | producer.send('urls', value=url) 59 | urls.add(url) 60 | except Exception as e: 61 | print(e) 62 | pass # Could not download file. Try again in a few seconds. 63 | 64 | 65 | tl.start(block=True) 66 | -------------------------------------------------------------------------------- /crawler/supervisor/supervisord.conf: -------------------------------------------------------------------------------- 1 | ; supervisor config file 2 | 3 | [unix_http_server] 4 | file=/dev/shm/supervisor.sock 5 | chmod=0700 ; sockef file mode (default 0700) 6 | 7 | [supervisord] 8 | ;nodaemon=true 9 | logfile=/var/log/supervisor/supervisord.log ; (main log file;default $CWD/supervisord.log) 10 | pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid) 11 | childlogdir=/var/log/supervisor ; ('AUTO' child log dir, default $TEMP) 12 | 13 | ; the below section must remain in the config file for RPC 14 | ; (supervisorctl/web interface) to work, additional interfaces may be 15 | ; added by defining them in separate rpcinterface: sections 16 | [rpcinterface:supervisor] 17 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 18 | 19 | [supervisorctl] 20 | serverurl=unix:///dev/shm/supervisor.sock 21 | 22 | ; The [include] section can just contain the "files" setting. This 23 | ; setting can list multiple files (separated by whitespace or 24 | ; newlines). It can also contain wildcards. 
The filenames are 25 | ; interpreted as relative to this file. Included files *cannot* 26 | ; include files themselves. 27 | 28 | [include] 29 | files = /etc/supervisor/conf.d/*.conf 30 | 31 | [program:urlhaus] 32 | command=/usr/local/bin/python3 urlhaus.py 33 | directory=/subcrawl/input 34 | autostart=true 35 | autorestart=true 36 | startretries=3 37 | stderr_logfile=/var/log/subcrawl/urlhaus.err.log 38 | stdout_logfile=/var/log/subcrawl/urlhaus.out.log 39 | user=root 40 | 41 | [program:phishtank] 42 | command=/usr/local/bin/python3 phishtank.py 43 | directory=/subcrawl/input 44 | autostart=true 45 | autorestart=true 46 | startretries=3 47 | stderr_logfile=/var/log/subcrawl/phishtank.err.log 48 | stdout_logfile=/var/log/subcrawl/phishtank.out.log 49 | user=root 50 | 51 | [program:subcrawl] 52 | priority=1 53 | command=/usr/local/bin/python3 service.py 54 | directory=/subcrawl 55 | autostart=true 56 | autorestart=true 57 | startretries=3 58 | stderr_logfile=/var/log/subcrawl/subcrawl.err.log 59 | stdout_logfile=/var/log/subcrawl/subcrawl.out.log 60 | user=root 61 | 62 | -------------------------------------------------------------------------------- /crawler/app/templates/dashboard.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Dashboard

7 |
8 |
9 |
10 |
11 |
12 | 13 |
14 |
15 |
16 |
17 |
18 |
19 | 20 |
21 |
22 |
23 | 24 |
25 |
26 |
27 | 28 | {% for tag in tags %} 29 | {{ tag | display_tagname}} ({{ tag.count }}) 30 | {%- endfor %} 31 |
32 |
33 |
34 | 35 |
36 |
37 |
38 |

39 | {% for hash in hashes %} 40 | {{ hash.sha256 }} ({{ hash.count }})
41 | {%- endfor %} 42 |

43 |
44 |
45 | 46 |
47 | 48 | {% endblock %} 49 | -------------------------------------------------------------------------------- /crawler/storage/console_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import json 3 | import pprint 4 | from re import subn 5 | 6 | from utils import SubCrawlColors 7 | from .default_storage import DefaultStorage 8 | 9 | 10 | class ConsoleStorage(DefaultStorage): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | def load_scraped_domains(self): 20 | return [] 21 | 22 | def store_result(self, result_data): 23 | total_urls = 0 24 | 25 | print(SubCrawlColors.PURPLE + "\n" + "*" * 25 + 26 | " CONSOLE STORAGE - SUMMARY " + "*" * 26 + "\n" + 27 | SubCrawlColors.RESET) 28 | 29 | for domain in result_data: 30 | results = dict() 31 | 32 | total_urls += len(result_data[domain]) 33 | 34 | for url_content in result_data[domain]: 35 | for module in url_content["modules"]: 36 | if url_content["modules"][module]: 37 | if len(url_content["modules"][module]) > 0: 38 | results.setdefault(module, []).append(url_content["modules"][module]) 39 | 40 | if len(results) > 0: 41 | print(SubCrawlColors.CYAN + "<===== " + str(domain) + 42 | " =====>"+SubCrawlColors.RESET) 43 | 44 | for payload_module in results: 45 | if payload_module == "JARMProcessing": 46 | for result in results[payload_module]: 47 | print("\t[" + payload_module + "] " + 48 | result["fingerprint"] + " (" + 49 | "port: " + str(result["port"]) + ")" + SubCrawlColors.RESET) 50 | else: 51 | for result in results[payload_module]: 52 | print("\t[" + payload_module + "] " + 53 | str(result['matches']) + "( " + 54 | result['url'] + " )" + SubCrawlColors.RESET) 55 | print("\t\t[SHA256] " + result['hash']) 56 | print("") 57 | 58 | return True 59 | -------------------------------------------------------------------------------- /crawler/utils/logger.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
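# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# SubCrawlLogger (defined below) attaches a colorized console handler and a
# midnight-rotating file handler to a named logger. The engine in subcrawl.py
# builds its logger essentially like this; the file name and level shown here
# are examples only:
#
#   from utils import SubCrawlLogger, SubCrawlLoggerLevels
#   logger = SubCrawlLogger("subcrawl.log", "SubCrawl",
#                           SubCrawlLoggerLevels["INFO"].value).get_logger()
#   logger.info("engine started")
# -------------------------------------------------------------------------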
2 | # source: https://www.toptal.com/python/in-depth-python-logging 3 | 4 | import logging 5 | import sys 6 | import enum 7 | from logging.handlers import TimedRotatingFileHandler 8 | from utils.ansi_colors import SubCrawlColors 9 | 10 | 11 | class SubCrawlLogger(): 12 | 13 | formatter = None 14 | log_file = "" 15 | logger_name = "" 16 | log_level = logging.WARN 17 | 18 | def __init__(self, logfile, logger_name, log_level=logging.WARN): 19 | self.log_file = logfile 20 | self.logger_name = logger_name 21 | self.log_level = log_level 22 | self.formatter = CustomFormatter() 23 | 24 | def get_console_handler(self): 25 | console_handler = logging.StreamHandler(sys.stdout) 26 | console_handler.setFormatter(self.formatter) 27 | return console_handler 28 | 29 | def get_file_handler(self): 30 | file_handler = TimedRotatingFileHandler(self.log_file, when='midnight') 31 | file_handler.setFormatter(self.formatter) 32 | return file_handler 33 | 34 | def get_logger(self): 35 | logger = logging.getLogger(self.logger_name) 36 | logger.setLevel(self.log_level) 37 | logger.addHandler(self.get_file_handler()) 38 | logger.addHandler(self.get_console_handler()) 39 | logger.propagate = False 40 | return logger 41 | 42 | 43 | class SubCrawlLoggerLevels(enum.Enum): 44 | NOTSET = 0 45 | DEBUG = 10 46 | INFO = 20 47 | WARN = 30 48 | ERROR = 40 49 | CRITICAL = 50 50 | 51 | 52 | class CustomFormatter(logging.Formatter): 53 | format = "%(asctime)s — %(name)s — %(levelname)s — %(message)s" 54 | 55 | FORMATS = { 56 | logging.DEBUG: SubCrawlColors.GREEN + format + SubCrawlColors.RESET, 57 | logging.INFO: SubCrawlColors.BLUE + format + SubCrawlColors.RESET, 58 | logging.WARNING: SubCrawlColors.YELLOW + format + SubCrawlColors.RESET, 59 | logging.ERROR: SubCrawlColors.RED + format + SubCrawlColors.RESET, 60 | logging.CRITICAL: SubCrawlColors.RED + format + SubCrawlColors.RESET 61 | } 62 | 63 | def format(self, record): 64 | log_fmt = self.FORMATS.get(record.levelno) 65 | formatter = logging.Formatter(log_fmt) 66 | return formatter.format(record) 67 | -------------------------------------------------------------------------------- /crawler/processing/sdhash_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | ##### 3 | # Probably not the easiest module to install. Needs protobuf-2.5.0 and python3.6 and of course sdhash 4 | # 5 | # Protobuf installation: 6 | # > apt-get update 7 | # > apt-get -y install libssl-dev libevent-pthreads-2.1-6 libomp-dev g++ 8 | # > apt-get -y install autoconf automake libtool curl make g++ unzip 9 | # > wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.zip 10 | # > unzip protobuf-2.5.0.zip 11 | # > cd protobuf-2.5.0 12 | # > ./configure 13 | # > make 14 | # > sudo make install 15 | # 16 | # Python3.6 installation. 17 | # > apt-get install python3.6-dev 18 | # > sudo ldconfig 19 | # 20 | # SdHash installation: 21 | # Use binaries from folder minisdhash or compile itself. If you chose the later -> have fun. 
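# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# With the prerequisites above in place, the SWIG wrapper used below
# (minisdhash/sdbf_class.py) can also be exercised on its own: build an sdbf
# digest from a file path, serialize it with to_string(), and score two
# digests against each other with compare(). The file names are placeholders;
# note that process() further down skips responses smaller than 512 bytes.
#
#   from minisdhash import sdbf_class as sdhash
#   a = sdhash.sdbf("/tmp/sample_a.bin", 0)
#   b = sdhash.sdbf("/tmp/sample_b.bin", 0)
#   print(a.to_string())      # the sdbf digest as a string
#   print(a.compare(b, 0))    # similarity score, 0 = no block sampling
# -------------------------------------------------------------------------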
22 | # 23 | 24 | import os 25 | import random 26 | import string 27 | 28 | from .default_processing import DefaultProcessing 29 | from .minisdhash import sdbf_class as sdhash 30 | from utils import SubCrawlHelpers, SubCrawlColors 31 | 32 | 33 | class SDhashProcessing(DefaultProcessing): 34 | 35 | cfg = None 36 | logger = None 37 | 38 | def __init__(self, config, logger): 39 | self.cfg = config 40 | self.logger = logger 41 | 42 | def save_content(self, data): 43 | try: 44 | letters = string.ascii_lowercase 45 | filename = ''.join(random.choice(letters) for i in range(10)) 46 | with open(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + filename, "wb") as file: 47 | file.write(data) 48 | return filename 49 | except Exception as e: 50 | self.logger.error("[SDHASH] Error: " + str(e)) 51 | return "" 52 | 53 | def process(self, url, content): 54 | sd_result = {} 55 | if len(content) < 512: 56 | return {} 57 | 58 | try: 59 | file_name = self.save_content(content) 60 | if file_name: 61 | sd = sdhash.sdbf(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + file_name, 0) 62 | sd_result["sdhash"] = sd.to_string() 63 | sd_result["url"] = url 64 | os.remove(SubCrawlHelpers.get_config(self.cfg, "crawler", "temp_dir") + file_name,) 65 | except Exception as e: 66 | self.logger.error("[SDHASH] Error: " + str(e)) 67 | return sd_result 68 | -------------------------------------------------------------------------------- /crawler/config.yml: -------------------------------------------------------------------------------- 1 | crawler: 2 | batch_size: 250 3 | log_level: INFO 4 | scan_simple_domains: False 5 | host_max_crawl_depth: 2 6 | follow_redirects: False 7 | download_dir: samples/ 8 | tmp_dir: tmp/ 9 | save_payload_content: False 10 | yara_rules: yara-rules/combined-rules.yar 11 | logos_path: utils/logos/ 12 | tag_line: ~~ Harvesting the Open Web ~~ 13 | http_request_timeout: 10 14 | delay_execution_time: 0 15 | http_download_timeout: 60 16 | http_max_size: 26214400 17 | processing_modules: 18 | - ClamAVProcessing 19 | - JARMProcessing 20 | - TLSHProcessing 21 | - YARAProcessing 22 | storage_modules: 23 | - SqliteStorage 24 | opendir_title: 25 | - index of 26 | - directory listing for 27 | ext_exclude: 28 | - .js 29 | - .css 30 | - .eot 31 | - .woff 32 | - .woff2 33 | - .png 34 | - .jpg 35 | - .jpeg 36 | - .gif 37 | - .json 38 | - .scss 39 | - .md 40 | - tinymce.php 41 | - .mp4 42 | - .mp3 43 | - .mo 44 | - .svg 45 | - .po 46 | - .crt 47 | - .phar 48 | - .map 49 | - .xml 50 | - .pdf 51 | - .ico 52 | - .ttf 53 | - .go 54 | - .psd 55 | - .csv 56 | - .xap 57 | - .ts 58 | - .stub 59 | - .tpl 60 | - .h 61 | archive_magics: 62 | - zip archive data 63 | pe_magics: 64 | - pe32 65 | - ms-dos 66 | php_magics: 67 | - php script 68 | office_magics: 69 | - "application: microsoft" 70 | - microsoft ooxml 71 | - microsoft excel 72 | - microsoft word 73 | elf_magics: 74 | - "ELF 64" 75 | - "ELF 32" 76 | java_magics: 77 | - "Java archive data" 78 | headers: 79 | User-Agent: Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36 80 | DNT: "1" 81 | Pragma: no-cache 82 | Cache-Control: no-cache 83 | urlhaus_api: https://urlhaus.abuse.ch/downloads/csv_recent/ 84 | misp: 85 | misp_url: https://localhost 86 | misp_api_key: API_KEY_GOES_HERE 87 | domain_event: 0 88 | elasticsearch: 89 | host: localhost 90 | port: 9200 91 | index: subcrawl 92 | archive_response_content: False 93 | archive_log_location: "log/" 94 | external_intel: 95 | vt_api: 96 | 
urlhaus_api: 97 | bazaar_api: 98 | submit_urlhaus: False 99 | submit_bazaar: False 100 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #VS Code 2 | .vscode 3 | 4 | # SubCrawl specific 5 | subcrawl.log.* 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | -------------------------------------------------------------------------------- /crawler/app/templates/url_details.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Url Information

7 |
8 |
9 | 10 | 11 | 12 |
13 | 14 | 15 |
16 | 17 |
18 | 19 | 20 |
21 | 22 |
23 | 24 | 25 |
26 | 27 |
28 | 29 | 30 |
31 | 32 |
33 | 34 | 35 |
36 | 37 | {% for ext in extensions %} 38 |
39 | 40 | 41 |
42 | {%- endfor %} 43 | 44 | 45 | 46 | 47 | 48 | 49 | 63 | 64 |
65 | 66 | 81 | 82 | {% endblock %} 83 | -------------------------------------------------------------------------------- /crawler/app/templates/domain_details.html: -------------------------------------------------------------------------------- 1 | 2 | {% extends 'base.html' %} 3 | 4 | {% block content %} 5 |
6 |

Domain Information

7 |
8 |
9 | 10 |
11 | 12 |
13 | 14 | 15 |
16 | 17 |
18 | 19 | 20 |
21 | 22 |
23 | 24 | {% for tag in tags %} 25 | {{ tag.tag }} 26 | {%- endfor %} 27 |

28 | 29 | 30 | 31 | 32 | 33 |
34 | 35 | 49 | 50 |

Urls

51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | {% for url in urls %} 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | {%- endfor %} 73 | 74 | 75 |
# | Url | Status Code | Hash | Scanned
{{- url.id }} | {{- url.url }} | {{- url.status_code }} | {{- url.sha256 }} | {{- url.last_check }}
76 |
77 | 78 | 93 | 94 | {% endblock %} 95 | -------------------------------------------------------------------------------- /crawler/processing/payload_processing.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import hashlib 3 | import os 4 | import magic 5 | from utils import SubCrawlColors, SubCrawlHelpers 6 | 7 | from .default_processing import DefaultProcessing 8 | 9 | 10 | class PayloadProcessing(DefaultProcessing): 11 | 12 | cfg = None 13 | logger = None 14 | 15 | def __init__(self, config, logger): 16 | self.cfg = config 17 | self.logger = logger 18 | 19 | if not os.path.exists(SubCrawlHelpers.get_config( 20 | self.cfg, "crawler", "download_dir")): 21 | os.makedirs(SubCrawlHelpers.get_config( 22 | self.cfg, "crawler", "download_dir")) 23 | 24 | def process(self, url, content): 25 | payload = {} 26 | content_match = True 27 | file_ext = "" 28 | 29 | shasum = SubCrawlHelpers.get_sha256(content) 30 | content_magic = magic.from_buffer(content).lower() 31 | matches = content_magic 32 | 33 | if any(partial in content_magic for partial in 34 | SubCrawlHelpers.get_config(self.cfg, "crawler", "pe_magics")): 35 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] PE file found " + 36 | url + " (" + shasum + ")" 37 | + SubCrawlColors.RESET) 38 | 39 | file_ext = ".bin" 40 | if "(dll)" in content_magic: 41 | file_ext = ".dll" + file_ext 42 | elif "x86-64" in content_magic: 43 | file_ext = ".64.exe" + file_ext 44 | else: 45 | file_ext = ".exe" + file_ext 46 | 47 | elif any(partial in content_magic for partial in 48 | SubCrawlHelpers.get_config(self.cfg, "crawler", 49 | "archive_magics")): 50 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] ZIP found at " + 51 | url + " (" + shasum + ")" + 52 | SubCrawlColors.RESET) 53 | file_ext = ".zip.bin" 54 | elif any(partial in content_magic for partial in 55 | SubCrawlHelpers.get_config(self.cfg, "crawler", 56 | "php_magics")): 57 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] PHP found at " + 58 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 59 | SubCrawlColors.RESET) 60 | file_ext = ".php.bin" 61 | elif any(partial in content_magic for partial in 62 | SubCrawlHelpers.get_config(self.cfg, "crawler", 63 | "office_magics")): 64 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] Doc found at " + 65 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 66 | SubCrawlColors.RESET) 67 | file_ext = ".office.bin" 68 | elif any(partial in content_magic for partial in 69 | SubCrawlHelpers.get_config(self.cfg, "crawler", 70 | "elf_magics")): 71 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] ELF found at " + 72 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 73 | SubCrawlColors.RESET) 74 | file_ext = ".elf.bin" 75 | elif any(partial in content_magic for partial in 76 | SubCrawlHelpers.get_config(self.cfg, "crawler", 77 | "java_magics")): 78 | self.logger.info(SubCrawlColors.CYAN + "[PAYLOAD] Java found at " + 79 | SubCrawlHelpers.defang_url(url) + " (" + shasum + ")" + 80 | SubCrawlColors.RESET) 81 | else: 82 | content_match = False 83 | 84 | if content_match: 85 | payload = {"hash": shasum, "url": url, "matches": matches} 86 | 87 | if content_match and \ 88 | SubCrawlHelpers.get_config(self.cfg, "crawler", 89 | "save_payload_content"): 90 | try: 91 | SubCrawlHelpers.save_content( 92 | self.cfg['crawler']['download_dir'] + 93 | shasum + file_ext, content) 94 | self.logger.info(SubCrawlColors.CYAN + 95 | "[PAYLOAD] Saved file " + 96 | 
SubCrawlHelpers.defang_url(url) + 97 | SubCrawlColors.RESET) 98 | except Exception as e: 99 | self.logger.error("[PAYLOAD] " + str(e)) 100 | pass 101 | 102 | return payload 103 | -------------------------------------------------------------------------------- /crawler/storage/sqlite_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import csv 3 | import io 4 | import logging 5 | from io import StringIO 6 | from urllib.parse import urlparse 7 | 8 | import requests 9 | from utils import Domain, DomainTag, Extension, Tag, Url, db, fn 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class SqliteStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | 19 | def __init__(self, config, logger): 20 | self.cfg = config 21 | self.logger = logger 22 | 23 | def load_scraped_domains(self): 24 | domains = Domain.select() 25 | return domains 26 | 27 | def store_result(self, result_data): 28 | # Load URLHaus tags 29 | url_info = dict() 30 | r = requests.get(SubCrawlHelpers.get_config(self.cfg, "crawler", "urlhaus_api"), allow_redirects=True) 31 | csv_data = io.StringIO(r.content.decode("utf-8")) 32 | counter = 0 33 | while counter < 8: 34 | next(csv_data) 35 | counter += 1 36 | 37 | csv_reader = csv.DictReader(csv_data) 38 | for row in csv_reader: 39 | domain = urlparse(row["url"]).netloc 40 | if domain not in url_info: 41 | url_info[domain] = set() 42 | url_info[domain].update(row["tags"].lower().split(",")) 43 | 44 | for domain in result_data: 45 | tags = [] 46 | if domain in url_info: 47 | tags = url_info[domain] 48 | 49 | if len(result_data[domain]) > 0: 50 | domains = Domain.select().where(Domain.name == domain) 51 | 52 | if len(domains) > 0: 53 | ref_domain = domains[0] 54 | else: 55 | ref_domain = Domain(name=domain) 56 | ref_domain.save() 57 | 58 | for tag in tags: 59 | db_tag = Tag.select().where(Tag.tag == tag) 60 | if len(db_tag) == 0: 61 | db_tag = Tag(tag=tag) 62 | db_tag.save() 63 | dt = DomainTag(domain=ref_domain, tag=db_tag) 64 | dt.save() 65 | 66 | for url_content in result_data[domain]: 67 | 68 | url = Url(domain=ref_domain, url=str(url_content["url"]), status_code=url_content["data"]["resp"]["status_code"], title=str(url_content["data"]["title"]), sha256=str(url_content["sha256"])) 69 | url.save() 70 | 71 | if "index of" in str(url_content["data"]["title"]).lower(): 72 | db_tag = Tag.select().where(Tag.tag == "opendir") 73 | if len(db_tag) == 0: 74 | db_tag = Tag(tag="opendir") 75 | db_tag.save() 76 | 77 | dt = DomainTag.select().where(DomainTag.domain == ref_domain, DomainTag.tag == db_tag) 78 | if len(dt) == 0: 79 | dt = DomainTag(domain=ref_domain, tag=db_tag) 80 | dt.save() 81 | 82 | for header in url_content["data"]["resp"]["headers"]: 83 | ext = Extension(key=str(header).lower(), value=url_content["data"]["resp"]["headers"][header], url=url) 84 | ext.save() 85 | 86 | try: 87 | for module in url_content["modules"]: 88 | if len(url_content["modules"][module]) > 0: 89 | if module == "JARMProcessing": 90 | ext = Extension(key="jarm", value=str(url_content["modules"][module]["fingerprint"]), url=url) 91 | ext.save() 92 | 93 | elif module == "SDhashProcessing": 94 | ext = Extension(key="sdhash", value=str(url_content["modules"][module]["sdhash"]), url=url) 95 | ext.save() 96 | 97 | elif module == "TLSHProcessing": 98 | ext = Extension(key="tlsh", value=str(url_content["modules"][module]["tlsh"]), url=url) 99 
| ext.save() 100 | 101 | elif module == "YARAProcessing": 102 | for rule in url_content["modules"][module]["rules"]: 103 | ext = Extension(key="yara", value=str(rule), url=url) 104 | ext.save() 105 | 106 | except Exception as e: 107 | self.logger.error('[SQLite] ' + str(e)) 108 | 109 | self.logger.info("[SQLite] Scan results stored: " + domain) 110 | -------------------------------------------------------------------------------- /crawler/storage/elastic_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | import zipfile 4 | import io 5 | from datetime import datetime 6 | from urllib.parse import urlparse 7 | from re import subn 8 | from elasticsearch import Elasticsearch, helpers 9 | 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class ElasticStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | es = None 19 | index = None 20 | archive_location = None 21 | archive_content = False 22 | max_fields = 0 23 | 24 | def __init__(self, config, logger): 25 | self.cfg = config 26 | self.logger = logger 27 | self.archive_location = SubCrawlHelpers.get_config(self.cfg,'elasticsearch', 'archive_log_location') 28 | self.archive_content = SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'archive_response_content') 29 | self.index = SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'index') 30 | 31 | try: 32 | self.es = Elasticsearch([{'host': SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'host'), 33 | 'port': SubCrawlHelpers.get_config(self.cfg, 'elasticsearch', 'port'), 34 | 'index': self.index}]) 35 | self.es.ping() 36 | 37 | if not self.es.indices.exists(self.index): 38 | self.logger.debug('[ELASTIC] Index did not exist, creating: ' + self.index) 39 | self.es.indices.create(index=self.index) 40 | 41 | if self.archive_content: 42 | if not os.path.isdir(self.archive_location): 43 | os.mkdir(self.archive_location) 44 | self.logger.debug('[ELASTIC] Response content being saved, log created at: ' + self.archive_location) 45 | 46 | except Exception as e: 47 | self.logger.error('[ELASTIC] Problem connecting to Elastic: ' + str(e)) 48 | raise e 49 | 50 | def load_scraped_domains(self): 51 | return [] 52 | 53 | def normalize_field_name(self, field_name): 54 | return field_name.replace(' ','_').replace('-','_').lower() 55 | 56 | def store_content(self, content_buffer, file_name): 57 | 58 | try: 59 | tmp_buffer = io.BytesIO() 60 | 61 | with zipfile.ZipFile(tmp_buffer, mode='w',compression=zipfile.ZIP_DEFLATED) as zip_file: 62 | zip_file.writestr('http.response.payload', str.encode(content_buffer,'utf-8')) 63 | 64 | with open(self.archive_location + file_name,'wb') as tmp_zip: 65 | tmp_zip.write(tmp_buffer.getvalue()) 66 | 67 | except Exception as ex: 68 | self.logger.error('[ELASTIC] Problem adding data: ' + str(ex)) 69 | 70 | 71 | def store_result(self, result_data): 72 | data = {} 73 | doc_list = [] 74 | 75 | try: 76 | for domain in result_data: 77 | for url_content in result_data[domain]: 78 | field_cnt = 0 79 | parsed_url = urlparse(url_content['url']) 80 | 81 | data = { 82 | 'http.request.url': url_content['url'], 83 | 'http.request.scheme': parsed_url.scheme, 84 | 'http.request.netloc': parsed_url.netloc, 85 | 'http.request.path': parsed_url.path, 86 | 'http.request.params': parsed_url.params, 87 | 'http.request.query': parsed_url.query, 88 | 'http.request.fragment': parsed_url.fragment, 89 | 
'crawled_on': url_content['scraped_on'], 90 | 'http.response.body.content.sha256': url_content['sha256'], 91 | 'http.response.body.content_magic': url_content['content_type'], 92 | 'http.signature': url_content['signature'], 93 | 'http.response.title': url_content['data']['title'], 94 | 'http.response.status_code': url_content['data']['resp']['status_code'], 95 | } 96 | 97 | for header in url_content['data']['resp']['headers']: 98 | data['http.response.header.' + self.normalize_field_name(header)] = url_content['data']['resp']['headers'][header] 99 | 100 | for module in url_content['modules']: 101 | if len(url_content['modules'][module]) > 0: 102 | if module == 'YARAProcessing': 103 | data['yara_results'] = url_content['modules'][module]['matches'] 104 | 105 | if self.archive_content: 106 | tmp_dt = datetime.strptime(url_content['scraped_on'][:-7], '%Y-%m-%dT%H:%M:%S') 107 | self.store_content(url_content['data']['text'],str(int(tmp_dt.timestamp())) + '_' + url_content['sha256']) 108 | 109 | doc_list.append(data) 110 | 111 | helpers.bulk( 112 | self.es, 113 | doc_list, 114 | index=self.index 115 | ) 116 | 117 | self.logger.info('[ELASTIC] added ' + str(len(doc_list)) + ' items') 118 | 119 | except Exception as e: 120 | self.logger.error('[ELASTIC] Problem adding data: ' + str(e)) 121 | -------------------------------------------------------------------------------- /crawler/app/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 15 | 16 | 17 | 18 | SubCrawl 19 | 24 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 |
107 | 118 |
119 |
120 | 137 |
138 |
139 | 149 |
153 |
154 | {% block content%} 155 | 156 | {% endblock %} 157 |
158 |
159 |
160 | 183 | 184 |
185 |
Loading
186 |
187 | 188 | 189 | -------------------------------------------------------------------------------- /crawler/processing/minisdhash/sdbf_class.py: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by SWIG (http://www.swig.org). 2 | # Version 3.0.12 3 | # 4 | # Do not make changes to this file unless you know what you are doing--modify 5 | # the SWIG interface file instead. 6 | 7 | from sys import version_info as _swig_python_version_info 8 | if _swig_python_version_info >= (2, 7, 0): 9 | def swig_import_helper(): 10 | import importlib 11 | pkg = __name__.rpartition('.')[0] 12 | mname = '.'.join((pkg, '_sdbf_class')).lstrip('.') 13 | try: 14 | return importlib.import_module(mname) 15 | except ImportError: 16 | return importlib.import_module('_sdbf_class') 17 | _sdbf_class = swig_import_helper() 18 | del swig_import_helper 19 | elif _swig_python_version_info >= (2, 6, 0): 20 | def swig_import_helper(): 21 | from os.path import dirname 22 | import imp 23 | fp = None 24 | try: 25 | fp, pathname, description = imp.find_module('_sdbf_class', [dirname(__file__)]) 26 | except ImportError: 27 | import _sdbf_class 28 | return _sdbf_class 29 | try: 30 | _mod = imp.load_module('_sdbf_class', fp, pathname, description) 31 | finally: 32 | if fp is not None: 33 | fp.close() 34 | return _mod 35 | _sdbf_class = swig_import_helper() 36 | del swig_import_helper 37 | else: 38 | import _sdbf_class 39 | del _swig_python_version_info 40 | 41 | try: 42 | _swig_property = property 43 | except NameError: 44 | pass # Python < 2.2 doesn't have 'property'. 45 | 46 | try: 47 | import builtins as __builtin__ 48 | except ImportError: 49 | import __builtin__ 50 | 51 | def _swig_setattr_nondynamic(self, class_type, name, value, static=1): 52 | if (name == "thisown"): 53 | return self.this.own(value) 54 | if (name == "this"): 55 | if type(value).__name__ == 'SwigPyObject': 56 | self.__dict__[name] = value 57 | return 58 | method = class_type.__swig_setmethods__.get(name, None) 59 | if method: 60 | return method(self, value) 61 | if (not static): 62 | if _newclass: 63 | object.__setattr__(self, name, value) 64 | else: 65 | self.__dict__[name] = value 66 | else: 67 | raise AttributeError("You cannot add attributes to %s" % self) 68 | 69 | 70 | def _swig_setattr(self, class_type, name, value): 71 | return _swig_setattr_nondynamic(self, class_type, name, value, 0) 72 | 73 | 74 | def _swig_getattr(self, class_type, name): 75 | if (name == "thisown"): 76 | return self.this.own() 77 | method = class_type.__swig_getmethods__.get(name, None) 78 | if method: 79 | return method(self) 80 | raise AttributeError("'%s' object has no attribute '%s'" % (class_type.__name__, name)) 81 | 82 | 83 | def _swig_repr(self): 84 | try: 85 | strthis = "proxy of " + self.this.__repr__() 86 | except __builtin__.Exception: 87 | strthis = "" 88 | return "<%s.%s; %s >" % (self.__class__.__module__, self.__class__.__name__, strthis,) 89 | 90 | try: 91 | _object = object 92 | _newclass = 1 93 | except __builtin__.Exception: 94 | class _object: 95 | pass 96 | _newclass = 0 97 | 98 | KB = _sdbf_class.KB 99 | 100 | def new_intp(): 101 | return _sdbf_class.new_intp() 102 | new_intp = _sdbf_class.new_intp 103 | 104 | def copy_intp(value): 105 | return _sdbf_class.copy_intp(value) 106 | copy_intp = _sdbf_class.copy_intp 107 | 108 | def delete_intp(obj): 109 | return _sdbf_class.delete_intp(obj) 110 | delete_intp = _sdbf_class.delete_intp 111 | 112 | def intp_assign(obj, value): 113 | return 
_sdbf_class.intp_assign(obj, value) 114 | intp_assign = _sdbf_class.intp_assign 115 | 116 | def intp_value(obj): 117 | return _sdbf_class.intp_value(obj) 118 | intp_value = _sdbf_class.intp_value 119 | class sdbf_conf(_object): 120 | __swig_setmethods__ = {} 121 | __setattr__ = lambda self, name, value: _swig_setattr(self, sdbf_conf, name, value) 122 | __swig_getmethods__ = {} 123 | __getattr__ = lambda self, name: _swig_getattr(self, sdbf_conf, name) 124 | __repr__ = _swig_repr 125 | 126 | def __init__(self, thread_cnt, warnings, max_elem_ct, max_elem_ct_dd): 127 | this = _sdbf_class.new_sdbf_conf(thread_cnt, warnings, max_elem_ct, max_elem_ct_dd) 128 | try: 129 | self.this.append(this) 130 | except __builtin__.Exception: 131 | self.this = this 132 | __swig_destroy__ = _sdbf_class.delete_sdbf_conf 133 | __del__ = lambda self: None 134 | sdbf_conf_swigregister = _sdbf_class.sdbf_conf_swigregister 135 | sdbf_conf_swigregister(sdbf_conf) 136 | 137 | class sdbf(_object): 138 | __swig_setmethods__ = {} 139 | __setattr__ = lambda self, name, value: _swig_setattr(self, sdbf, name, value) 140 | __swig_getmethods__ = {} 141 | __getattr__ = lambda self, name: _swig_getattr(self, sdbf, name) 142 | __repr__ = _swig_repr 143 | 144 | def __init__(self, *args): 145 | this = _sdbf_class.new_sdbf(*args) 146 | try: 147 | self.this.append(this) 148 | except __builtin__.Exception: 149 | self.this = this 150 | __swig_destroy__ = _sdbf_class.delete_sdbf 151 | __del__ = lambda self: None 152 | 153 | def name(self): 154 | return _sdbf_class.sdbf_name(self) 155 | 156 | def size(self): 157 | return _sdbf_class.sdbf_size(self) 158 | 159 | def input_size(self): 160 | return _sdbf_class.sdbf_input_size(self) 161 | 162 | def compare(self, other, sample): 163 | return _sdbf_class.sdbf_compare(self, other, sample) 164 | 165 | def to_string(self): 166 | return _sdbf_class.sdbf_to_string(self) 167 | 168 | def get_index_results(self): 169 | return _sdbf_class.sdbf_get_index_results(self) 170 | 171 | def clone_filter(self, position): 172 | return _sdbf_class.sdbf_clone_filter(self, position) 173 | 174 | def filter_count(self): 175 | return _sdbf_class.sdbf_filter_count(self) 176 | __swig_setmethods__["config"] = _sdbf_class.sdbf_config_set 177 | __swig_getmethods__["config"] = _sdbf_class.sdbf_config_get 178 | if _newclass: 179 | config = _swig_property(_sdbf_class.sdbf_config_get, _sdbf_class.sdbf_config_set) 180 | if _newclass: 181 | get_elem_count = staticmethod(_sdbf_class.sdbf_get_elem_count) 182 | else: 183 | get_elem_count = _sdbf_class.sdbf_get_elem_count 184 | sdbf_swigregister = _sdbf_class.sdbf_swigregister 185 | sdbf_swigregister(sdbf) 186 | cvar = _sdbf_class.cvar 187 | 188 | def sdbf_get_elem_count(mine, index): 189 | return _sdbf_class.sdbf_get_elem_count(mine, index) 190 | sdbf_get_elem_count = _sdbf_class.sdbf_get_elem_count 191 | 192 | # This file is compatible with both classic and new-style classes. 193 | 194 | 195 | -------------------------------------------------------------------------------- /crawler/storage/misp_storage.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 
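# ---- Illustrative sketch (editorial, not part of the upstream file) ----
# The storage module below maps every crawled URL onto the custom
# 'opendir-url' MISP object template shipped in misp-objects/ (see
# definition.json earlier in this repository). The core of that mapping looks
# roughly like this; the attribute values are placeholders:
#
#   obj = MISPObject(name='opendir-url', strict=True,
#                    misp_objects_path_custom='./misp-objects')
#   obj.add_attribute('url', value='http://example.com/open/dir/')
#   obj.add_attribute('sha256', value='<sha256 of the response body>')
#   obj.add_attribute('title', value='Index of /open/dir')
#   misp.add_object(event, obj)   # attach the object to the per-domain event
# -------------------------------------------------------------------------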
2 | import csv 3 | import io 4 | import logging 5 | from io import StringIO 6 | from urllib.parse import urlparse 7 | 8 | import requests 9 | from pymisp import ExpandedPyMISP, MISPAttribute, MISPEvent, MISPObject 10 | from utils import SubCrawlColors, SubCrawlHelpers 11 | from .default_storage import DefaultStorage 12 | 13 | 14 | class MISPStorage(DefaultStorage): 15 | 16 | cfg = None 17 | logger = None 18 | 19 | def __init__(self, config, logger): 20 | logging.getLogger("pymisp").setLevel(logging.CRITICAL) 21 | self.cfg = config 22 | self.logger = logger 23 | 24 | def load_scraped_domains(self): 25 | misp = ExpandedPyMISP(SubCrawlHelpers.get_config(self.cfg, "misp", "misp_url"), SubCrawlHelpers.get_config(self.cfg, "misp", "misp_api_key"), False) 26 | 27 | domains = set() 28 | domain_event = None 29 | if SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event") != 0: 30 | domain_event = misp.get_event(SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event"), pythonify=True) 31 | for att in domain_event.attributes: 32 | if att.type == "domain": 33 | domains.add(att.value) 34 | else: 35 | self.logger.warning('[MISP] No domain MISP event configured') 36 | 37 | return domains 38 | 39 | def store_result(self, result_data): 40 | misp = ExpandedPyMISP(SubCrawlHelpers.get_config(self.cfg, "misp", "misp_url"), SubCrawlHelpers.get_config(self.cfg, "misp", "misp_api_key"), False) 41 | 42 | domain_event = None 43 | if SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event") != 0: 44 | domain_event = misp.get_event(SubCrawlHelpers.get_config(self.cfg, "misp", "domain_event"), pythonify=True) 45 | 46 | url_info = dict() 47 | r = requests.get(SubCrawlHelpers.get_config(self.cfg, "crawler", "urlhaus_api"), allow_redirects=True) 48 | csv_data = io.StringIO(r.content.decode("utf-8")) 49 | counter = 0 50 | while counter < 8: 51 | next(csv_data) 52 | counter += 1 53 | 54 | csv_reader = csv.DictReader(csv_data) 55 | for row in csv_reader: 56 | domain = urlparse(row["url"]).netloc 57 | if domain not in url_info: 58 | url_info[domain] = set() 59 | url_info[domain].update(row["tags"].lower().split(",")) 60 | 61 | for domain in result_data: 62 | tags = [] 63 | if domain in url_info: 64 | tags = url_info[domain] 65 | 66 | if len(result_data[domain]) > 0: 67 | 68 | jarm_added = False 69 | event_data = misp.search_index(eventinfo=domain, pythonify=True) 70 | if len(event_data) > 0: 71 | event = event_data[0] 72 | else: 73 | event = MISPEvent() 74 | event.distribution = 1 75 | event.threat_level_id = 4 76 | event.analysis = 1 77 | event.info = domain 78 | 79 | for tag in tags: 80 | event.add_tag(tag) 81 | event.add_tag("tlp:green") 82 | 83 | event = misp.add_event(event, pythonify=True) 84 | 85 | server_created = False 86 | scripttech_created = False 87 | 88 | attribute = MISPAttribute() 89 | attribute.type = "domain" 90 | attribute.value = domain 91 | misp.add_attribute(event, attribute) 92 | if domain_event: 93 | dom_attribute = MISPAttribute() # Not beautiful but new attribute must be generated due to the UUID 94 | dom_attribute.type = "domain" 95 | dom_attribute.value = domain 96 | misp.add_attribute(domain_event, dom_attribute) 97 | 98 | for url_content in result_data[domain]: 99 | 100 | obj = MISPObject(name='opendir-url', strict=True, misp_objects_path_custom='./misp-objects') 101 | obj.add_attribute('url', value=str(url_content["url"])) 102 | obj.add_attribute('sha256', value=str(url_content["sha256"])) 103 | 104 | # obj.add_attribute("content", value=content_data[:20], data=content_data, 
expand='store_true') 105 | 106 | if "index of" in str(url_content["data"]["title"]).lower(): 107 | event.add_tag("opendir") 108 | misp.update_event(event) 109 | 110 | obj.add_attribute('title', value=str(url_content["data"]["title"])) 111 | obj.add_attribute('status-code', value=url_content["data"]["resp"]["status_code"]) 112 | 113 | for header in url_content["data"]["resp"]["headers"]: 114 | obj.add_attribute('header', comment=header, value=url_content["data"]["resp"]["headers"][header]) 115 | 116 | if not server_created: 117 | if "Server" in url_content["data"]["resp"]["headers"]: 118 | attribute = MISPAttribute() 119 | attribute.type = "other" 120 | attribute.comment = "Webserver" 121 | attribute.value = url_content["data"]["resp"]["headers"]["Server"] 122 | misp.add_attribute(event, attribute) 123 | server_created = True 124 | 125 | if not scripttech_created: 126 | if "X-Powered-By" in url_content["data"]["resp"]["headers"]: 127 | attribute = MISPAttribute() 128 | attribute.type = "other" 129 | attribute.comment = "Scripting Technology" 130 | attribute.value = url_content["data"]["resp"]["headers"]["X-Powered-By"] 131 | misp.add_attribute(event, attribute) 132 | scripttech_created = True 133 | 134 | try: 135 | for module in url_content["modules"]: 136 | if len(url_content["modules"][module]) > 0: 137 | if module == "JARMProcessing" and not jarm_added: 138 | jarm_obj = MISPObject(name='jarm', strict=True) 139 | jarm_obj.add_attribute("jarm", value=str(url_content["modules"][module]["fingerprint"])) 140 | misp.add_object(event, jarm_obj) 141 | jarm_added = True 142 | elif module == "SDhashProcessing": 143 | obj.add_attribute('sdhash', value=str(url_content["modules"][module]["sdhash"])) 144 | elif module == "TLSHProcessing": 145 | obj.add_attribute('tlsh', value=str(url_content["modules"][module]["tlsh"])) 146 | elif module == "YARAProcessing": 147 | for rule in url_content["modules"][module]["rules"]: 148 | obj.add_attribute('yara', value=str(rule)) 149 | 150 | except Exception as e: 151 | self.logger.error('[MISP] ' + str(e)) 152 | 153 | misp.add_object(event, obj) 154 | 155 | misp.publish(event) 156 | self.logger.info("[MISP] Event created: " + domain) 157 | 158 | if domain_event: 159 | misp.publish(domain_event) 160 | -------------------------------------------------------------------------------- /crawler/app/main.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import os 3 | import falcon 4 | from jinja2 import Environment, FileSystemLoader 5 | from utils import db, Domain, Url, Extension, Tag, DomainTag, fn 6 | 7 | db.connect() 8 | if len(db.get_tables()) == 0: 9 | db.create_tables([Domain, Url, Extension, Tag, DomainTag]) 10 | 11 | colors = ["orange", "yellow", "olive", "green", "teal", "blue", "violet", "purple", "pink", "brown", "grey"] 12 | 13 | 14 | def display_tagname(value): 15 | try: 16 | return value.tag.tag 17 | except Exception as e: 18 | return "None" 19 | 20 | 21 | def load_template(name): 22 | file_loader = FileSystemLoader('app/templates') 23 | env = Environment(loader=file_loader) 24 | env.filters['display_tagname'] = display_tagname 25 | return env.get_template(name) 26 | 27 | 28 | class SearchResource(object): 29 | def on_get(self, req, resp): 30 | template = load_template('search_results.html') 31 | error = "" 32 | urls = list() 33 | 34 | if ":" not in req.params['search']: 35 | error = "Error: No valid search pattern!

Examples:
  • url:hp.com
  • sha256:da3b8d283051c5615f359e376c0d908e6d0539bceed19e6a5667a27d01bf9fef
  • yara:protected_webshell
  • server:nginx
" 36 | else: 37 | search_arr = req.params['search'].split(":") 38 | key = search_arr[0] 39 | value = "".join(search_arr[1:]) 40 | 41 | if key == "sha256": 42 | urls = Url.select().where(Url.sha256 == value) 43 | elif key == "title": 44 | urls = Url.select().where(Url.title.contains(value)) 45 | elif key == "url": 46 | urls = Url.select().where(Url.url.contains(value)) 47 | elif key == "tag": 48 | urls = (Url.select().join(Domain).join(DomainTag).join(Tag).where(Tag.tag == value)) 49 | else: 50 | urls = (Url.select().join(Extension).where((Extension.key == key) & (Extension.value.contains(value)))) 51 | 52 | resp.status = falcon.HTTP_200 53 | resp.content_type = 'text/html' 54 | resp.body = template.render(error=error, urls=urls) 55 | 56 | 57 | class DashboardResource(object): 58 | # TODO: Create useful charts as dashboard and show stats. 59 | 60 | def on_get(self, req, resp): 61 | template = load_template('dashboard.html') 62 | 63 | domains = Domain.select().count() 64 | urls = Url.select().count() 65 | 66 | tags = DomainTag.select(DomainTag.tag, fn.COUNT(DomainTag.tag).alias('count')).group_by(DomainTag.tag).order_by(fn.COUNT(DomainTag.tag).desc()).limit(5) 67 | hashes = Url.select(Url.sha256, fn.COUNT(Url.sha256).alias('count')).group_by(Url.sha256).order_by(fn.COUNT(Url.sha256).desc()).limit(5) 68 | 69 | i = 0 70 | for tag in tags: 71 | tag.color = colors[i % len(colors)] 72 | i += 1 73 | 74 | resp.status = falcon.HTTP_200 75 | resp.content_type = 'text/html' 76 | resp.body = template.render(dashboard_active='active', domains=domains, urls=urls, tags=tags, hashes=hashes) 77 | 78 | 79 | class DomainResource(object): 80 | def on_get(self, req, resp): 81 | template = load_template('domains.html') 82 | domains = Domain.select() 83 | 84 | resp.status = falcon.HTTP_200 85 | resp.content_type = 'text/html' 86 | resp.body = template.render(domains_active='active', domains=domains) 87 | 88 | 89 | class DomainDetailsResource(object): 90 | def on_delete(self, req, resp, did): 91 | domain = Domain.get(Domain.id == did) 92 | 93 | urls = Url.select().where(Url.domain == domain) 94 | for u in urls: 95 | ext_query = Extension.delete().where(Extension.url == u) 96 | ext_query.execute() 97 | 98 | query = Url.delete().where(Url.domain == domain) 99 | query.execute() 100 | 101 | query_domtag = DomainTag.delete().where(DomainTag.domain == domain) 102 | query_domtag.execute() 103 | 104 | domain.delete_instance() 105 | 106 | template = load_template('domains.html') 107 | domains = Domain.select() 108 | resp.status = falcon.HTTP_200 109 | resp.content_type = 'text/html' 110 | resp.body = template.render(domains_active='active', domains=domains) 111 | 112 | def on_get(self, req, resp, did): 113 | template = load_template('domain_details.html') 114 | domain = Domain.get(Domain.id == did) 115 | urls = Url.select().where(Url.domain == domain) 116 | tags = (Tag.select().join(DomainTag).join(Domain).where(Domain.id == did)) 117 | 118 | i = 0 119 | for tag in tags: 120 | tag.color = colors[i % len(colors)] 121 | i += 1 122 | 123 | resp.status = falcon.HTTP_200 124 | resp.content_type = 'text/html' 125 | resp.body = template.render(domain=domain, urls=urls, tags=tags) 126 | 127 | def on_post(self, req, resp, did): 128 | if "delete" in req.params: 129 | self.on_delete(req, resp, did) 130 | return 131 | template = load_template('domain_details.html') 132 | 133 | domain = Domain.get(Domain.id == did) 134 | domain.description = req.params['description'] 135 | domain.save() 136 | 137 | urls = Url.select().where(Url.domain 
== domain) 138 | tags = (Tag.select().join(DomainTag).join(Domain).where(Domain.id == did)) 139 | 140 | i = 0 141 | for tag in tags: 142 | tag.color = colors[i % len(colors)] 143 | i += 1 144 | 145 | resp.status = falcon.HTTP_200 146 | resp.content_type = 'text/html' 147 | resp.body = template.render(domain=domain, urls=urls, tags=tags) 148 | 149 | 150 | class UrlResource(object): 151 | def on_get(self, req, resp): 152 | template = load_template('urls.html') 153 | urls = Url.select() 154 | 155 | resp.status = falcon.HTTP_200 156 | resp.content_type = 'text/html' 157 | resp.body = template.render(urls_active='active', urls=urls) 158 | 159 | 160 | class UrlDetailsResource(object): 161 | def on_delete(self, req, resp, uid): 162 | url = Url.get(Url.id == uid) 163 | 164 | ext_query = Extension.delete().where(Extension.url == url) 165 | ext_query.execute() 166 | 167 | url.delete_instance() 168 | 169 | template = load_template('urls.html') 170 | urls = Url.select() 171 | resp.status = falcon.HTTP_200 172 | resp.content_type = 'text/html' 173 | resp.body = template.render(urls_active='active', urls=urls) 174 | 175 | def on_get(self, req, resp, uid): 176 | template = load_template('url_details.html') 177 | url = Url.get(Url.id == uid) 178 | extensions = Extension.select().where(Extension.url == url) 179 | 180 | resp.status = falcon.HTTP_200 181 | resp.content_type = 'text/html' 182 | resp.body = template.render(url=url, extensions=extensions) 183 | 184 | def on_post(self, req, resp, uid): 185 | if "delete" in req.params: 186 | self.on_delete(req, resp, uid) 187 | return 188 | template = load_template('url_details.html') 189 | 190 | url = Url.get(Url.id == uid) 191 | extensions = Extension.select().where(Extension.url == url) 192 | 193 | resp.status = falcon.HTTP_200 194 | resp.content_type = 'text/html' 195 | resp.body = template.render(url=url) 196 | 197 | 198 | # api initialization 199 | app = falcon.API() 200 | app.req_options.auto_parse_form_urlencoded = True 201 | dashboard = DashboardResource() 202 | domains = DomainResource() 203 | domain_details = DomainDetailsResource() 204 | urls = UrlResource() 205 | url_details = UrlDetailsResource() 206 | search = SearchResource() 207 | 208 | app.add_route('/', dashboard) 209 | app.add_route('/domain', domains) 210 | app.add_route('/domain/{did:int}', domain_details) 211 | 212 | app.add_route('/url', urls) 213 | app.add_route('/url/{uid:int}', url_details) 214 | 215 | app.add_route('/search', search) 216 | -------------------------------------------------------------------------------- /crawler/processing/external_intel_processing.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import magic 4 | import requests 5 | import json 6 | from utils import SubCrawlColors, SubCrawlHelpers 7 | 8 | from .default_processing import DefaultProcessing 9 | 10 | 11 | class ExternalIntelProcessing(DefaultProcessing): 12 | 13 | cfg = None 14 | logger = None 15 | vt_api = None 16 | urlhaus_api = None 17 | bazaar_api = None 18 | submit_urlhaus = False 19 | submit_bazaar = False 20 | 21 | vt_api_url = "https://www.virustotal.com/api/v3/files/" 22 | urlhaus_api_url = "https://urlhaus-api.abuse.ch/v1/payload/" 23 | urlhaus_api_submit = "https://urlhaus.abuse.ch/api/" 24 | bazaar_api_url = "https://mb-api.abuse.ch/api/v1/" 25 | 26 | def __init__(self, config, logger): 27 | self.cfg = config 28 | self.logger = logger 29 | 30 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "vt_api"): 31 | 
self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] VirusTotal API Key not set' + 32 | SubCrawlColors.RESET) 33 | else: 34 | self.vt_api = SubCrawlHelpers.get_config( 35 | self.cfg, "external_intel", "vt_api") 36 | 37 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "urlhaus_api"): 38 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] URLHaus API Key not set' + 39 | SubCrawlColors.RESET) 40 | else: 41 | self.urlhaus_api = SubCrawlHelpers.get_config(self.cfg, "external_intel", "urlhaus_api") 42 | 43 | if "<" in SubCrawlHelpers.get_config(self.cfg, "external_intel", "bazaar_api"): 44 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Bazaar API Key not set' + 45 | SubCrawlColors.RESET) 46 | else: 47 | self.bazaar_api = SubCrawlHelpers.get_config(self.cfg, "external_intel", "bazaar_api") 48 | 49 | self.submit_urlhaus = SubCrawlHelpers.get_config(self.cfg, "external_intel", "submit_urlhaus") 50 | if not self.submit_urlhaus: 51 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Not uploading to URLHaus' + SubCrawlColors.RESET) 52 | 53 | self.submit_bazaar = SubCrawlHelpers.get_config(self.cfg, "external_intel", "submit_bazaar") 54 | if not self.submit_bazaar: 55 | self.logger.info(SubCrawlColors.YELLOW + '[ExternalIntel] Not uploading to Bazaar' + SubCrawlColors.RESET) 56 | 57 | 58 | def process(self, url, content): 59 | payload = {} 60 | content_match = True 61 | signature = None 62 | 63 | shasum = SubCrawlHelpers.get_sha256(content) 64 | content_magic = magic.from_buffer(content).lower() 65 | 66 | tags = [] 67 | if content_magic and any(partial in content_magic for partial in 68 | SubCrawlHelpers.get_config(self.cfg, "crawler", "pe_magics")): 69 | 70 | if "(dll)" in content_magic: 71 | tags.append("dll") 72 | else: 73 | tags.append("exe") 74 | 75 | if "x86-64" in content_magic: 76 | tags.append("x64") 77 | 78 | if "mono/.net" in content_magic: 79 | tags.append('.NET') 80 | tags.append('MSIL') 81 | 82 | if not self.urlhaus_api is None: 83 | signature = self.check_urlhaus(shasum, url, tags) 84 | 85 | if not self.bazaar_api is None: 86 | signature = self.check_bazaar(shasum, url, content, tags) 87 | 88 | if not self.vt_api is None: 89 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on VirusTotal:\t" + 90 | self.check_virustotal(shasum) + " \t\t(" + shasum + ")" + SubCrawlColors.RESET) 91 | elif content_magic and any(partial in content_magic for partial in 92 | SubCrawlHelpers.get_config(self.cfg, "crawler", "office_magics")): 93 | 94 | if "Microsoft Word" in content_magic or "Microsoft Office Word" in content_magic: 95 | tags.append("doc") 96 | elif "Microsoft Excel" in content_magic: 97 | tags.append('xls') 98 | elif "Rich Text Format" in content_magic: 99 | tags.append('rtf') 100 | elif "CDFV2 Encrypted" in content_magic: 101 | tags.append('encrypted') 102 | 103 | if not self.urlhaus_api is None: 104 | signature = self.check_urlhaus(shasum, url, tags) 105 | 106 | if not self.bazaar_api is None: 107 | signature = self.check_bazaar(shasum, url, content, tags) 108 | 109 | else: 110 | content_match = False 111 | 112 | if content_match: 113 | payload = {"hash": shasum, "url": url, "signature": signature} 114 | 115 | return payload 116 | 117 | def check_urlhaus(self, sha256, url, tags): 118 | status = SubCrawlColors.YELLOW + "NOT FOUND" + SubCrawlColors.CYAN 119 | signature = None 120 | sample_found = False 121 | post_data = {'sha256_hash': sha256} 122 | resp = requests.post(self.urlhaus_api_url, data = post_data) 123 | 124 | 
results = json.loads(resp.text) 125 | 126 | if results["query_status"] == "ok": 127 | status = "FOUND - " 128 | sample_found = True 129 | if not results['signature'] is None: 130 | status += results['signature'] 131 | signature = results['signature'] 132 | else: 133 | status += "No Signature" 134 | 135 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on URLHaus:\t" + status + "\t\t(" + sha256 + ")" + SubCrawlColors.RESET) 136 | 137 | if not sample_found and self.submit_urlhaus: 138 | self.logger.info(SubCrawlColors.PURPLE + "[ExternalIntel] Submitting file to URLHaus:\t" + url + SubCrawlColors.RESET) 139 | jsonDataURLHaus = { 140 | 'token' : self.urlhaus_api, 141 | 'anonymous' : '0', 142 | 'submission' : [ 143 | { 144 | 'url': url, 145 | 'threat': 'malware_download', 146 | 'tags': 147 | tags 148 | } 149 | ] 150 | } 151 | 152 | headers = { 153 | "Content-Type" : "application/json" 154 | } 155 | r = requests.post(self.urlhaus_api_submit, json=jsonDataURLHaus, timeout=15, headers=headers) 156 | if "inserted" in r.content.decode("utf-8"): 157 | self.logger.info(SubCrawlColors.GREEN + "[ExternalIntel] URL Submitted on URLHaus :)" + SubCrawlColors.RESET) 158 | else: 159 | self.logger.error(SubCrawlColors.RED + "[ExternalIntel] Problem Submitting URL on URLHaus :(\t" + r.content.decode("utf-8").replace("\n","") + SubCrawlColors.RESET) 160 | return signature 161 | 162 | def check_bazaar(self, sha256, url, content, tags): 163 | status = SubCrawlColors.YELLOW + "NOT FOUND" + SubCrawlColors.CYAN 164 | signature = None 165 | sample_found = False 166 | post_data = {'query':'get_info','hash': sha256} 167 | resp = requests.post(self.bazaar_api_url, data = post_data) 168 | results = json.loads(resp.text) 169 | 170 | if results["query_status"] == "ok": 171 | sig = "no sig" 172 | sample_found = True 173 | for sample in results['data']: 174 | if not sample['signature'] is None: 175 | sig = sample['signature'] 176 | signature = sample['signature'] 177 | else: 178 | sig = "No Signature" 179 | status = "FOUND - " + sig 180 | 181 | self.logger.info(SubCrawlColors.CYAN + "[ExternalIntel] File status on Bazaar:\t" + status + "\t\t(" + sha256 + ")" + SubCrawlColors.RESET) 182 | 183 | if not sample_found and self.submit_bazaar: 184 | self.logger.info(SubCrawlColors.PURPLE + "[ExternalIntel] Submitting file to Bazaar:\t" + url + SubCrawlColors.RESET) 185 | 186 | jsonDataBazaar = { 187 | 'anonymous' : '0', 188 | 'delivery_method' : 'web_download', 189 | 'tags' : 190 | tags, 191 | 'context': { 192 | 'comment' : 'Found at ' + SubCrawlHelpers.defang_url(url) + ' by #subcrawl', 193 | } 194 | } 195 | 196 | files = { 197 | 'json_data' : (None,json.dumps(jsonDataBazaar), 'application/json'), 198 | 'file' : content 199 | } 200 | headers = {'API-KEY' : self.bazaar_api } 201 | 202 | r = requests.post(self.bazaar_api_url, files=files, verify=False, headers=headers) 203 | 204 | if "inserted" in r.content.decode("utf-8"): 205 | self.logger.info(SubCrawlColors.GREEN + "[ExternalIntel] Payload Submitted on Bazaar :)" + SubCrawlColors.RESET) 206 | else: 207 | self.logger.error(SubCrawlColors.RED + "[ExternalIntel] Problem Submitting Payload on Bazaar :(\t" + r.content.decode("utf-8").replace("\n","") + SubCrawlColors.RESET) 208 | return signature 209 | 210 | def check_virustotal(self,sha256): 211 | result = "NOT FOUND" 212 | headers = {'x-apikey':self.vt_api} 213 | resp = requests.get(self.vt_api_url + sha256, headers = headers) 214 | 215 | results = json.loads(resp.text) 216 | 217 | if not "error" in results: 
218 | result = "FOUND" 219 | 220 | return result 221 | -------------------------------------------------------------------------------- /crawler/subcrawl.py: -------------------------------------------------------------------------------- 1 | # © Copyright 2021 HP Development Company, L.P. 2 | import argparse 3 | import base64 4 | import datetime 5 | import hashlib 6 | import inspect 7 | import io 8 | import json 9 | import os 10 | import re 11 | import sys 12 | import time 13 | from concurrent.futures import ProcessPoolExecutor 14 | from io import BytesIO 15 | from multiprocessing import Pool, cpu_count 16 | from urllib.parse import urljoin, urlparse 17 | 18 | import magic 19 | import requests 20 | import yaml 21 | from bs4 import BeautifulSoup 22 | from mergedeep import Strategy, merge 23 | from requests.packages.urllib3.exceptions import InsecureRequestWarning 24 | 25 | from processing import * 26 | from storage import * 27 | from utils import (SubCrawlBanner, SubCrawlColors, SubCrawlHelpers, 28 | SubCrawlLogger, SubCrawlLoggerLevels) 29 | 30 | try: 31 | from kafka import KafkaConsumer 32 | consumer = KafkaConsumer( 33 | 'urls', 34 | bootstrap_servers=['kafka:9092'], 35 | auto_offset_reset='earliest', 36 | enable_auto_commit=True, 37 | group_id='urls-crawler', 38 | auto_commit_interval_ms=1000, 39 | consumer_timeout_ms=2000, 40 | value_deserializer=lambda x: json.loads(x.decode('utf-8'))) 41 | except: 42 | consumer = None 43 | 44 | # region global variables and configs 45 | 46 | # ignore TLS cert errors 47 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning) 48 | 49 | process_pool = None 50 | 51 | logger = None 52 | global_cfg = None # used in the main process 53 | process_cfg = None # used in the scraper processes 54 | process_processing_modules = None # used in the scraper process 55 | 56 | init_pages = [] # initial found pages by splitting the url 57 | crawl_pages = [] # found pages by scraping the initial urls 58 | 59 | storage_modules = [] 60 | processing_modules = [] 61 | 62 | # endregion 63 | 64 | 65 | def initialize(): 66 | global logger, global_cfg, process_pool 67 | 68 | with open("config.yml", "r") as ymlfile: 69 | global_cfg = yaml.safe_load(ymlfile) 70 | 71 | if not global_cfg: 72 | print('[!] 
Error loading configuration file, engine could not start') 73 | sys.exit(0) 74 | 75 | logger = SubCrawlLogger("subcrawl.log", "SubCrawl", 76 | SubCrawlLoggerLevels[SubCrawlHelpers.get_config( 77 | global_cfg, 'crawler', 78 | 'log_level').upper()].value).get_logger() 79 | 80 | 81 | def main(argv): 82 | 83 | banner = SubCrawlBanner(SubCrawlHelpers.get_config( 84 | global_cfg, "crawler", "logos_path"), 85 | SubCrawlHelpers.get_config(global_cfg, 86 | "crawler", "tag_line")) 87 | banner.print_banner() 88 | 89 | options = setup_args(argv) 90 | 91 | start_time = datetime.datetime.now() 92 | 93 | # region process storage/payload modules 94 | 95 | str_storage_modules = list() 96 | if options.storage_modules: 97 | for storage_module in options.storage_modules.split(","): 98 | str_storage_modules.append(storage_module) 99 | else: 100 | for storage_module in SubCrawlHelpers.get_config(global_cfg, "crawler", 101 | "storage_modules"): 102 | str_storage_modules.append(storage_module) 103 | 104 | for storage_module in str_storage_modules: 105 | try: 106 | dynamic_class = str2Class(storage_module.strip()) 107 | storage_modules.append(dynamic_class(global_cfg, logger)) 108 | logger.info("[ENGINE] Loaded storage module: " + storage_module) 109 | except Exception as e: 110 | logger.error("[ENGINE] Error loading storage module: " + storage_module) 111 | 112 | str_processing_modules = list() 113 | if options.processing_modules: 114 | for processing_module in options.processing_modules.split(","): 115 | str_processing_modules.append(processing_module) 116 | else: 117 | for processing_module in SubCrawlHelpers.get_config(global_cfg, "crawler", "processing_modules"): 118 | str_processing_modules.append(str(processing_module)) 119 | 120 | for processing_module in str_processing_modules: 121 | try: 122 | dynamic_class = str2Class(processing_module.strip()) 123 | processing_modules.append(dynamic_class(global_cfg, logger)) 124 | logger.info("[ENGINE] Loaded processing module: " + processing_module) 125 | except Exception as e: 126 | logger.error("[ENGINE] Error loading processing module: " + processing_module + ": " + str(e)) 127 | 128 | # endregion 129 | 130 | cpus = cpu_count() 131 | if cpus > 1: 132 | cpus = cpus - 1 133 | process_pool = ProcessPoolExecutor(cpus) 134 | 135 | scrape_urls = set() 136 | scraped_domains = set() 137 | for s_module in storage_modules: 138 | scraped_domains.update(s_module.load_scraped_domains()) 139 | 140 | logger.info("[ENGINE] Parsing input sources...") 141 | 142 | # region gather input URLs 143 | if options.kafka and consumer: 144 | logger.info("[ENGINE] Using Kafka queue for URL processing...") 145 | for message in consumer: 146 | url = message.value 147 | if SubCrawlHelpers.is_valid_url(url): 148 | parsed = urlparse(url) 149 | if parsed.netloc not in scraped_domains: 150 | parsed_url = url 151 | if not url.endswith("/"): 152 | parsed_url = remove_url_resource(url) 153 | if parsed_url: 154 | scrape_urls.add(parsed_url) 155 | scraped_domains.add(parsed.netloc) 156 | else: 157 | logger.debug("[~] Domain already added to the scanning queue: " 158 | + SubCrawlHelpers.defang_url(str(parsed.netloc))) 159 | else: 160 | logger.info("[ENGINE] Using file input for URL processing...") 161 | try: 162 | with open(options.file_path, 'r') as f: 163 | for url in f: 164 | try: 165 | url = url.strip() 166 | parsed = urlparse(url) 167 | if parsed.netloc not in scraped_domains: 168 | parsed_url = url 169 | if not url.endswith('exe') and not url.endswith("/"): 170 | parsed_url = 
remove_url_resource(url) 171 | if parsed_url: 172 | scrape_urls.add(parsed_url) 173 | scraped_domains.add(parsed.netloc) 174 | else: 175 | logger.debug("[ENGINE] Domain already added to the scanning queue: " 176 | + str(parsed.netloc)) 177 | except Exception as e: 178 | logger.error("[ENGINE] Error reading input file for URL processing: " + str(e)) 179 | except Exception as e: 180 | logger.error("[ENGINE] Error reading input file for URL processing: " + str(e)) 181 | sys.exit(-1) 182 | 183 | logger.info("[ENGINE] Found " + str(len(scrape_urls)) + " hosts to scrape") 184 | 185 | # endregion 186 | 187 | # region generate new URLs 188 | 189 | domain_urls = dict() 190 | distinct_urls = list() 191 | for start_url in scrape_urls: 192 | # This will add the full URL if it ends with an extension, then passes it along for parsing 193 | if start_url.endswith('.exe'): 194 | logger.debug("[ENGINGE] Adding EXE URL directly: " + SubCrawlHelpers.defang_url(start_url)) 195 | if start_url not in distinct_urls: 196 | distinct_urls.append(start_url) 197 | domain_urls.setdefault(parsed.netloc, []).append(start_url) 198 | start_url = remove_url_resource(start_url) 199 | 200 | parsed = urlparse(start_url) 201 | base = parsed.scheme + "://" + parsed.netloc 202 | paths = parsed.path[:-1].split('/') # remove the trailing '/' to avoid an empty path 203 | tmp_url = base 204 | 205 | if not SubCrawlHelpers.get_config(global_cfg, "crawler", "scan_simple_domains") and len(paths) == 1 and paths[0] == "": 206 | continue # don't scan simple domains. 207 | 208 | for path in paths: 209 | try: 210 | tmp_url = urljoin(tmp_url, path) + "/" 211 | tmp_url_parsed = urlparse(tmp_url) 212 | 213 | logger.debug("Generated new URL: " + SubCrawlHelpers.defang_url(tmp_url)) 214 | 215 | if tmp_url not in distinct_urls: 216 | distinct_urls.append(tmp_url) 217 | domain_urls.setdefault(parsed.netloc, []).append(tmp_url) 218 | except Exception as e: 219 | logger.debug("[ENGINE] error parsing generated url: " + str(e)) 220 | 221 | # endregion 222 | 223 | logger.info("[ENGINE] Done parsing URLs, ready to begin scraping " + str(len(domain_urls)) + " hosts and " + str(len(distinct_urls)) + " URLs... 
starting in " + str(SubCrawlHelpers.get_config(global_cfg, "crawler", "delay_execution_time")) + " seconds!") 224 | time.sleep(int(SubCrawlHelpers.get_config(global_cfg, "crawler", 225 | "delay_execution_time"))) 226 | 227 | # region crawl 228 | 229 | # used to convert url dict per domain into list of lists 230 | list_of_domains = list() 231 | for domain in domain_urls: 232 | url_list = list() 233 | for url in domain_urls[domain]: 234 | url_list.append(url) 235 | list_of_domains.append((url_list, global_cfg, processing_modules)) 236 | 237 | # batch defines amount of domains to scan before calling storage modules 238 | for batch_urls in chunks(list_of_domains, 239 | SubCrawlHelpers.get_config(global_cfg, "crawler", 240 | "batch_size")): 241 | scrape_data = [] # result data of url scraping 242 | final_crawl_pages = set() 243 | result_dicts = process_pool.map(scrape_manager, batch_urls) 244 | 245 | original = dict() 246 | for result in result_dicts: 247 | merge(original, result, strategy=Strategy.ADDITIVE) 248 | 249 | scrape_data = original["scrape_data"] if "scrape_data" in original \ 250 | else dict() 251 | crawl_pages = set(original["crawl_pages"]) if "crawl_pages" in \ 252 | original else set() 253 | final_crawl_pages.update(crawl_pages) 254 | 255 | for s_module in storage_modules: 256 | s_module.store_result(scrape_data) 257 | 258 | elapsed = datetime.datetime.now() - start_time 259 | logger.info("Execution time (D:H:M:S): %02d:%02d:%02d:%02d" % (elapsed.days, elapsed.seconds // 3600, elapsed.seconds // 60 % 60, elapsed.seconds % 60)) 260 | 261 | # endregion 262 | 263 | 264 | def scrape_manager(data): 265 | domain_urls, cfg, processing_modules = data 266 | global process_cfg 267 | global init_pages 268 | global process_processing_modules 269 | 270 | process_cfg = cfg 271 | init_pages = domain_urls 272 | process_processing_modules = processing_modules 273 | 274 | logger.debug("[ENGINE] Starting down path... 
" + SubCrawlHelpers.defang_url(domain_urls[0])) 275 | 276 | result_dicts = list() 277 | for url in domain_urls: 278 | s_data = [] 279 | scrape_result = scrape(url, s_data) 280 | result_dicts.append(scrape_result) 281 | 282 | original = dict() 283 | for result in result_dicts: 284 | if "scrape_data" in result: 285 | result["scrape_data"] = json.loads(result["scrape_data"]) 286 | merge(original, result, strategy=Strategy.ADDITIVE) 287 | 288 | return original 289 | 290 | 291 | def scrape(start_url, s_data): 292 | try: 293 | scrape_domain = dict() 294 | request_start = datetime.datetime.now() 295 | logger.debug("[ENGINE] Scanning URL: " + SubCrawlHelpers.defang_url(start_url)) 296 | resp = requests.get(start_url, timeout=SubCrawlHelpers.get_config( 297 | process_cfg, "crawler", "http_request_timeout"), 298 | headers=SubCrawlHelpers.get_config(process_cfg, "crawler", 299 | "headers"), 300 | verify=False, allow_redirects=SubCrawlHelpers.get_config(process_cfg, "crawler", 301 | "follow_redirects"),) 302 | 303 | if resp.status_code == 200: 304 | response_size_ok = True 305 | size = 0 306 | maxsize = SubCrawlHelpers.get_config(process_cfg, "crawler", 307 | "http_max_size") 308 | ctt = BytesIO() 309 | 310 | for chunk in resp.iter_content(2048): 311 | size += len(chunk) 312 | ctt.write(chunk) 313 | current_time = datetime.datetime.now() 314 | if size > maxsize or \ 315 | (current_time - request_start).total_seconds() > \ 316 | SubCrawlHelpers.get_config(process_cfg, "crawler", 317 | "http_download_timeout"): 318 | resp.close() 319 | response_size_ok = False 320 | logger.debug("[ENGINE] Response too large or download timeout: " + start_url) 321 | break 322 | 323 | if response_size_ok: 324 | content = ctt.getvalue() 325 | signature = "" 326 | title = None 327 | bs = None 328 | content_magic = "NONE" 329 | try: 330 | bs = BeautifulSoup(str(content), "html.parser") 331 | title = bs.find('title') 332 | except: 333 | bs = None 334 | content_magic = magic.from_buffer(content).lower() 335 | module_results = {} 336 | if title is not None and bs is not None\ 337 | and any(partial in title.get_text().lower() for partial in \ 338 | SubCrawlHelpers.get_config(process_cfg, "crawler", "opendir_title")): 339 | 340 | for link in bs.find_all('a'): 341 | if link.has_attr('href'): 342 | href = link.attrs['href'] 343 | if href is not None and not href.startswith("?"): 344 | next_page = urljoin(start_url, href) 345 | 346 | if next_page not in crawl_pages and next_page not in init_pages \ 347 | and not next_page.lower().endswith(tuple(SubCrawlHelpers.get_config(process_cfg, "crawler", "ext_exclude"))): 348 | logger.debug("[ENGINE] Discovered: " + SubCrawlHelpers.defang_url(next_page)) 349 | crawl_pages.append(next_page) 350 | scrape(next_page, s_data) 351 | else: 352 | for p_module in process_processing_modules: 353 | mod_res = p_module.process(start_url, content) 354 | if mod_res: 355 | module_results[type(p_module).__name__] = mod_res 356 | 357 | title = bs.select_one('title') 358 | if title: 359 | title = title.string 360 | 361 | try: 362 | text = base64.b64encode(content).decode('utf-8', errors='ignore') 363 | except Exception as e: 364 | logger.error("[ENGINE] " + str(e)) 365 | 366 | scrape_entry = { 367 | 'scraped_on': datetime.datetime.now().isoformat(), 368 | 'sha256': SubCrawlHelpers.get_sha256(content), 369 | 'url': start_url, 370 | 'content_type': content_magic, 371 | 'signature': signature, 372 | 'data': { 373 | 'text': text, 374 | 'title': title, 375 | 'resp': { 376 | 'headers': dict(resp.headers) if resp else 
'', 377 | 'status_code': resp.status_code if resp else '', 378 | }, 379 | }, 380 | "modules": {} 381 | } 382 | 383 | scrape_entry["modules"] = module_results 384 | s_data.append(scrape_entry) 385 | parsed = urlparse(start_url) 386 | scrape_domain = {parsed.netloc: s_data} 387 | 388 | except Exception as e: 389 | logger.debug("[ENGINE] " + str(e)) 390 | 391 | return {"crawl_pages": crawl_pages, "scrape_data": json.dumps(scrape_domain)} 392 | 393 | 394 | def remove_url_resource(unparsed_url): 395 | try: 396 | parsed_url = urlparse(unparsed_url) 397 | last_slash = parsed_url.path.rindex('/') 398 | return unparsed_url.replace(parsed_url.path[last_slash +1:], "") 399 | except Exception as e: 400 | logger.error("[URL_PARSER] Error with URL " + unparsed_url + str(e)) 401 | return None 402 | 403 | 404 | def chunks(lst, n): 405 | """Yield successive n-sized chunks from lst.""" 406 | for i in range(0, len(lst), n): 407 | yield lst[i:i + n] 408 | 409 | 410 | def unique_content(content): 411 | unique_dict = dict() 412 | for key in content: 413 | unique_dict[key] = set(content[key]) 414 | return unique_dict 415 | 416 | 417 | def str2Class(str): 418 | return getattr(sys.modules[__name__], str) 419 | 420 | 421 | def print_classes(): 422 | clsmembers_storage = inspect.getmembers(sys.modules["storage"], inspect.isclass) 423 | clsmembers_processing = inspect.getmembers(sys.modules["processing"], inspect.isclass) 424 | 425 | print("\n Available processing modules: ") 426 | for mod in clsmembers_processing: 427 | print(" - " + mod[0]) 428 | 429 | print("\n Available storage modules: ") 430 | for mod in clsmembers_storage: 431 | print(" - " + mod[0]) 432 | 433 | 434 | def setup_args(argv): 435 | parser = argparse.ArgumentParser(description="") 436 | 437 | parser.add_argument('-f', '--file', action="store", dest="file_path", help="Path of input URL file") 438 | 439 | parser.add_argument('-k', '--kafka', action="store_true", dest="kafka", help="Use Kafka Queue as input") 440 | 441 | parser.add_argument('-p', '--processing', action="store", dest="processing_modules", help="Processing modules to be executed comma separated.") 442 | 443 | parser.add_argument('-s', '--storage', action="store", dest="storage_modules", help="Storage modules to be executed comma separated.") 444 | 445 | if len(argv) == 0: 446 | parser.print_help() 447 | print_classes() 448 | sys.exit(0) 449 | 450 | return parser.parse_args() 451 | 452 | 453 | initialize() 454 | 455 | if __name__ == '__main__': 456 | main(sys.argv[1:]) 457 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SubCrawl 2 | 3 | SubCrawl is a framework developed by [Patrick Schläpfer](https://twitter.com/stoerchl), [Josh Stroschein](https://twitter.com/jstrosch) and [Alex Holland](https://twitter.com/cryptogramfan) of HP Inc’s [Threat Research](https://threatresearch.ext.hp.com/blog/) team. SubCrawl is designed to find, scan and analyze open directories. The framework is modular, consisting of four components: input modules, processing modules, output modules and the core crawling engine. URLs are the primary input values, which the framework parses and adds to a queuing system before crawling them. The parsing of the URLs is an important first step, as this takes a submitted URL and generates additional URLs to be crawled by removing sub-directories, one at a time until none remain. 
For example, a submitted URL of http://example.com/a/b/payload.exe also yields http://example.com/a/b/, http://example.com/a/ and http://example.com/ for crawling. This process ensures a more complete scan attempt of a web server and can lead to the discovery of additional content. Notably, SubCrawl does not use a brute-force method for discovering URLs. All the content scanned comes from the input URLs, the process of parsing the URL and discovery during crawling. When an open directory is discovered, the crawling engine extracts links from the directory for evaluation. The crawling engine determines if the link is another directory or if it is a file. Directories are added to the crawling queue, while files undergo additional analysis by the processing modules. Results are generated and stored for each scanned URL, such as the SHA256 and fuzzy hashes of the content, if an open directory was found, or matches against YARA rules. Finally, the result data is processed according to one or more output modules, of which there are currently four. The first provides integration with MISP, the second simply prints the data to the console, the third stores the data in an SQLite database, and the fourth indexes it in Elasticsearch. Since the framework is modular, it is not only easy to configure which input, processing and output modules are desired, but also straightforward to develop new modules. 4 | 5 | ![Framework Architecture](images/architecture.png) 6 | _Figure 1 - SubCrawl architecture_ 7 | 8 | SubCrawl supports two different modes of operation. First, SubCrawl can be started in a run-once mode. In this mode, the user supplies the URLs to be scanned in a file where each input value is separated by a line break. The second mode of operation is service mode. In this mode, SubCrawl runs in the background and relies on the input modules to supply the URLs to be scanned. Figure 1 shows an overview of SubCrawl’s architecture. The components that are used in both modes of operation are blue, run-once mode components are yellow, and service mode components are green. 9 | 10 | ## Requirements 11 | 12 | Depending on the chosen run mode, different preconditions must be met. 13 | 14 | ### Run-Once Mode Requirements 15 | 16 | SubCrawl is written in Python 3. In addition, there are several packages that are required before running SubCrawl. To install all of them, run the following commands from the *crawler* directory: 17 | 18 | ``` 19 | $ sudo apt install build-essential 20 | $ pip3 install -r requirements.txt 21 | ``` 22 | 23 | ### Service Mode Requirements 24 | 25 | If SubCrawl is started in service mode, this is done using Docker, so the installation of Docker and Docker Compose is required. Good installation instructions for this can be found directly on the Docker.com website. 26 | - [Installing Docker Engine](https://docs.docker.com/engine/install/ubuntu/) 27 | - [Installing Docker Compose](https://docs.docker.com/compose/install/) 28 | 29 | ## Getting Help 30 | 31 | SubCrawl has built-in help through the _-h/--help_ argument or by simply executing the script without any arguments.
32 | 33 | ``` 34 | ******** ** ****** ** 35 | **////// /** **////** /** 36 | /** ** **/** ** // ****** ****** *** ** /** 37 | /*********/** /**/****** /** //**//* //////** //** * /** /** 38 | ////////**/** /**/**///**/** /** / ******* /** ***/** /** 39 | /**/** /**/** /**//** ** /** **////** /****/**** /** 40 | ******** //******/****** //****** /*** //******** ***/ ///** *** 41 | //////// ////// ///// ////// /// //////// /// /// /// 42 | ~~ Harvesting the Open Web ~~ 43 | 44 | usage: subcrawl.py [-h] [-f FILE_PATH] [-k] [-p PROCESSING_MODULES] [-s STORAGE_MODULES] 45 | 46 | optional arguments: 47 | -h, --help show this help message and exit 48 | -f FILE_PATH, --file FILE_PATH 49 | Path of input URL file 50 | -k, --kafka Use Kafka Queue as input 51 | -p PROCESSING_MODULES, --processing PROCESSING_MODULES 52 | Processing modules to be executed comma separated. 53 | -s STORAGE_MODULES, --storage STORAGE_MODULES 54 | Storage modules to be executed comma separated. 55 | 56 | Available processing modules: 57 | - ExternalIntelProcessing 58 | - ClamAVProcessing 59 | - JARMProcessing 60 | - PayloadProcessing 61 | - TLSHProcessing 62 | - YARAProcessing 63 | 64 | Available storage modules: 65 | - ElasticStorage 66 | - ConsoleStorage 67 | - MISPStorage 68 | - SqliteStorage 69 | ``` 70 | 71 | ## Run-Once Mode 72 | 73 | This mode is suitable if you want to quickly scan a manageable number of domains. For this purpose, the URLs to be scanned must be saved in a file, which then serves as input for the crawler. The following is an example of executing in run-once mode; note that the _-f_ argument is used with a path to a file. 74 | 75 | ``` 76 | python3 subcrawl.py -f urls.txt -p YARAProcessing,PayloadProcessing -s ConsoleStorage 77 | ``` 78 | 79 | ## Service Mode 80 | 81 | With the service mode, a larger number of domains can be scanned and the results saved. Based on the selected storage module, the data can then be analyzed and evaluated in more detail. To make running the service mode as easy as possible for the user, we built all the functionalities into a Docker image. In service mode, the domains to be scanned are obtained via input modules. By default, new malware and phishing URLs are downloaded from [URLhaus](https://urlhaus.abuse.ch/) and [PhishTank](https://www.phishtank.com/) and queued for scanning. The desired processing and storage modules can be entered directly in the `config.yml`. By default, the following processing modules are activated, utilizing the SQLite storage: 82 | - ClamAVProcessing 83 | - JARMProcessing 84 | - TLSHProcessing 85 | - YARAProcessing 86 | 87 | In addition to the SQLite storage module, a simple web UI was developed that allows viewing and managing the scanned domains and URLs. 88 | 89 | ![Web UI for SQLite storage module](images/webui.png) 90 | 91 | However, if this UI is not sufficient for the subsequent evaluation of the data, the MISP storage module can be activated alternatively or additionally. The corresponding settings must be made in `config.yml` under the `MISP` section. 92 | 93 | The following two commands are enough to clone the Git repository, create the Docker containers and start them directly. Afterwards, the web UI can be reached at the address `https://localhost:8000/`. Please note that once the containers have started, the input modules will begin to add URLs to the processing queue and the engine will begin crawling hosts.
94 | 95 | 96 | ``` 97 | git clone https://github.com/hpthreatresearch/subcrawl.git 98 | 99 | docker-compose up --build 100 | ``` 101 | 102 | ## SubCrawl Modules 103 | 104 | ### Input Modules 105 | 106 | Input modules are only used in service mode. If SubCrawl is started in run-once mode, a file containing the URLs to scan must be supplied. The following two input modules have been implemented. 107 | 108 | #### URLhaus 109 | 110 | [URLhaus](https://urlhaus.abuse.ch/) is a prominent web service tracking malicious URLs. The web service also provides exports containing newly detected URLs. Those malware URLs serve as perfect input to our crawler, as we mainly want to analyze malicious domains. Recently submitted URLs are retrieved, and the search results are not refined through the API request (i.e. through tags or other available parameters). The HTTP request made in this [input module](crawler/input/urlhaus.py) to the URLHaus API can be modified to further refine the results obtained. 111 | 112 | #### PhishTank 113 | 114 | [PhishTank](https://www.phishtank.com/) is a website that collects phishing URLs. Users can submit newly found phishing pages. An export with active phishing URLs can be generated and downloaded from this web service via API, so this is also an ideal collection for our crawler. 115 | 116 | ### Processing Modules 117 | 118 | SubCrawl comes with several processing modules. The processing modules all follow similar behavior in how they provide results back to the core engine. If matches are found, results are returned to the core engine and later provided to the storage modules. Below is a list of processing modules. 119 | 120 | #### External Intelligence (Abuse.ch, VirusTotal) 121 | 122 | The [ExternalIntel](https://github.com/jstrosch/subcrawl/blob/main/crawler/processing/external_intel_processing.py) processing module is used to check for the presence of a URL on URLHaus, or of a payload (via its SHA256 hash) on MalwareBazaar. If the value exists, the module will parse the response and print the family tag associated with it. Optionally, this module can be used to submit samples and URLs to each respective Abuse.ch service. This module depends on the appropriate API key being configured in the external_intel section of the [primary configuration](https://github.com/jstrosch/subcrawl/blob/main/crawler/config.yml). 123 | 124 | ![external intelligence processing module output](images/external_intel.png) 125 | 126 | #### SDHash 127 | 128 | The [SDHash](https://github.com/sdhash/sdhash) processing module is used to calculate a similarity hash of the HTTP response. The content must be at least 512 bytes to be able to successfully calculate a hash. This is probably the most complicated processing module to install, as it requires Protobuf and, depending on the target host, it must be recompiled. Therefore this processing module is deactivated by default. An already compiled version can be found in crawler/processing/minisdhash/ which requires protobuf-2.5.0 and python3.6. Those binaries were compiled on an Ubuntu 18.04.5 LTS x64.
Follow the installation instructions below: 129 | 130 | ``` 131 | # Protobuf installation 132 | > apt-get update 133 | > apt-get -y install libssl-dev libevent-pthreads-2.1-6 libomp-dev g++ 134 | > apt-get -y install autoconf automake libtool curl make g++ unzip 135 | > wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/protobuf-2.5.0.zip 136 | > unzip protobuf-2.5.0.zip 137 | > cd protobuf-2.5.0 138 | > ./configure 139 | > make 140 | > sudo make install 141 | 142 | # Python3.6 installation 143 | > apt-get install python3.6-dev 144 | > sudo ldconfig 145 | 146 | # SDHash installation 147 | > git clone https://github.com/sdhash/sdhash.git 148 | > cd sdhash 149 | > make 150 | > make install 151 | > ldconfig 152 | ``` 153 | 154 | 155 | #### JARM 156 | 157 | [JARM](https://github.com/salesforce/jarm) is a tool developed by Salesforce that fingerprints TLS connections. The JARM processing module performs a scan of the domain and returns a JARM hash with the domain to the core engine. Depending on the configuration of a web server, the TLS handshake has different properties. By calculating a hash of the attributes of this handshake, these differences can be used to track web server configurations. 158 | 159 | #### TLSH 160 | 161 | The [TLSH](https://github.com/trendmicro/tlsh) processing module is similar to the SDHash processing module and is also used to calculate a similarity hash. The advantage of TLSH is that the installation is much simpler and the minimum input size is smaller, at 50 bytes. As most webshell logins are rather small and were the focus of our research, we activated this processing module by default. 162 | 163 | #### YARA 164 | 165 | The YARA processing module is used to scan HTTP response content with YARA rules. To invoke this processing module, provide the value *YARAProcessing* as a processing module argument. For example, the following command will load the YARA processing module and produce output to the console via the ConsoleStorage storage module. 166 | 167 | ``` 168 | python3 subcrawl.py -p YARAProcessing -s ConsoleStorage 169 | ``` 170 | 171 | Currently, the YARA processing module is used to identify webshell logins and various other interesting content. YARA rules included with this project: 172 | 173 | * protected_webshell: Identifies login pages of password-protected webshells 174 | * js_webshell_tracking_script: Identifies backdoored plugins/themes that use JavaScript 175 | to notify the attacker when the webshell becomes active 176 | * open_webshell: Identifies open webshells (i.e. webshells that are not protected via login) 177 | * php_webshell_backend: Identifies PHP webshell backends used by the attacker 178 | 179 | Sample output: 180 | ![Yara processing output](images/yara-output.png) 181 | 182 | To add additional YARA rules, you can add .YAR files to the *yara-rules* folder, and then include the rule file by adding an *include* statement to *combined-rules.yar*. 183 | 184 | #### ClamAV 185 | 186 | The ClamAV processing module is used to scan HTTP response content with ClamAV. If a match is found, it is provided to the various output modules. To invoke this processing module, provide the value *ClamAVProcessing* as a processing module argument. For example, the following command will load the ClamAV processing module and produce output to the console via the ConsoleStorage storage module.
187 | 188 | ``` 189 | python3 subcrawl.py -p ClamAVProcessing -s ConsoleStorage 190 | ``` 191 | 192 | Sample output: 193 | ![ClamAV Processing Module](images/clamav-output.png) 194 | 195 | To utilize this module, ClamAV must be installed. From a terminal, install ClamAV using the APT package manager: 196 | 197 | ``` 198 | $ sudo apt-get install clamav-daemon clamav-freshclam clamav-unofficial-sigs 199 | ``` 200 | Once installed, the ClamAV update service should already be running. However, if you want to manually update using *freshclam*, ensure that the service is stopped: 201 | ``` 202 | sudo systemctl stop clamav-freshclam.service 203 | ``` 204 | And then run *freshclam* manually: 205 | ``` 206 | $ sudo freshclam 207 | ``` 208 | Finally, check the status of the ClamAV service: 209 | ``` 210 | $ sudo systemctl status clamav-daemon.service 211 | ``` 212 | If the service is not running, you can use *systemctl* to start it: 213 | ``` 214 | $ sudo systemctl start clamav-daemon.service 215 | ``` 216 | 217 | #### Payload 218 | 219 | The Payload processing module is used to identify HTTP response content using the *libmagic* library. Additionally, SubCrawl can be configured to save content of interest, such as PE files or archives. To invoke this processing module, provide the value *PayloadProcessing* as a processing module argument. For example, the following command will load the Payload processing module and produce output to the console: 220 | 221 | ``` 222 | python3 subcrawl.py -p PayloadProcessing -s ConsoleStorage 223 | ``` 224 | 225 | There are no additional dependencies for this module. 226 | 227 | Sample output: 228 | ![Payload processing output](images/payload-output.png) 229 | 230 | 231 | ### Storage Modules 232 | 233 | Storage modules are called by the SubCrawl engine after all URLs from the queue have been scanned. They were designed with two objectives in mind: first, to provide the results from scanning immediately after finishing the scan queue, and second, to enable long-term storage and analysis. Therefore we not only implemented a ConsoleStorage module but also an integration for MISP and an SQLite storage module. 234 | 235 | #### Console 236 | 237 | To quickly analyse results directly after scanning URLs, a well-formatted output is printed to the console. This output is best suited for when SubCrawl is used in run-once mode. While this approach works well for scanning single domains or generating quick output, it is unwieldy for long-term research and analysis. 238 | 239 | ![Console Storage UI](images/console-storage.png) 240 | 241 | #### Elastic 242 | 243 | Integration with an Elastic cluster is also available. Each URL, along with its data, will be indexed as an event; this includes output from other modules such as YARA. A default dashboard has also been added to help get started using this module. Updates to the _elasticsearch_ section of the configuration will need to be made, including: 244 | 245 | * Elasticsearch host (default localhost) 246 | * Port to reach Elasticsearch on (default 9200) 247 | * Index name (default subcrawl) 248 | * Archive response content - this saves the HTTP response body to disk (default False) 249 | * Archive log location - location to save response content (default log/) 250 | 251 | To use this output module, provide the value *ElasticStorage* with the _-s_ argument. 252 | 253 | #### SQLite 254 | 255 | Since the installation and configuration of MISP can be time-consuming, we implemented another module which stores the data in an SQLite database.
To present the data to the user as simply and clearly as possible, we also developed a simple web GUI. Using this web application, the scanned domains and URLs can be viewed and searched with all their attributes. Since this is only an early version, no complex comparison features have been implemented yet. 256 | 257 | ![SQLite UI](images/sqlite-storage.png) 258 | 259 | #### MISP 260 | 261 | [MISP](https://www.misp-project.org/) is an open-source threat intelligence platform with a flexible data model and API to store and analyze threat data. SubCrawl stores crawled data in MISP events, publishing one event per domain and adding any identified open directories as attributes. MISP also allows users to define tags for events and attributes. This is helpful for event comparison and link analyses. Since this was one of our primary research goals, we enriched the data from URLHaus when exporting SubCrawl’s output to MISP. URLHaus annotates its data using tags which can be used to identify a malware family or threat actor associated with a URL. For each open directory URL, the module queries locally-stored URLHaus data and adds URLHaus tags to the MISP event if they match. To avoid having a collection of unrelated attributes for each MISP event, we created a new MISP object for scanned URLs, called opendir-url. This ensures that related attributes are kept together, making it easier to get an overview of the data. 262 | 263 | ![MISP UI](images/misp-overview.png) 264 | 265 | ## Building your own Modules 266 | 267 | Templates for processing and storage modules are provided as part of the framework. 268 | 269 | ### Processing Modules 270 | 271 | Processing modules can be found under `crawler->processing`, and a sample module file, `example_processing.py`, can be found in this directory. The template provides the necessary inheritance and imports to ensure execution by the framework. The _init_ function provides for module initialization and receives an instance of the logger and the global configuration. The logger is used to provide logging information from the processing modules, as well as throughout the framework. 272 | 273 | The _process_ function is implemented to process each HTTP response. To this end, it receives the URL and the raw response content. This is where the work of the module is implemented. This function should return a dictionary with the following fields: 274 | 275 | - hash: the sha256 of the content 276 | - url: the URL the content was retrieved from 277 | - matches: any matching results from the module, for example libmagic or YARA results. 278 | 279 | A unique class name must be defined and is used to identify this module when including it via the _-p_ argument or as a default processing module in the configuration file. 280 | 281 | Finally, add an import statement in [`__init__.py`](crawler/processing/__init__.py), using your class name: 282 | 283 | ``` 284 | from ._processing import Processing 285 | ``` 286 | 287 | ### Storage Modules 288 | 289 | Storage modules can be found under `crawler->storage`, and a sample module file, `example_storage.py`, can be found in this directory. Similar to the processing modules, the _init_ function provides for module initialization and receives an instance of the logger and the global configuration. The _store_results_ function receives structured data from the engine at intervals defined by the batch size in the configuration file. Minimal sketches of both module types are shown below.
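To make the processing module template described above more concrete, the following is a minimal, hypothetical sketch of a processing module that only reports ZIP archives. The module file name, the class name and the `DefaultProcessing` base class are assumptions made for illustration; `example_processing.py` remains the authoritative template.

```
# example_zip_processing.py - hypothetical file name, for illustration only
import magic  # python-magic, already used by the bundled processing modules

from utils import SubCrawlHelpers  # framework helper used here for hashing

# Assumption: the class exported by default_processing.py is named
# DefaultProcessing; check example_processing.py for the exact base class.
from .default_processing import DefaultProcessing


class ZipProcessing(DefaultProcessing):

    def __init__(self, config, logger):
        # The engine instantiates each module with the global configuration
        # and the shared logger (see subcrawl.py above).
        self.cfg = config
        self.logger = logger

    def process(self, url, content):
        # Called once per HTTP response with the URL and the raw bytes.
        # Returning an empty value tells the engine there was no match.
        content_magic = magic.from_buffer(content).lower()
        if "zip archive" not in content_magic:
            return {}

        return {
            "hash": SubCrawlHelpers.get_sha256(content),
            "url": url,
            "matches": [content_magic]
        }
```

With the class imported in `__init__.py`, such a module could then be selected with `-p ZipProcessing` or listed under the default processing modules in `config.yml`.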
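Similarly, here is a minimal, hypothetical sketch of a storage module that appends results to a JSON Lines file. Note that the engine shown in `subcrawl.py` above calls `load_scraped_domains()` once at start-up and `store_result()` after each batch; the file name, class name and `DefaultStorage` base class are again assumptions for illustration, with `example_storage.py` being the authoritative template.

```
# example_jsonl_storage.py - hypothetical file name, for illustration only
import json

# Assumption: the class exported by default_storage.py is named
# DefaultStorage; check example_storage.py for the exact base class.
from .default_storage import DefaultStorage


class JsonLinesStorage(DefaultStorage):

    def __init__(self, config, logger):
        self.cfg = config
        self.logger = logger

    def load_scraped_domains(self):
        # The engine uses this to skip hosts that were already crawled;
        # return an iterable of domain names (empty here).
        return []

    def store_result(self, scrape_data):
        # scrape_data is a dict keyed by domain, each value being a list of
        # scrape entries (url, sha256, content_type, module results, ...)
        # as assembled in subcrawl.py. Each entry is written as one JSON line.
        with open("scrape_results.jsonl", "a") as out:
            for domain, entries in scrape_data.items():
                for entry in entries:
                    out.write(json.dumps({
                        "domain": domain,
                        "url": entry.get("url"),
                        "sha256": entry.get("sha256"),
                        "modules": entry.get("modules")
                    }) + "\n")
```

Such a module could then be selected with `-s JsonLinesStorage` after importing the class in the storage package's `__init__.py`.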
290 | 291 | A unique class name must be defined and is used to load the module when including it via the _-s_ argument or as a default processing module in the configuration file. 292 | 293 | ## Presentations and Other Resources 294 | 295 | 2021: 296 | 297 | - [BlackHat Arsenal USA](https://www.blackhat.com/us-21/arsenal/schedule/index.html#introducing-subcrawl-a-framework-for-the-analysis-and-clustering-of-hacking-tools-found-using-open-directories-24081) 298 | - [VirusBulletin Localhost - Upcoming](https://vblocalhost.com/presentations/introducing-subcrawl-a-framework-for-the-analysis-and-clustering-of-hacking-tools-found-using-open-directories) 299 | 300 | ## License 301 | SubCrawl is licensed under the MIT license 302 | -------------------------------------------------------------------------------- /crawler/storage/kibana-dashboard/overview-dashboard.ndjson: -------------------------------------------------------------------------------- 1 | {"attributes":{"fieldAttrs":"{\"crawled_on\":{\"count\":1},\"http.request.url\":{\"count\":1}}","fields":"[]","runtimeFieldMap":"{}","timeFieldName":"crawled_on","title":"subcrawl*","typeMeta":"{}"},"coreMigrationVersion":"7.17.0","id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","migrationVersion":{"index-pattern":"7.11.0"},"references":[],"type":"index-pattern","updated_at":"2021-12-09T21:28:47.683Z","version":"Wzc5MCwyXQ=="} 2 | {"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filter\":[]}"},"optionsJSON":"{\"useMargins\":true,\"syncColors\":false,\"hidePanelTitles\":false}","panelsJSON":"[{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":0,\"w\":48,\"h\":16,\"i\":\"ef96c601-3c40-4166-91a6-0041d38b29f2\"},\"panelIndex\":\"ef96c601-3c40-4166-91a6-0041d38b29f2\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsXY\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-0085efda-2a22-462d-8f0f-2ea2e0b401d5\"}],\"state\":{\"visualization\":{\"legend\":{\"isVisible\":true,\"position\":\"right\"},\"valueLabels\":\"hide\",\"fittingFunction\":\"None\",\"yLeftExtent\":{\"mode\":\"full\"},\"yRightExtent\":{\"mode\":\"full\"},\"axisTitlesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"tickLabelsVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"labelsOrientation\":{\"x\":0,\"yLeft\":0,\"yRight\":0},\"gridlinesVisibilitySettings\":{\"x\":true,\"yLeft\":true,\"yRight\":true},\"preferredSeriesType\":\"line\",\"layers\":[{\"layerId\":\"0085efda-2a22-462d-8f0f-2ea2e0b401d5\",\"accessors\":[\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\"],\"position\":\"top\",\"seriesType\":\"line\",\"showGridlines\":false,\"layerType\":\"data\",\"xAccessor\":\"4047c20f-bfcc-4a39-8735-24a54ceab82b\"}]},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"0085efda-2a22-462d-8f0f-2ea2e0b401d5\":{\"columns\":{\"4047c20f-bfcc-4a39-8735-24a54ceab82b\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1d\"}},\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"4047c20f-bfcc-4a39-8735-24a54ceab82b\",\"e1d71cfa-ed8b-427f-9734-a0a62ec7ee26\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":16,\"w\":26,\"h\":19,\"i\":\"d34b0422-8934-45c9-8b4f-a0697f8788e3\"},\"panelIndex\":\"d34b0422-8934-45c9-8b4f-a0697f8788e3\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsPie\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-12c41522-f747-410f-883f-a393b947cd19\"}],\"state\":{\"visualization\":{\"shape\":\"donut\",\"layers\":[{\"layerId\":\"12c41522-f747-410f-883f-a393b947cd19\",\"groups\":[\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\"],\"metric\":\"20a68eb5-b559-4475-8c81-f3bf76094a61\",\"numberDisplay\":\"percent\",\"categoryDisplay\":\"default\",\"legendDisplay\":\"default\",\"nestedLegend\":false,\"layerType\":\"data\"}]},\"query\":{\"query\":\"NOT http.response.body.content_magic : (\\\"empty\\\" or \\\"html*\\\" or \\\"ascii*\\\")\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"12c41522-f747-410f-883f-a393b947cd19\":{\"columns\":{\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":10,\"orderBy\":{\"type\":\"column\",\"columnId\":\"20a68eb5-b559-4475-8c81-f3bf76094a61\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"20a68eb5-b559-4475-8c81-f3bf76094a61\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"1110a1b0-07f9-4d05-8e56-afd9c49216d4\",\"20a68eb5-b559-4475-8c81-f3bf76094a61\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Overview of Activity\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":26,\"y\":16,\"w\":8,\"h\":10,\"i\":\"f275f2db-bfc8-4e39-b0ed-4b141168374c\"},\"panelIndex\":\"f275f2db-bfc8-4e39-b0ed-4b141168374c\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-36dd60e9-fb14-44c7-b298-ae035186dda3\"}],\"state\":{\"visualization\":{\"layerId\":\"36dd60e9-fb14-44c7-b298-ae035186dda3\",\"accessor\":\"192b6a30-eaca-4fa1-adae-81812804b9e9\",\"layerType\":\"data\"},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"36dd60e9-fb14-44c7-b298-ae035186dda3\":{\"columns\":{\"192b6a30-eaca-4fa1-adae-81812804b9e9\":{\"label\":\"Unique 
Hosts\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"192b6a30-eaca-4fa1-adae-81812804b9e9\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":34,\"y\":16,\"w\":7,\"h\":10,\"i\":\"3de30073-057a-453d-91df-0327dcde3840\"},\"panelIndex\":\"3de30073-057a-453d-91df-0327dcde3840\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40\"}],\"state\":{\"visualization\":{\"layerId\":\"b9253695-4d27-42ac-a159-fdc690673b40\",\"accessor\":\"246b5da5-1e3f-4920-be1f-580e536698a2\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : pe32*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"b9253695-4d27-42ac-a159-fdc690673b40\":{\"columns\":{\"246b5da5-1e3f-4920-be1f-580e536698a2\":{\"label\":\"PE\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"246b5da5-1e3f-4920-be1f-580e536698a2\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":41,\"y\":16,\"w\":7,\"h\":10,\"i\":\"0548058f-744b-46cc-876e-f3ebd94d47fc\"},\"panelIndex\":\"0548058f-744b-46cc-876e-f3ebd94d47fc\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\"}],\"state\":{\"visualization\":{\"layerId\":\"68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\",\"accessor\":\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\",\"layerType\":\"data\"},\"query\":{\"query\":\"yara_results : \\\"protected_webshell\\\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"68fdd6c3-da83-40cd-8b00-bbf44d29b4ee\":{\"columns\":{\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\":{\"label\":\"Protected 
Webshells\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"3a3f3c91-1039-42f4-96d2-c4ae0cefa306\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":26,\"y\":26,\"w\":8,\"h\":9,\"i\":\"79fe7c6f-146a-4acd-a06d-96bcf36d75c6\"},\"panelIndex\":\"79fe7c6f-146a-4acd-a06d-96bcf36d75c6\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-98b138d5-6ef9-44ce-9bae-0dd42795a47d\"}],\"state\":{\"visualization\":{\"layerId\":\"98b138d5-6ef9-44ce-9bae-0dd42795a47d\",\"accessor\":\"4635bde2-23fc-438e-8919-73cc0c2a0d32\",\"layerType\":\"data\"},\"query\":{\"query\":\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"98b138d5-6ef9-44ce-9bae-0dd42795a47d\":{\"columns\":{\"4635bde2-23fc-438e-8919-73cc0c2a0d32\":{\"label\":\"URLs\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"4635bde2-23fc-438e-8919-73cc0c2a0d32\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":34,\"y\":26,\"w\":7,\"h\":9,\"i\":\"0d48db17-ea78-484c-922c-dd26997c7dbc\"},\"panelIndex\":\"0d48db17-ea78-484c-922c-dd26997c7dbc\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40\"}],\"state\":{\"visualization\":{\"layerId\":\"b9253695-4d27-42ac-a159-fdc690673b40\",\"accessor\":\"246b5da5-1e3f-4920-be1f-580e536698a2\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : 
zip*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"b9253695-4d27-42ac-a159-fdc690673b40\":{\"columns\":{\"246b5da5-1e3f-4920-be1f-580e536698a2\":{\"label\":\"ZIPs\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\",\"customLabel\":true}},\"columnOrder\":[\"246b5da5-1e3f-4920-be1f-580e536698a2\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":41,\"y\":26,\"w\":7,\"h\":9,\"i\":\"7ae2c21a-737b-49dc-94cd-5324621f8549\"},\"panelIndex\":\"7ae2c21a-737b-49dc-94cd-5324621f8549\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsMetric\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-04e472c4-38dd-4b8f-afdd-ef598fddda94\"}],\"state\":{\"visualization\":{\"layerId\":\"04e472c4-38dd-4b8f-afdd-ef598fddda94\",\"accessor\":\"50fcc77d-0b49-4246-9838-ede31c217e3f\",\"layerType\":\"data\"},\"query\":{\"query\":\"http.response.body.content_magic.keyword : composite document * or http.response.body.content_magic.keyword : *word* or http.response.body.content_magic.keyword : *excel*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"04e472c4-38dd-4b8f-afdd-ef598fddda94\":{\"columns\":{\"50fcc77d-0b49-4246-9838-ede31c217e3f\":{\"label\":\"Office Docs\",\"dataType\":\"number\",\"operationType\":\"unique_count\",\"scale\":\"ratio\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":false,\"customLabel\":true}},\"columnOrder\":[\"50fcc77d-0b49-4246-9838-ede31c217e3f\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{}}},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":35,\"w\":48,\"h\":19,\"i\":\"ccb17e1b-56b5-4a20-9974-978e7c33e7f0\"},\"panelIndex\":\"ccb17e1b-56b5-4a20-9974-978e7c33e7f0\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\"},{\"isTransposed\":false,\"columnId\":\"ad41a585-a15a-45a4-a419-9b3d962962c3\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":174.66666666666666},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : zip*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\":{\"label\":\"Top values of 
http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"ad41a585-a15a-45a4-a419-9b3d962962c3\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\",\"ad41a585-a15a-45a4-a419-9b3d962962c3\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"ZIPs\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":89,\"w\":48,\"h\":19,\"i\":\"93d0dc66-32d1-4019-ab1a-2432f29637b6\"},\"panelIndex\":\"93d0dc66-32d1-4019-ab1a-2432f29637b6\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"c71beaa7-8c97-474d-acf1-25e7a887179f\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":174.66666666666666},{\"isTransposed\":false,\"columnId\":\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\"},{\"isTransposed\":false,\"columnId\":\"ad41a585-a15a-45a4-a419-9b3d962962c3\"},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\",\"hidden\":true}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : 
php*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"c71beaa7-8c97-474d-acf1-25e7a887179f\":{\"label\":\"Top values of http.response.body.content.sha256.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content.sha256.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":3,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"ad41a585-a15a-45a4-a419-9b3d962962c3\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"975a5cc5-7142-437a-b24f-0d2564ae2d3b\",\"ad41a585-a15a-45a4-a419-9b3d962962c3\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"c71beaa7-8c97-474d-acf1-25e7a887179f\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"PHP\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":54,\"w\":48,\"h\":19,\"i\":\"a32a8543-4c96-46dc-bfee-3aed03c2d97d\"},\"panelIndex\":\"a32a8543-4c96-46dc-bfee-3aed03c2d97d\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73\"}],\"state\":{\"visualization\":{\"layerId\":\"969461de-aa88-4fae-ac5d-bfb548452b73\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"e98609fe-29e7-4788-aab1-7f70c38c5c49\"},{\"isTransposed\":false,\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"width\":132.66666666666666},{\"isTransposed\":false,\"columnId\":\"7f2787cd-1444-4ef7-91a0-d18199296217\"},{\"isTransposed\":false,\"columnId\":\"6d3db337-d347-4a26-8a9e-ca229172b7f3\",\"width\":484},{\"isTransposed\":false,\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"}],\"sorting\":{\"columnId\":\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"direction\":\"desc\"}},\"query\":{\"query\":\"http.response.body.content_magic.keyword : pe32*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"969461de-aa88-4fae-ac5d-bfb548452b73\":{\"columns\":{\"e98609fe-29e7-4788-aab1-7f70c38c5c49\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":3,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7f2787cd-1444-4ef7-91a0-d18199296217\":{\"label\":\"Top values of http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"6d3db337-d347-4a26-8a9e-ca229172b7f3\":{\"label\":\"Top values of 
http.request.url.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}},\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"6a318dc8-a181-4ac3-8c49-45aaa24336ae\",\"7f2787cd-1444-4ef7-91a0-d18199296217\",\"6d3db337-d347-4a26-8a9e-ca229172b7f3\",\"e98609fe-29e7-4788-aab1-7f70c38c5c49\",\"b601fd7b-d3df-4aa8-ab59-28c284874d1e\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"PEs\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":0,\"y\":73,\"w\":24,\"h\":16,\"i\":\"42f79d55-ad71-463c-bcac-fd044e24454e\"},\"panelIndex\":\"42f79d55-ad71-463c-bcac-fd044e24454e\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-e4c26798-d7f1-48cc-9824-b5c0e2c3c940\"}],\"state\":{\"visualization\":{\"layerId\":\"e4c26798-d7f1-48cc-9824-b5c0e2c3c940\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"e985240d-4bec-47ae-a8fe-21c45ec993e9\"},{\"isTransposed\":false,\"columnId\":\"0ba9110d-3535-402d-a5c4-39e1a0549a89\"},{\"isTransposed\":false,\"columnId\":\"14cf9b50-057e-4d19-8d77-56126e7449be\"},{\"isTransposed\":false,\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\",\"hidden\":true}],\"sorting\":{\"columnId\":\"14cf9b50-057e-4d19-8d77-56126e7449be\",\"direction\":\"desc\"}},\"query\":{\"query\":\"yara_results : \\\"protected_webshell\\\"\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"e4c26798-d7f1-48cc-9824-b5c0e2c3c940\":{\"columns\":{\"e985240d-4bec-47ae-a8fe-21c45ec993e9\":{\"label\":\"URL\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.url.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false},\"customLabel\":true},\"0ba9110d-3535-402d-a5c4-39e1a0549a89\":{\"label\":\"SHA256\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content.sha256.keyword\",\"isBucketed\":true,\"params\":{\"size\":20,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false},\"customLabel\":true},\"14cf9b50-057e-4d19-8d77-56126e7449be\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\":{\"label\":\"Count of 
records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"}},\"columnOrder\":[\"14cf9b50-057e-4d19-8d77-56126e7449be\",\"e985240d-4bec-47ae-a8fe-21c45ec993e9\",\"0ba9110d-3535-402d-a5c4-39e1a0549a89\",\"7bc1f1d0-602c-4158-91fd-c6800fbc2e07\"],\"incompleteColumns\":{}}}}}}},\"enhancements\":{},\"hidePanelTitles\":false},\"title\":\"Protected Webshells\"},{\"version\":\"7.17.0\",\"type\":\"lens\",\"gridData\":{\"x\":24,\"y\":73,\"w\":24,\"h\":16,\"i\":\"3d1b22c5-a23a-49a2-b862-8b79177aa6c8\"},\"panelIndex\":\"3d1b22c5-a23a-49a2-b862-8b79177aa6c8\",\"embeddableConfig\":{\"attributes\":{\"title\":\"\",\"visualizationType\":\"lnsDatatable\",\"type\":\"lens\",\"references\":[{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-current-indexpattern\"},{\"type\":\"index-pattern\",\"id\":\"2ffe4bf0-5124-11ec-9a97-899d8810a3c2\",\"name\":\"indexpattern-datasource-layer-96ee832f-8e02-453c-83c8-c4cec031d5dc\"}],\"state\":{\"visualization\":{\"layerId\":\"96ee832f-8e02-453c-83c8-c4cec031d5dc\",\"layerType\":\"data\",\"columns\":[{\"isTransposed\":false,\"columnId\":\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\",\"width\":330.41666666666663},{\"isTransposed\":false,\"columnId\":\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\",\"width\":349.75},{\"isTransposed\":false,\"columnId\":\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\"},{\"isTransposed\":false,\"columnId\":\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"},{\"columnId\":\"fd8b632c-b733-4538-b2c0-53907b9e7e32\",\"isTransposed\":false}],\"sorting\":{\"columnId\":\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\",\"direction\":\"desc\"}},\"query\":{\"query\":\" http.response.body.content_magic.keyword : composite document* or http.response.body.content_magic.keyword : *word* or http.response.body.content_magic.keyword : *excel*\",\"language\":\"kuery\"},\"filters\":[],\"datasourceStates\":{\"indexpattern\":{\"layers\":{\"96ee832f-8e02-453c-83c8-c4cec031d5dc\":{\"columns\":{\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\":{\"label\":\"Top values of http.request.netloc.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.netloc.keyword\",\"isBucketed\":true,\"params\":{\"size\":30,\"orderBy\":{\"type\":\"alphabetical\",\"fallback\":false},\"orderDirection\":\"asc\",\"otherBucket\":true,\"missingBucket\":false}},\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\":{\"label\":\"Top values of http.request.path.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.request.path.keyword\",\"isBucketed\":true,\"params\":{\"size\":30,\"orderBy\":{\"type\":\"alphabetical\",\"fallback\":false},\"orderDirection\":\"asc\",\"otherBucket\":true,\"missingBucket\":false}},\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\":{\"label\":\"crawled_on\",\"dataType\":\"date\",\"operationType\":\"date_histogram\",\"sourceField\":\"crawled_on\",\"isBucketed\":true,\"scale\":\"interval\",\"params\":{\"interval\":\"1s\"}},\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\":{\"label\":\"Count of records\",\"dataType\":\"number\",\"operationType\":\"count\",\"isBucketed\":false,\"scale\":\"ratio\",\"sourceField\":\"Records\"},\"fd8b632c-b733-4538-b2c0-53907b9e7e32\":{\"label\":\"Top values of 
http.response.body.content_magic.keyword\",\"dataType\":\"string\",\"operationType\":\"terms\",\"scale\":\"ordinal\",\"sourceField\":\"http.response.body.content_magic.keyword\",\"isBucketed\":true,\"params\":{\"size\":5,\"orderBy\":{\"type\":\"column\",\"columnId\":\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"},\"orderDirection\":\"desc\",\"otherBucket\":true,\"missingBucket\":false}}},\"columnOrder\":[\"639bfd78-7bb4-47cf-abb6-9c00f2447b6e\",\"fd8b632c-b733-4538-b2c0-53907b9e7e32\",\"fcf030a4-9d9b-4f76-ba73-41766bba0a09\",\"b9da2172-b00e-4d73-95b7-21f95a6ea76d\",\"7c1b259c-2dba-4656-bf9d-98b1db9e82db\"],\"incompleteColumns\":{}}}}}}},\"hidePanelTitles\":false,\"enhancements\":{}},\"title\":\"Office Docs\"}]","timeRestore":false,"title":"Subcrawl - Main","version":1},"coreMigrationVersion":"7.17.0","id":"eaccd8b0-5382-11ec-9a97-899d8810a3c2","migrationVersion":{"dashboard":"7.17.0"},"references":[{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ef96c601-3c40-4166-91a6-0041d38b29f2:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ef96c601-3c40-4166-91a6-0041d38b29f2:indexpattern-datasource-layer-0085efda-2a22-462d-8f0f-2ea2e0b401d5","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"d34b0422-8934-45c9-8b4f-a0697f8788e3:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"d34b0422-8934-45c9-8b4f-a0697f8788e3:indexpattern-datasource-layer-12c41522-f747-410f-883f-a393b947cd19","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"f275f2db-bfc8-4e39-b0ed-4b141168374c:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"f275f2db-bfc8-4e39-b0ed-4b141168374c:indexpattern-datasource-layer-36dd60e9-fb14-44c7-b298-ae035186dda3","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3de30073-057a-453d-91df-0327dcde3840:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3de30073-057a-453d-91df-0327dcde3840:indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0548058f-744b-46cc-876e-f3ebd94d47fc:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0548058f-744b-46cc-876e-f3ebd94d47fc:indexpattern-datasource-layer-68fdd6c3-da83-40cd-8b00-bbf44d29b4ee","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"79fe7c6f-146a-4acd-a06d-96bcf36d75c6:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"79fe7c6f-146a-4acd-a06d-96bcf36d75c6:indexpattern-datasource-layer-98b138d5-6ef9-44ce-9bae-0dd42795a47d","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0d48db17-ea78-484c-922c-dd26997c7dbc:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"0d48db17-ea78-484c-922c-dd26997c7dbc:indexpattern-datasource-layer-b9253695-4d27-42ac-a159-fdc690673b40","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"7ae2c21a-737b-49dc-94cd-5324621f8549:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"7ae2c21a-737b-49dc-94cd-5324621f8549:index
pattern-datasource-layer-04e472c4-38dd-4b8f-afdd-ef598fddda94","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ccb17e1b-56b5-4a20-9974-978e7c33e7f0:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"ccb17e1b-56b5-4a20-9974-978e7c33e7f0:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"93d0dc66-32d1-4019-ab1a-2432f29637b6:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"93d0dc66-32d1-4019-ab1a-2432f29637b6:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"a32a8543-4c96-46dc-bfee-3aed03c2d97d:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"a32a8543-4c96-46dc-bfee-3aed03c2d97d:indexpattern-datasource-layer-969461de-aa88-4fae-ac5d-bfb548452b73","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"42f79d55-ad71-463c-bcac-fd044e24454e:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"42f79d55-ad71-463c-bcac-fd044e24454e:indexpattern-datasource-layer-e4c26798-d7f1-48cc-9824-b5c0e2c3c940","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3d1b22c5-a23a-49a2-b862-8b79177aa6c8:indexpattern-datasource-current-indexpattern","type":"index-pattern"},{"id":"2ffe4bf0-5124-11ec-9a97-899d8810a3c2","name":"3d1b22c5-a23a-49a2-b862-8b79177aa6c8:indexpattern-datasource-layer-96ee832f-8e02-453c-83c8-c4cec031d5dc","type":"index-pattern"}],"type":"dashboard","updated_at":"2022-02-21T21:03:10.544Z","version":"WzMwNDI5LDJd"} 3 | {"excludedObjects":[],"excludedObjectsCount":0,"exportedCount":2,"missingRefCount":0,"missingReferences":[]} --------------------------------------------------------------------------------
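Note: the file above is a standard Kibana saved-objects export (the SubCrawl index pattern plus the "Subcrawl - Main" dashboard and its Lens panels). It can be loaded either through the Kibana UI (Stack Management > Saved Objects > Import) or via the Saved Objects import API. Below is a minimal sketch of the API route, assuming a local Kibana on port 5601 with no authentication and the export file at its repository path; adjust the URL, credentials and path for your deployment.

    import requests

    # Hypothetical local Kibana endpoint; change host/port and add auth as needed.
    KIBANA_URL = "http://localhost:5601"
    EXPORT_FILE = "crawler/storage/kibana-dashboard/overview-dashboard.ndjson"

    with open(EXPORT_FILE, "rb") as fh:
        resp = requests.post(
            f"{KIBANA_URL}/api/saved_objects/_import",
            headers={"kbn-xsrf": "true"},   # required by Kibana for write API calls
            params={"overwrite": "true"},   # replace existing objects with the same IDs
            files={"file": (EXPORT_FILE.rsplit("/", 1)[-1], fh, "application/ndjson")},
        )
    resp.raise_for_status()
    print(resp.json())   # reports successCount and any missing references

The referenced index pattern (id 2ffe4bf0-5124-11ec-9a97-899d8810a3c2) is included in the export, so the dashboard should resolve its references after import once the SubCrawl Elasticsearch index contains data.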