├── core
│   ├── __init__.py
│   ├── crawl
│   │   ├── __init__.py
│   │   ├── lib
│   │   │   ├── __init__.py
│   │   │   ├── crawl_result.py
│   │   │   ├── shared.py
│   │   │   ├── urlfinder.py
│   │   │   ├── probe.py
│   │   │   └── utils.py
│   │   ├── probe
│   │   │   ├── .gitignore
│   │   │   ├── chrome_extension
│   │   │   │   ├── content.js
│   │   │   │   ├── manifest.json
│   │   │   │   ├── example_navigation_away_test_case.html
│   │   │   │   └── background.js
│   │   │   ├── package.json
│   │   │   ├── logger.js
│   │   │   ├── src
│   │   │   │   ├── constants.js
│   │   │   │   ├── utils.js
│   │   │   │   └── page-handler.js
│   │   │   ├── .eslintrc
│   │   │   └── index.js
│   │   └── crawler_thread.py
│   ├── lib
│   │   ├── __init__.py
│   │   ├── thirdparty
│   │   │   ├── __init__.py
│   │   │   ├── pysocks
│   │   │   │   ├── __init__.py
│   │   │   │   └── sockshandler.py
│   │   │   └── simhash
│   │   │       └── __init__.py
│   │   ├── exception.py
│   │   ├── shell.py
│   │   ├── request_pattern.py
│   │   ├── utils.py
│   │   ├── cookie.py
│   │   ├── request.py
│   │   ├── http_get.py
│   │   └── database.py
│   ├── scan
│   │   ├── __init__.py
│   │   ├── scanners
│   │   │   ├── __init__.py
│   │   │   ├── curl.py
│   │   │   ├── ck401.py
│   │   │   ├── sqlmap.py
│   │   │   ├── wapiti.py
│   │   │   └── arachni.py
│   │   ├── scanner.py
│   │   └── base_scanner.py
│   ├── util
│   │   ├── __init__.py
│   │   ├── utilities
│   │   │   ├── __init__.py
│   │   │   ├── lsvuln.py
│   │   │   ├── updcookie.py
│   │   │   ├── lsajax.py
│   │   │   ├── usgen.py
│   │   │   ├── login.py
│   │   │   ├── htmlreport
│   │   │   │   ├── report.html
│   │   │   │   └── style.css
│   │   │   ├── login
│   │   │   │   └── login.js
│   │   │   └── report.py
│   │   ├── base_util.py
│   │   └── util.py
│   └── constants.py
├── requirements.txt
├── .gitignore
├── requirements-dev.txt
├── .travis.yml
├── scripts
│   ├── htmlreport.py
│   └── quickscan.sh
├── tests
│   ├── lib_tests
│   │   ├── shell_tests.py
│   │   ├── request_tests.py
│   │   └── database_tests.py
│   └── crawl_tests
│       ├── probe_tests.py
│       ├── crawler_tests.py
│       └── urlfinder_tests.py
├── htcap.py
└── README.md

/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/crawl/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/scan/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/util/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/crawl/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/lib/thirdparty/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/scan/scanners/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/core/util/utilities/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | HTMLParser==0.0.2
--------------------------------------------------------------------------------
/core/lib/thirdparty/pysocks/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | tmp.py
3 | .idea
4 | *.db
5 | tmp
6 | /dist/
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | nose==1.3.7
2 | coverage==4.2
3 | mock==2.0.0
--------------------------------------------------------------------------------
/core/crawl/probe/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | /package-lock.json
3 | /yarn.lock
4 | /node_modules/
5 | /*.log
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: "python"
2 | python: "2.7"
3 | install: "pip install -r requirements.txt && pip install -r requirements-dev.txt"
4 | script: "nosetests"
--------------------------------------------------------------------------------
/core/crawl/probe/chrome_extension/content.js:
--------------------------------------------------------------------------------
1 | (function() {
2 |     'use strict';
3 |     // transmit the url received from the background page to the page
4 |     chrome.runtime.onMessage.addListener(function(msg) {
5 |         window.postMessage({from: 'javascript-probe', name: 'navigation-blocked', url: msg.url}, '*');
6 |     });
7 | })();
8 | 
--------------------------------------------------------------------------------
/core/crawl/probe/package.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "javascript-probe",
3 |     "version": "1.0.0",
4 |     "main": "index.js",
5 |     "license": "GPL-2.0",
6 |     "dependencies": {
7 |         "argparse": "1.0.9",
8 |         "puppeteer": "1.2.0",
9 |         "winston": "2.4.0"
10 |     },
11 |     "devDependencies": {
12 |         "eslint": "4.8.0"
13 |     }
14 | }
15 | 
--------------------------------------------------------------------------------
/core/crawl/probe/chrome_extension/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 |     "manifest_version": 2,
3 |     "name": "JavaScript Probe extension - Navigation blocker",
4 |     "version": "1.0",
5 |     "license": "GPL-2.0",
6 |     "permissions": [
7 |         "webRequest",
8 |         "webRequestBlocking",
9 |         "<all_urls>",
10 |         "tabs"
11 |     ],
12 |     "background": {
13 |         "persistent": true,
14 |         "scripts": [
15 |             "background.js"
16 |         ]
17 |     }
18 | }
19 | 
--------------------------------------------------------------------------------
11 | """ 12 | 13 | 14 | class NotHtmlException(Exception): 15 | pass 16 | 17 | 18 | class RedirectException(Exception): 19 | pass 20 | 21 | 22 | class ThreadExitRequestException(Exception): 23 | pass 24 | 25 | # class MalformedUrlException(Exception): 26 | # pass 27 | -------------------------------------------------------------------------------- /core/crawl/lib/crawl_result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | 14 | class CrawlResult: 15 | def __init__(self, request, found_requests=None, errors=None): 16 | self.request = request 17 | self.found_requests = found_requests if found_requests else [] 18 | self.errors = errors if errors else [] 19 | -------------------------------------------------------------------------------- /core/crawl/lib/shared.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | 14 | # TODO: make sure that only shared data are stored in this object 15 | 16 | class Shared: 17 | """ 18 | data shared between threads 19 | """ 20 | 21 | def __init__(self): 22 | pass 23 | 24 | main_condition = None 25 | th_condition = None 26 | 27 | requests = [] 28 | requests_index = 0 29 | crawl_results = [] 30 | 31 | start_url = "" 32 | start_cookies = [] 33 | end_cookies = [] 34 | allowed_domains = set() 35 | excluded_urls = set() 36 | 37 | probe_cmd = [] 38 | 39 | options = {} 40 | -------------------------------------------------------------------------------- /core/crawl/probe/chrome_extension/example_navigation_away_test_case.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | test 6 | 7 | 8 | 9 | 10 | pouet (should be blocked) 11 |

12 | pouet2 (should be blocked) 13 |

14 | pouet3 (should "navigate") 15 |

16 | 17 | 18 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /scripts/htmlreport.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """ 6 | HTCAP - beta 1 7 | Author: filippo.cavallarin@wearesegment.com 8 | 9 | This program is free software; you can redistribute it and/or modify it under 10 | the terms of the GNU General Public License as published by the Free Software 11 | Foundation; either version 2 of the License, or (at your option) any later 12 | version. 13 | """ 14 | 15 | import sys 16 | import os 17 | import sqlite3 18 | import json 19 | from urlparse import urlsplit 20 | import glob 21 | import importlib 22 | 23 | reload(sys) 24 | sys.setdefaultencoding('utf8') 25 | 26 | sys.path.append(os.path.realpath(os.path.dirname(os.path.realpath(__file__)) + os.sep + "..")) 27 | 28 | print "* WARNING: this script is here for back compatibility reasons and will be removed soon!!\n* Use 'htcap util report' instead" 29 | 30 | mod = importlib.import_module("core.util.utilities.report") 31 | run = getattr(mod, "Report") 32 | run(['report'] + sys.argv[1::]) 33 | -------------------------------------------------------------------------------- /core/crawl/probe/logger.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 'use strict'; 3 | const winston = require('winston'); 4 | 5 | let outputLogger = new winston.Logger({ 6 | transports: [ 7 | new (winston.transports.Console)( 8 | { 9 | formatter: (options) => { 10 | return options.message; 11 | }, 12 | }, 13 | ), 14 | ], 15 | exitOnError: true, 16 | }); 17 | 18 | let debugLogger = new winston.Logger({ 19 | transports: [ 20 | new (winston.transports.File)( 21 | { 22 | level: 'debug', 23 | filename: __dirname + '/debug.log', 24 | prettyPrint: true, 25 | timestamp: true, 26 | json: false, 27 | }, 28 | ), 29 | ], 30 | exceptionHandlers: [ 31 | new (winston.transports.Console)({json: false, timestamp: true, prettyPrint: true}), 32 | ], 33 | }); 34 | module.exports = {output: outputLogger, debug: debugLogger}; 35 | 36 | })(); 37 | -------------------------------------------------------------------------------- /core/scan/scanners/curl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
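curl.py above is effectively the reference scanner module: scanner.py (further down) discovers any `*.py` file in core/scan/scanners/ and instantiates the class named after the file, title-cased. As a sketch of how an extra check could plug in, here is a hypothetical `core/scan/scanners/xfo.py` that flags responses missing the `X-Frame-Options` header; the module name, vulnerability type string and header check are illustrative, while the hook methods are exactly the ones curl.py implements:

```python
# Hypothetical core/scan/scanners/xfo.py -- a sketch, not part of htcap.
from __future__ import unicode_literals

import re

from core.scan.base_scanner import BaseScanner


class Xfo(BaseScanner):
    def init(self, argv):
        return True

    def get_settings(self):
        return dict(
            request_types="link,redirect",
            num_threads=10,
            process_timeout=20,
            scanner_exe="/usr/bin/env curl"
        )

    def get_cmd(self, request, tmp_dir):
        # fetch headers only; returning False here would skip the request
        return ["-I", request.url]

    def scanner_executed(self, request, out, err, tmp_dir, cmd):
        if not re.search("^X-Frame-Options:", out, re.M):
            self.save_vulnerability(request, "xfo-missing",
                                    "X-Frame-Options header is not set")
```

If the discovery logic treats it like the bundled modules, this would run as `python htcap.py scan xfo target.db`.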
/core/util/base_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from __future__ import unicode_literals
14 | import sys
15 | import getopt
16 | 
17 | class BaseUtil:
18 | 
19 |     @staticmethod
20 |     def get_settings():
21 |         return dict(
22 |             descr = "",
23 |             optargs = '',
24 |             minargs = 0
25 |         )
26 | 
27 |     def usage(self):
28 |         return (
29 |             "%s\n"
30 |             "usage: %s\n"
31 |             % (self.get_settings()['descr'], self.utilname)
32 |         )
33 | 
34 |     def __init__(self, argv):
35 |         self.utilname = argv[0]
36 |         settings = self.get_settings()
37 | 
38 |         if len(argv) < (settings['minargs'] + 1):
39 |             print self.usage()
40 |             sys.exit(1)
41 | 
42 |         try:
43 |             opts, args = getopt.getopt(argv[1:], settings['optargs'])
44 |         except getopt.GetoptError as err:
45 |             print str(err)
46 |             sys.exit(1)
47 | 
48 |         self.main(args, opts)
--------------------------------------------------------------------------------
/core/util/utilities/lsvuln.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | import sqlite3
4 | import json
5 | 
6 | from core.util.base_util import BaseUtil
7 | 
8 | reload(sys)
9 | sys.setdefaultencoding('utf8')
10 | 
11 | class Lsvuln(BaseUtil):
12 | 
13 |     @staticmethod
14 |     def get_settings():
15 |         return dict(
16 |             descr = "List all vulnerabilities",
17 |             optargs = '',
18 |             minargs = 1
19 |         )
20 | 
21 |     def usage(self):
22 |         return (
23 |             "%s\n"
24 |             "usage: %s <dbfile> [<where-clause>]\n"
25 |             % (self.get_settings()['descr'], self.utilname)
26 |         )
27 | 
28 |     def main(self, args, opts):
29 |         qry = """
30 |             SELECT scanner,start_date,end_date,id_request,type,description FROM assessment a
31 |             INNER JOIN vulnerability av ON a.id=av.id_assessment
32 |             WHERE
33 |             %s
34 |         """
35 | 
36 |         dbfile = args[0]
37 |         where = args[1] if len(args) > 1 else "1=1"
38 | 
39 |         conn = sqlite3.connect(dbfile)
40 |         conn.row_factory = sqlite3.Row
41 | 
42 |         cur = conn.cursor()
43 |         cur.execute(qry % where)
44 |         for vuln in cur.fetchall():
45 |             print vuln['description']
46 |             print "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - "
47 | 
--------------------------------------------------------------------------------
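A hedged invocation example for the utility above: `target.db` stands for a database produced by a previous crawl/scan, and the optional second argument is interpolated verbatim into the SQL WHERE clause (so it must be trusted input):

```console
python htcap.py util lsvuln target.db "type='xss'"
```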
tests/lib_tests/shell_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import time
3 | import os
4 | import subprocess
5 | from mock import DEFAULT, patch
6 | from core.lib.shell import CommandExecutor
7 | 
8 | 
9 | class ExecutorTest(unittest.TestCase):
10 | 
11 |     @patch('core.lib.shell.subprocess')
12 |     def test_command_responds(self, mock_process):
13 |         mock_process.Popen.side_effect = lambda *args, **kwargs: time.sleep(1) or DEFAULT  # slow start; DEFAULT keeps the mocked return_value
14 |         mock_process.Popen.return_value.communicate.return_value = ('hurray', 'err')
15 | 
16 |         executor = CommandExecutor(['cmd'])
17 |         result = executor.execute(2)
18 | 
19 |         self.assertEqual(result, "hurray")
20 | 
21 | 
22 |     def test_command_timeout_with_results(self):
23 |         cmd = ['tail', '-f', os.path.realpath(__file__)]
24 |         executor = CommandExecutor(cmd, stderr=True)
25 |         result = executor.execute(1)
26 | 
27 |         self.assertIn("result", result[0])
28 | 
29 |     @patch.object(subprocess.Popen, 'communicate')
30 |     def test_command_timeout_with_errors(self, mock_comm):
31 |         mock_comm.return_value = (None, "error")
32 |         cmd = ['sleep', '10']
33 |         executor = CommandExecutor(cmd, stderr=True)
34 |         result = executor.execute(1)
35 | 
36 |         self.assertEqual("error", result[1])
--------------------------------------------------------------------------------
/core/util/util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from __future__ import unicode_literals
14 | import sys
15 | import importlib
16 | from glob import glob
17 | from core.lib.utils import *
18 | 
19 | class Util:
20 | 
21 |     def get_mod(self, path, name):
22 |         mod = importlib.import_module("%s.%s" % (path, name))
23 |         return getattr(mod, name.title())
24 | 
25 | 
26 |     def __init__(self, argv):
27 |         util = argv[0] if len(argv) >= 1 else ""
28 |         mp = "core.util.utilities"
29 |         fp = "%s%sutilities" % (getrealdir(__file__), os.sep)
30 |         utils = [os.path.basename(m).split(".")[0] for m in glob(os.path.join(fp, '[a-z]*[a-z].py'))]
31 | 
32 |         if util not in utils:
33 |             utils.sort()
34 |             print "Available utilities are:"
35 |             for u in utils:
36 |                 run = self.get_mod(mp, u)
37 |                 print " %s%s%s" % (u, " "*(20 - len(u)), run.get_settings()['descr'].split("\n")[0])
38 |             sys.exit(1)
39 | 
40 |         run = self.get_mod(mp, util)
41 |         run([util] + argv[1:])
--------------------------------------------------------------------------------
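util.py discovers utilities by globbing core/util/utilities/ and instantiating the class named after the file (title-cased), so a new utility only needs to subclass BaseUtil. A minimal hypothetical example; the file name lsreq.py, the description and the query are made up for illustration, though the `request` table and its `method`/`url` columns appear in the bundled utilities:

```python
# Hypothetical core/util/utilities/lsreq.py -- a sketch, not part of htcap.
import sqlite3

from core.util.base_util import BaseUtil


class Lsreq(BaseUtil):

    @staticmethod
    def get_settings():
        return dict(
            descr="List all crawled requests",
            optargs='',
            minargs=1
        )

    def main(self, args, opts):
        # BaseUtil.__init__ has already parsed argv into (args, opts)
        conn = sqlite3.connect(args[0])
        for method, url in conn.execute("SELECT method, url FROM request"):
            print("%s %s" % (method, url))
```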
12 | """ 13 | 14 | from __future__ import unicode_literals 15 | import sys 16 | import os 17 | import datetime 18 | import time 19 | import getopt 20 | 21 | from core.lib.utils import * 22 | from core.crawl.crawler import Crawler 23 | from core.scan.scanner import Scanner 24 | 25 | from core.util.util import Util 26 | 27 | reload(sys) 28 | sys.setdefaultencoding('utf8') 29 | 30 | 31 | def usage(): 32 | infos = get_program_infos() 33 | print ("htcap ver " + infos['version'] + "\n" 34 | "usage: htcap \n" 35 | "Commands: \n" 36 | " crawl run crawler\n" 37 | " scan run scanner\n" 38 | " util run utility\n" 39 | ) 40 | 41 | 42 | if __name__ == '__main__': 43 | 44 | if len(sys.argv) < 2: 45 | usage() 46 | sys.exit(1) 47 | 48 | elif sys.argv[1] == "crawl": 49 | crawler = Crawler(sys.argv[2:]) 50 | crawler.run() 51 | elif sys.argv[1] == "scan": 52 | Scanner(sys.argv[2:]) 53 | elif sys.argv[1] == "util": 54 | Util(sys.argv[2:]) 55 | else: 56 | usage(); 57 | sys.exit(1) 58 | 59 | sys.exit(0) 60 | -------------------------------------------------------------------------------- /core/util/utilities/updcookie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import sqlite3 4 | import json 5 | import getopt 6 | import os 7 | 8 | from core.util.base_util import BaseUtil 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | class Updcookie(BaseUtil): 14 | 15 | @staticmethod 16 | def get_settings(): 17 | return dict( 18 | descr = "Update the value of a cookie of saved requests", 19 | optargs = '', 20 | minargs = 3 21 | ) 22 | 23 | def usage(self): 24 | return ( 25 | "%s\n" 26 | "usage: %s []\n" 27 | % (self.get_settings()['descr'], self.utilname) 28 | ) 29 | 30 | 31 | def main(self, argv): 32 | qry = """ 33 | SELECT id, cookies 34 | FROM request 35 | WHERE %s 36 | """ 37 | 38 | dbfile = args[0] 39 | cname = args[1] 40 | cvalue = args[2] 41 | 42 | if not os.path.exists(dbfile): 43 | print "No such file %s" % dbfile 44 | sys.exit(1) 45 | 46 | where = args[3] if len(args) > 3 else "1=1" 47 | 48 | conn = sqlite3.connect(dbfile) 49 | conn.row_factory = sqlite3.Row 50 | 51 | cur = conn.cursor() 52 | wcur = conn.cursor() 53 | cur.execute(qry % where) 54 | pages = {} 55 | for res in cur.fetchall(): 56 | cookies = res['cookies'] 57 | if cookies: 58 | #print cookies 59 | cookies = json.loads(cookies) 60 | for cookie in cookies: 61 | if cookie['name'] == cname: 62 | cookie['value'] = cvalue 63 | wcur.execute("update request set cookies=? where id=?",(json.dumps(cookies), res['id'])) 64 | 65 | conn.commit() 66 | cur.close() 67 | wcur.close() 68 | conn.close() 69 | -------------------------------------------------------------------------------- /core/scan/scanners/ck401.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | from __future__ import unicode_literals 14 | 15 | import re 16 | from core.scan.base_scanner import BaseScanner 17 | 18 | 19 | class Ck401(BaseScanner): 20 | def init(self, argv): 21 | return True 22 | 23 | def get_settings(self): 24 | return dict( 25 | request_types = "link,redirect,xhr,form", 26 | num_threads = 10, 27 | process_timeout = 20 , 28 | scanner_exe = "/usr/bin/env curl" 29 | ) 30 | 31 | def get_cmd(self, request, tmp_dir): 32 | #cookies = ["%s=%s" % (c.name,c.value) for c in request.cookies] 33 | #cookies_str = " -H 'Cookie: %s'" % " ;".join(cookies) if len(cookies) > 0 else "" 34 | method = ["-X", "POST"] if request.method == "POST" else [] 35 | referer = ["-H", "'Referer: %s'" % request.referer] if request.referer else [] 36 | data = ["--data", "'%s'" % request.data] if request.data else [] 37 | 38 | 39 | cmd = [ "-i" ] + referer + method + data + [request.url] 40 | # print " ".join(cmd) 41 | # return False 42 | 43 | return cmd 44 | 45 | def scanner_executed(self, request, out, err, tmp_dir, cmd): 46 | if not re.search("^HTTP/1.1 401 Unauthorized", out) and not re.search("action\[login\]",out): 47 | self.save_vulnerability(request, "cross-session", " ".join(cmd)+ "\n" + out) 48 | -------------------------------------------------------------------------------- /core/crawl/lib/urlfinder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | import re 14 | from HTMLParser import HTMLParser 15 | from urlparse import urljoin, urlparse 16 | 17 | 18 | class UrlFinder: 19 | def __init__(self, html): 20 | self.html = html 21 | 22 | def get_urls(self): 23 | 24 | try: 25 | parser = UrlHTMLParser() 26 | parser.feed(self.html) 27 | except: 28 | raise 29 | 30 | return parser.urls 31 | 32 | 33 | class UrlHTMLParser(HTMLParser): 34 | def __init__(self): 35 | 36 | HTMLParser.__init__(self) 37 | self.base_url = "" 38 | self.urls = [] 39 | 40 | def handle_starttag(self, tag, attrs): 41 | # more info about the tag: https://www.w3.org/wiki/HTML/Elements/base 42 | if tag == "base": 43 | for key, val in attrs: 44 | if key == "href": 45 | self.base_url = urlparse(val.strip()).geturl() 46 | 47 | elif tag == "a": 48 | for key, val in attrs: 49 | if key == "href": 50 | if re.match("^https?://", val, re.I): 51 | self.urls.extend([val]) 52 | elif not re.match("^[a-z]+:", val, re.I) and not val.startswith("#"): 53 | self.urls.extend([urljoin(self.base_url, val)]) 54 | -------------------------------------------------------------------------------- /tests/lib_tests/request_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mock import call, patch 4 | 5 | from core.lib.request import Request 6 | 7 | 8 | class RequestTestCase(unittest.TestCase): 9 | @patch('core.lib.request.remove_tokens') 10 | def test___eq__(self, remove_tokens_mock): 11 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 12 | b = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 13 | self.assertTrue(a == b) 14 | 15 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 16 | b = Request("type2", "method1", "url1", data="data1", http_auth="auth1") 17 | self.assertFalse(a == b) 18 | 19 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 20 | b = Request("type1", "method2", "url1", data="data1", http_auth="auth1") 21 | self.assertFalse(a == b) 22 | 23 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 24 | b = Request("type1", "method1", "url2", data="data1", http_auth="auth1") 25 | self.assertFalse(a == b) 26 | 27 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 28 | b = Request("type1", "method1", "url1", data="data2", http_auth="auth1") 29 | self.assertFalse(a == b) 30 | 31 | a = Request("type1", "method1", "url1", data="data1", http_auth="auth1") 32 | b = Request("type1", "method1", "url1", data="data1", http_auth="auth2") 33 | self.assertFalse(a == b) 34 | 35 | a = Request("type1", "method1", "url1") 36 | b = None 37 | self.assertFalse(a == b) 38 | self.assertEqual(remove_tokens_mock.call_count, 0) 39 | 40 | @patch('core.lib.request.remove_tokens', return_value="some data") 41 | def test___eq__with_post(self, remove_tokens_mock): 42 | a = Request("type1", "POST", "url1", data="dataXXXX") 43 | b = Request("type1", "POST", "url1", data="dataYYYY") 44 | 45 | self.assertTrue(a == b) 46 | self.assertEqual(remove_tokens_mock.call_args_list, [call("dataXXXX"), call("dataYYYY")]) 47 | -------------------------------------------------------------------------------- /core/constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of 
the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | THSTAT_WAITING = 0
14 | THSTAT_RUNNING = 1
15 | 
16 | CRAWLSCOPE_DOMAIN = "domain"
17 | CRAWLSCOPE_DIRECTORY = "directory"
18 | CRAWLSCOPE_URL = "url"
19 | 
20 | CRAWLOUTPUT_RENAME = "rename"
21 | CRAWLOUTPUT_OVERWRITE = "overwrite"
22 | CRAWLOUTPUT_RESUME = "resume"
23 | CRAWLOUTPUT_COMPLETE = "complete"
24 | 
25 | CRAWLMODE_PASSIVE = "passive"
26 | CRAWLMODE_ACTIVE = "active"
27 | CRAWLMODE_AGGRESSIVE = "aggressive"
28 | 
29 | REQTYPE_LINK = "link"
30 | REQTYPE_XHR = "xhr"
31 | REQTYPE_WS = "websocket"
32 | REQTYPE_JSONP = "jsonp"
33 | REQTYPE_FORM = "form"
34 | REQTYPE_REDIRECT = "redirect"
35 | REQTYPE_UNKNOWN = "unknown"
36 | 
37 | ERROR_CONTENTTYPE = "contentType"
38 | ERROR_TIMEOUT = "timeout"
39 | ERROR_PROBE_TO = "probe_timeout"
40 | ERROR_FORCE_STOP = "interruptReceived"
41 | ERROR_PROBEKILLED = "probe_killed"
42 | ERROR_PROBEFAILURE = "probe_failure"
43 | ERROR_MAXREDIRECTS = "too_many_redirects"
44 | ERROR_CRAWLDEPTH = "crawler_depth_limit_reached"
45 | VULNTYPE_SQLI = "sqli"
46 | VULNTYPE_XSS = "xss"
47 | 
48 | CRAWLER_DEFAULTS = {
49 |     "process_timeout": 300,  # when lots of elements (~25000) are added dynamically it can take some time..
50 |     "num_threads": 10,
51 |     "max_redirects": 10,
52 |     "max_depth": 100,
53 |     "max_post_depth": 10,
54 |     "output_mode": CRAWLOUTPUT_RENAME,
55 |     "scope": CRAWLSCOPE_DOMAIN,
56 |     "mode": CRAWLMODE_AGGRESSIVE,
57 |     "proxy": None,
58 |     "group_qs": False,
59 |     "user_agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
60 |                   '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
61 |     "override_timeout_functions": True,
62 |     "crawl_forms": True,  # only if mode == CRAWLMODE_AGGRESSIVE
63 |     "random_seed": "",
64 |     "use_urllib_onerror": True,
65 |     "set_referer": True,
66 | }
--------------------------------------------------------------------------------
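The tests further down (tests/crawl_tests/probe_tests.py) seed the crawler's shared option dict from CRAWLER_DEFAULTS; a condensed sketch of that pattern, not htcap's actual startup code (crawler.py is not included in this dump):

```python
from core.constants import CRAWLER_DEFAULTS, CRAWLSCOPE_DIRECTORY
from core.crawl.lib.shared import Shared

# start from the defaults, then override whatever the command line changed
Shared.options.update(CRAWLER_DEFAULTS)
Shared.options['scope'] = CRAWLSCOPE_DIRECTORY
Shared.options['num_threads'] = 5
```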
/core/lib/shell.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | import subprocess
14 | import sys
15 | import threading
16 | import time
17 | 
18 | 
19 | class CommandExecutor:
20 |     """
21 |     Executes a shell command and returns its output.
22 |     The process is killed after <timeout> seconds.
23 |     """
24 | 
25 |     def __init__(self, cmd, stderr=False):
26 |         # self.cmd = cmd
27 |         self.cmd = [c.encode("utf-8") for c in cmd]
28 |         self.stderr = stderr
29 |         self.out = None
30 |         self.err = None
31 |         self.process = None
32 |         self.thread = None
33 |         self.result = None
34 | 
35 |     def close(self, kill_timeout):
36 |         tries = 0
37 |         self.process.terminate()
38 |         while tries < kill_timeout:
39 |             if self.process.poll() is not None:
40 |                 return
41 |             else:
42 |                 time.sleep(1)
43 |                 tries += 1
44 |         self.process.kill()
45 |         self.thread.join()
46 |         self.out = None
47 |         self.err = "Executor: execution timeout"
48 | 
49 |     def execute(self, timeout):
50 | 
51 |         def executor():
52 |             try:
53 |                 # close_fds=True is needed in threaded programs
54 | 
55 |                 self.process = subprocess.Popen(self.cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, bufsize=0,
56 |                                                 close_fds=sys.platform != "win32")
57 |                 self.out, self.err = self.process.communicate()
58 |             except Exception as e:
59 |                 raise
60 | 
61 |         self.thread = threading.Thread(target=executor)
62 |         self.thread.start()
63 | 
64 |         self.thread.join(int(timeout))
65 | 
66 |         if self.thread.is_alive():
67 |             self.close(5)
68 | 
69 |         return self.out if not self.stderr else (self.out, self.err)
--------------------------------------------------------------------------------
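Typical use of CommandExecutor, matching how tests/lib_tests/shell_tests.py drives it: execute() returns stdout only by default, or an (out, err) pair when stderr=True, and on timeout out is None and err is the "Executor: execution timeout" string. The curl command line here is just an example:

```python
from core.lib.shell import CommandExecutor

executor = CommandExecutor(["curl", "-I", "http://example.com"], stderr=True)
out, err = executor.execute(30)  # kill the process after 30 seconds
if out is None:
    print("command failed: %s" % err)
```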
/core/crawl/lib/probe.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from core.constants import *
14 | from core.lib.cookie import Cookie
15 | from core.lib.request import Request
16 | 
17 | 
18 | class Probe:
19 |     def __init__(self, data, parent):
20 |         self.status = "ok"
21 |         self.requests = []
22 |         self.cookies = []
23 |         self.redirect = None
24 |         # if True the probe returned no error BUT the json is not closed properly
25 |         self.partialcontent = False
26 |         self.user_output = []
27 | 
28 |         status = data.pop()
29 | 
30 |         if status['status'] == "error":
31 |             self.status = "error"
32 |             self.errcode = status['code']
33 | 
34 |         if "partialcontent" in status:
35 |             self.partialcontent = status['partialcontent']
36 | 
37 |         # grab cookies before creating requests
38 |         for key, val in data:
39 |             if key == "cookies":
40 |                 for cookie in val:
41 |                     self.cookies.append(Cookie(cookie, parent.url))
42 | 
43 |         if "redirect" in status:
44 |             self.redirect = status['redirect']
45 |             r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent, set_cookie=self.cookies,
46 |                         parent_db_id=parent.db_id)
47 |             self.requests.append(r)
48 | 
49 |         for key, val in data:
50 |             if key == "request":
51 |                 trigger = val['trigger'] if 'trigger' in val else None
52 |                 r = Request(val['type'], val['method'], val['url'], parent=parent, set_cookie=self.cookies,
53 |                             data=val['data'], trigger=trigger, parent_db_id=parent.db_id)
54 |                 self.requests.append(r)
55 |             elif key == "user":
56 |                 self.user_output.append(val)
57 | 
58 | 
59 | 
60 | # @TODO handle cookies set by ajax (in probe too)
--------------------------------------------------------------------------------
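The shape of the `data` array Probe expects, inferred from the parser above: a list of [key, value] pairs with a status object as the last element (the constructor pops it before iterating). The field names are the ones the code reads; the URLs and cookie fields are made up, the cookie dict is a guess at what core.lib.cookie.Cookie accepts, and a bare Request is assumed to default its db_id:

```python
from core.crawl.lib.probe import Probe
from core.lib.request import Request

parent = Request("link", "GET", "http://example.com/")
data = [
    ["cookies", [{"name": "session", "value": "abc"}]],
    ["request", {"type": "xhr", "method": "POST",
                 "url": "http://example.com/api", "data": "q=1"}],
    {"status": "ok"},
]
probe = Probe(data, parent)  # probe.requests now holds one xhr Request
```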
/README.md:
--------------------------------------------------------------------------------
1 | ## HTCAP
2 | 
3 | Htcap is a web application scanner able to crawl single-page applications (SPAs) recursively by intercepting XHR calls and DOM changes.
4 | Htcap is not just another vulnerability scanner: it focuses mainly on the crawling process and uses external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration testing of modern web applications.
5 | 
6 | More info at [htcap.org](http://htcap.org).
7 | 
8 | ### Differences from the upstream version
9 | 
10 | * Use Chrome + Puppeteer instead of PhantomJS as the crawl engine
11 | * Add an option to restart/complete a crawl
12 | * Rewrite the injected code of the JavaScript crawler to take the [JavaScript event loop](https://www.youtube.com/watch?v=8aGhZQkoFbQ) into account (i.e. JavaScript is async, so stop relying on `setTimeout` calls) and make use of a [DOM mutation observer](https://developer.mozilla.org/en-US/docs/Web/API/MutationObserver)
13 | * Drop the flimsily supported "custom user script" feature in the crawler
14 | * Add unit tests for the crawler
15 | * Mainly, this fixes (among others) issues [#9](https://github.com/segment-srl/htcap/issues/9), [#11](https://github.com/segment-srl/htcap/issues/11), [#16](https://github.com/segment-srl/htcap/issues/16), [#19](https://github.com/segment-srl/htcap/issues/19), [#22](https://github.com/segment-srl/htcap/issues/22), [#23](https://github.com/segment-srl/htcap/issues/23), [#28](https://github.com/segment-srl/htcap/issues/28) and [#31](https://github.com/segment-srl/htcap/issues/31)
16 | 
17 | ## SETUP
18 | 
19 | ### Requirements
20 | 
21 | 1. Python 2.7
22 | 2. NodeJS v8.9.4 (for the crawler)
23 | 3. Sqlmap (for the sqlmap scanner module)
24 | 4. Arachni (for the arachni scanner module)
25 | 
26 | ### Installation
27 | 
28 | ```console
29 | git clone git@github.com:delvelabs/htcap.git htcap
30 | cd htcap
31 | pip install -r requirements.txt
32 | cd core/crawl/probe/
33 | npm install
34 | ```
35 | 
36 | ## Documentation
37 | Try `python htcap.py -h` for help
38 | 
39 | ## LICENSE
40 | 
41 | This program is free software; you can redistribute it and/or modify it under the terms of the [GNU General Public License](https://www.gnu.org/licenses/gpl-2.0.html) as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
42 | 
--------------------------------------------------------------------------------
/core/util/utilities/lsajax.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 | import sqlite3
4 | import json
5 | import os
6 | 
7 | from core.util.base_util import BaseUtil
8 | 
9 | reload(sys)
10 | sys.setdefaultencoding('utf8')
11 | 
12 | 
13 | class Lsajax(BaseUtil):
14 | 
15 |     @staticmethod
16 |     def get_settings():
17 |         return dict(
18 |             descr = "List all pages and related ajax calls",
19 |             optargs = 'd',
20 |             minargs = 1
21 |         )
22 | 
23 |     def usage(self):
24 |         return (
25 |             "usage: %s <dbfile> [<where-clause>]\n"
26 |             " Options:\n  -d   print POST data\n\n"
27 |             % self.utilname
28 |         )
29 | 
30 |     def main(self, args, opts):
31 |         qry = """
32 |             SELECT r.id, r.url as page, r.referer, a.method, a.url,a.data,a.trigger
33 |             FROM request r inner join request a on r.id=a.id_parent
34 |             WHERE (a.type='xhr')
35 |             AND
36 |             %s
37 |         """
38 | 
39 |         print_post_data = False
40 | 
41 |         for o, v in opts:
42 |             if o == '-d':
43 |                 print_post_data = True
44 | 
45 |         dbfile = args[0]
46 | 
47 |         if not os.path.exists(dbfile):
48 |             print "No such file %s" % dbfile
49 |             sys.exit(1)
50 | 
51 |         where = args[1] if len(args) > 1 else "1=1"
52 | 
53 |         conn = sqlite3.connect(dbfile)
54 |         conn.row_factory = sqlite3.Row
55 | 
56 |         cur = conn.cursor()
57 |         cur.execute(qry % where)
58 |         pages = {}
59 |         for res in cur.fetchall():
60 |             page = (res['id'], res['page'], res['referer'])
61 |             trigger = json.loads(res['trigger']) if res['trigger'] else None
62 |             trigger_str = "%s.%s() -> " % (trigger['element'], trigger['event']) if trigger else ""
63 |             data = " data: %s" % (res['data']) if print_post_data and res['data'] else ""
64 |             descr = " %s%s %s%s" % (trigger_str, res['method'], res['url'], data)
65 | 
66 |             if page in pages:
67 |                 pages[page].append(descr)
68 |             else:
69 |                 pages[page] = [descr]
70 | 
71 |         for page, ajax in pages.items():
72 |             print "Request ID: %s\nPage URL: %s\nReferer: %s\nAjax requests:" % page
73 |             for aj in ajax:
74 |                 print aj
75 |             print "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \n"
--------------------------------------------------------------------------------
tests/crawl_tests/probe_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import mock
3 | 
4 | from core.constants import CRAWLER_DEFAULTS
5 | from core.crawl.crawler import Crawler
6 | from core.crawl.crawler_thread import CrawlerThread
7 | from core.crawl.lib.shared
import Shared 8 | from core.lib.request import Request 9 | 10 | 11 | class SetProbeTest(unittest.TestCase): 12 | def setup_shared_object(self, 13 | mode=CRAWLER_DEFAULTS['mode'], 14 | timeout=CRAWLER_DEFAULTS['process_timeout'], 15 | user_agent=CRAWLER_DEFAULTS['user_agent'], 16 | proxy=CRAWLER_DEFAULTS['proxy'], 17 | seed=CRAWLER_DEFAULTS['random_seed'], 18 | override=CRAWLER_DEFAULTS['override_timeout_functions'], 19 | excluded='', 20 | ): 21 | Shared.excluded_urls = excluded 22 | Shared.options['random_seed'] = seed 23 | Shared.options['proxy'] = proxy 24 | Shared.options['mode'] = mode 25 | Shared.options['process_timeout'] = timeout 26 | Shared.options['user_agent'] = user_agent 27 | Shared.options['override_timeout_functions'] = override 28 | 29 | @mock.patch('core.crawl.crawler.get_probe_cmd', return_value=['/usr/bin/node']) 30 | def test_setting_probe_calls_node(self, mock_probe_cmd): 31 | args = ['http://example.com', 'out.txt'] 32 | crawler = Crawler(args) 33 | self.setup_shared_object() 34 | crawler._set_probe() 35 | self.assertIn("index.js", crawler._probe["cmd"][1]) 36 | self.assertIn('node', crawler._probe["cmd"][0]) 37 | 38 | @mock.patch('core.crawl.crawler.get_probe_cmd', return_value=['/usr/bin/node']) 39 | def test_set_probe_puts_proxy_in_options(self, mock_probe_cmd): 40 | args = ['http://example.com', 'out.txt'] 41 | crawler = Crawler(args) 42 | self.setup_shared_object(proxy={'proto': 'http', 'host': '254.254.254.254', 'port': '1'}) 43 | crawler._set_probe() 44 | 45 | self.assertIn('--proxy=http://254.254.254.254:1', crawler._probe["options"]) 46 | self.assertEqual(len(crawler._probe["cmd"]), 2) 47 | 48 | 49 | class SendProbeTest(unittest.TestCase): 50 | def setup_request_object(self): 51 | pass 52 | 53 | def test_set_params_for_probe(self): 54 | req = Request("type1", "POST", "http://example.com", data="example data", http_auth="auth1") 55 | Shared.options['set_referer'] = None 56 | thread = CrawlerThread() 57 | params = thread._set_probe_params(req) 58 | print(req) 59 | self.assertIn("http://example.com/", params) 60 | pass 61 | -------------------------------------------------------------------------------- /tests/crawl_tests/crawler_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mock import patch 4 | 5 | from core.constants import * 6 | from core.crawl.crawler import Crawler 7 | 8 | 9 | class CrawlerTest(unittest.TestCase): 10 | @patch('core.crawl.crawler.generate_filename', return_value='my_out_file-1') 11 | @patch('core.crawl.crawler.Database') 12 | def test__get_database_rename_outfile(self, database_mock, generate_filename_mock): 13 | db = Crawler._get_database('my_out_file', CRAWLOUTPUT_RENAME) 14 | 15 | generate_filename_mock.assert_called_once_with('my_out_file', out_file_overwrite=False) 16 | database_mock.assert_called_once_with('my_out_file-1') 17 | db.initialize.assert_called_once() 18 | 19 | @patch('core.crawl.crawler.Database') 20 | @patch('core.crawl.crawler.os.path.exists', return_value=True) 21 | @patch('core.crawl.crawler.os.path.getsize', return_value=0) 22 | @patch('core.crawl.crawler.os.remove') 23 | def test__get_database_overwrite_outfile( 24 | self, 25 | os_remove_mock, 26 | os_path_getsize_mock, 27 | os_path_exists_mock, 28 | database_mock): 29 | db = Crawler._get_database('my_out_file', CRAWLOUTPUT_OVERWRITE) 30 | 31 | os_path_getsize_mock.assert_called_with('my_out_file') 32 | os_path_exists_mock.assert_called_with('my_out_file') 33 | 
self.assertEqual(os_path_exists_mock.call_count, 3)
33 |         os_remove_mock.assert_called_once_with('my_out_file')
34 |         database_mock.assert_called_once_with('my_out_file')
35 |         db.initialize.assert_called_once()
36 | 
37 |     @patch('core.crawl.crawler.Database')
38 |     @patch('core.crawl.crawler.os.path.exists', return_value=True)
39 |     @patch('core.crawl.crawler.os.path.getsize', return_value=2)
40 |     def test__get_database_complete_outfile(self, os_path_getsize_mock, os_path_exists_mock, database_mock):
41 |         db = Crawler._get_database('my_out_file', CRAWLOUTPUT_COMPLETE)
42 | 
43 |         database_mock.assert_called_once_with('my_out_file')
44 |         os_path_getsize_mock.assert_called_with('my_out_file')
45 |         os_path_exists_mock.assert_called_with('my_out_file')
46 |         self.assertEqual(os_path_exists_mock.call_count, 2)
47 |         self.assertEqual(db.initialize.call_count, 0)
48 | 
49 |     @patch('core.crawl.crawler.Database')
50 |     @patch('core.crawl.crawler.os.path.exists', return_value=False)
51 |     def test__get_database_resume_new_outfile(self, os_path_exists_mock, database_mock):
52 |         db = Crawler._get_database('my_out_file', CRAWLOUTPUT_RESUME)
53 | 
54 |         database_mock.assert_called_once_with('my_out_file')
55 |         os_path_exists_mock.assert_called_once_with('my_out_file')
56 |         self.assertEqual(db.initialize.call_count, 1)
57 | 
--------------------------------------------------------------------------------
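The test modules above (together with the probe tests earlier) are what .travis.yml runs; locally the same suite can presumably be exercised with:

```console
pip install -r requirements-dev.txt
nosetests
```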
/scripts/quickscan.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | PY="/usr/bin/python"
4 | #THISFILE=$(readlink $0 || echo $0)
5 | CURDIR=$( cd "$(dirname "$(readlink $0 || echo $0)")" ; pwd -P )
6 | 
7 | # EXPORT="\"$CURDIR/../scripts/htmlreport.py\""
8 | # VULNS="\"$CURDIR/../scripts/vulns.py\""
9 | 
10 | HTCAP="\"$CURDIR/../htcap.py\""
11 | EXPORT="$HTCAP util report"
12 | VULNS="$HTCAP util lsvuln"
13 | yes=false
14 | requests='link,redirect,form,xhr,jsonp'
15 | cookies=""
16 | excluded=""
17 | 
18 | function yesno {
19 |     if [ $1 = false ]; then
20 |         read yesno
21 |     else
22 |         yesno="y"
23 |     fi
24 | 
25 |     echo $yesno
26 | }
27 | 
28 | if [ $# -lt 1 ];then
29 |     echo "usage "$(basename $0) "[options]" "<target url>"
30 |     echo "options:"
31 |     echo " -r   set request types (default: " $requests ")"
32 |     echo " -y   say yes to all questions"
33 |     echo " -c   set cookies"
34 |     echo " -x   set excluded urls"
35 |     exit 1
36 | fi
37 | 
38 | while getopts "r:yc:x:" opt; do
39 |     case "$opt" in
40 |         r) requests=$OPTARG
41 |         ;;
42 |         y) yes=true
43 |         ;;
44 |         c) cookies="-c '$OPTARG'"
45 |         ;;
46 |         x) excluded="-x '$OPTARG'"
47 |         ;;
48 |     esac
49 | done
50 | shift $((OPTIND-1))
51 | 
52 | HOST=$1
53 | 
54 | 
55 | OUTFILE=`echo $HOST | sed -E 's/^https?:\/\///' | sed 's/\./_/g' | sed 's/\/.*//g'`
56 | 
57 | if [ -e "$OUTFILE.db" ];then
58 |     echo -n "$OUTFILE.db already exists. Overwrite it? (y/N): " && $yes && echo "y"
59 |     if [ "$(yesno $yes)" = "y" ]; then
60 |         rm "$OUTFILE.db"
61 |     else
62 |         exit 1
63 |     fi
64 | fi
65 | 
66 | 
67 | echo $HTCAP crawl $cookies $excluded $HOST $OUTFILE.db | xargs $PY || exit 1
68 | echo -n "Run arachni? (y/N): " && $yes && echo "y"
69 | if [ "$(yesno $yes)" = "y" ]; then
70 |     echo $HTCAP scan -r $requests arachni $OUTFILE.db | xargs $PY || exit 1
71 | fi
72 | echo -n "Run sqlmap? (y/N): " && $yes && echo "y"
73 | if [ "$(yesno $yes)" = "y" ]; then
74 |     echo $HTCAP scan -r $requests sqlmap $OUTFILE.db | xargs $PY || exit 1
75 | fi
76 | echo
77 | 
78 | if [ "`echo $VULNS $OUTFILE.db | xargs $PY`" = "" ];then
79 |     echo "No vulnerabilities found"
80 | else
81 |     echo "Detected vulnerabilities:"
82 |     /bin/bash -c 'sqlite3 -version' > /dev/null 2>&1
83 |     if [ $? = 0 ]; then
84 |         echo "SELECT ' Type: ',type, ', found ',count(type) FROM vulnerability GROUP BY type ORDER BY count(type) DESC;" | sqlite3 -separator "" $OUTFILE.db
85 |     else
86 |         echo " Warning: unable to run sqlite3 command"
87 |     fi
88 |     echo
89 | fi
90 | 
91 | rm "$OUTFILE".html 2> /dev/null
92 | echo $EXPORT $OUTFILE.db $OUTFILE.html | xargs $PY
93 | 
94 | opener=""
95 | while [ "$opener" = "" ];do
96 |     echo -n "Open $OUTFILE.html with command (^C to abort): "
97 |     read opener
98 |     if [ "$opener" != "" ];then
99 |         $opener "$OUTFILE".html
100 |     fi
101 | done
102 | 
103 | exit 0
--------------------------------------------------------------------------------
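An illustrative run of the wrapper above against a hypothetical target; -y auto-answers every prompt, and the output names are derived from the host (here example_com.db and example_com.html):

```console
./scripts/quickscan.sh -y -r link,form,xhr http://example.com
```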
/core/crawl/lib/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | import posixpath
14 | import re
15 | from urlparse import urlsplit
16 | 
17 | from core.constants import *
18 | from core.crawl.lib.shared import Shared
19 | from core.lib.utils import group_qs_params
20 | 
21 | 
22 | def request_in_scope(request):
23 |     url = request.url
24 |     purl = urlsplit(url)
25 |     spurl = urlsplit(Shared.start_url)
26 |     scope = Shared.options['scope']
27 |     in_scope = False
28 | 
29 |     # check for scopes
30 |     if scope == CRAWLSCOPE_DOMAIN:
31 |         for pattern in Shared.allowed_domains:
32 |             if re.match(pattern, purl.hostname):
33 |                 in_scope = True
34 |                 break
35 | 
36 |     elif scope == CRAWLSCOPE_DIRECTORY:
37 |         if purl.hostname != spurl.hostname:
38 |             in_scope = False
39 |         else:
40 |             path = [p for p in posixpath.dirname(purl.path).split("/") if p]
41 |             spath = [p for p in posixpath.dirname(spurl.path).split("/") if p]
42 |             in_scope = path[:len(spath)] == spath
43 | 
44 |     elif scope == CRAWLSCOPE_URL:
45 |         in_scope = url == Shared.start_url
46 | 
47 |     # check for excluded urls
48 |     for pattern in Shared.excluded_urls:
49 |         if re.match(pattern, request.url):
50 |             in_scope = False
51 |             break
52 | 
53 |     return in_scope
54 | 
55 | 
56 | def adjust_requests(requests):
57 |     """
58 |     adjusts an array of requests according to the current status/settings:
59 |     1. sets the out_of_scope property
60 |     2. normalizes the url according to user settings
61 |     """
62 | 
63 |     for request in requests:
64 |         if request.type == REQTYPE_UNKNOWN or not request_in_scope(request):
65 |             request.out_of_scope = True
66 | 
67 |         if Shared.options['group_qs']:
68 |             request.url = group_qs_params(request.url)
69 | 
70 |     return requests
71 | 
72 | 
73 | def request_depth(request):
74 |     if request.parent is None:
75 |         return 1
76 | 
77 |     return 1 + request_depth(request.parent)
78 | 
79 | 
80 | def request_post_depth(request):
81 |     if request.method != "POST":
82 |         return 0
83 | 
84 |     if request.parent is None or request.parent.method != "POST":
85 |         return 1
86 | 
87 |     return 1 + request_post_depth(request.parent)
88 | 
89 | 
90 | def request_is_crawlable(request):
91 |     if request.out_of_scope:
92 |         return False
93 | 
94 |     types = [REQTYPE_LINK, REQTYPE_REDIRECT]
95 |     if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
96 |         types.append(REQTYPE_FORM)
97 | 
98 |     return request.type in types and re.match("^https?://", request.url, re.I)
--------------------------------------------------------------------------------
/core/util/utilities/usgen.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - htcap.org
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | import sys
14 | import json
15 | import os
16 | 
17 | from core.lib.utils import *
18 | from core.util.base_util import BaseUtil
19 | 
20 | 
21 | class Usgen(BaseUtil):
22 | 
23 |     @staticmethod
24 |     def get_settings():
25 |         return dict(
26 |             descr = "Generate a sample user script",
27 |             optargs = '',
28 |             minargs = 1
29 |         )
30 | 
31 |     def usage(self):
32 |         return (
33 |             "%s\n"
34 |             "usage: %s <outfile>\n"
35 |             % (self.get_settings()['descr'], self.utilname)
36 |         )
37 | 
38 | 
39 |     def main(self, args, opts):
40 |         usfile = generate_filename(args[0], 'js', False, True)
41 |         try:
42 |             with open(usfile, 'w') as f:
43 |                 f.write(CONTENT)
44 |             print "User Script saved to %s" % usfile
45 |         except Exception as e:
46 |             print "Unable to write file %s" % usfile
47 |             sys.exit(1)
48 | 
49 | 
50 | CONTENT = """/*
51 | UI Methods:
52 | ui.print(message) - save a per-request user message into the request table
53 | ui.fread(path_to_file) - read from file
54 | ui.fwrite(path_to_file, content, mode) - write to file
55 | ui.render(path_to_file) - save a screenshot of the page current state
56 | ui.triggerEvent(element, event) - trigger an event
57 | */
58 | 
59 | {
60 |     onInit: function(ui){
61 |         // override native methods
62 |         window.prompt = function(){ return "AAA" };
63 |         // init local variables
64 |         ui.vars.cnt = 0;
65 |     },
66 | 
67 |     onStart: function(ui){},
68 | 
69 |     onTriggerEvent: function(ui, element, event){
70 |         // cancel trigger if element has class kill-all
71 |         if(element.matches(".kill-all")) return false;
72 |     },
73 | 
74 |     onEventTriggered: function(ui, element, event){},
75 | 
76 |     onFillInput: function(ui, element){
77 |         // here it's possible to force a value or prevent it to be filled
78 |         // WARNING: do NOT set dynamic values!
for instance something like 79 | // element.value = Math.random() 80 | // will lead to INFINITE CRAWLING if you crawl forms 81 | 82 | if(element.id == "car_vendor"){ 83 | element.value = "Ferrari"; 84 | return false; 85 | } 86 | }, 87 | 88 | onXhr: function(ui, request){ 89 | // cancel XHR request if url matches XXX 90 | if(request.url.match(/XXX/)) 91 | return false 92 | }, 93 | 94 | onAllXhrsCompleted: function(ui){}, 95 | 96 | onDomModified: function(ui, rootElements, allElements){ 97 | // save a screenshot on every DOM change 98 | ui.render(ui.id + "-screen-" + ui.vars.cnt + ".png"); 99 | ui.vars.cnt++; 100 | }, 101 | 102 | onEnd: function(ui){} 103 | } 104 | """ 105 | 106 | -------------------------------------------------------------------------------- /core/lib/thirdparty/pysocks/sockshandler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | SocksiPy + urllib2 handler 4 | 5 | version: 0.3 6 | author: e 7 | 8 | This module provides a Handler which you can use with urllib2 to allow it to tunnel your connection through a socks.sockssocket socket, with out monkey patching the original socket... 9 | """ 10 | import ssl 11 | 12 | try: 13 | import urllib2 14 | import httplib 15 | except ImportError: # Python 3 16 | import urllib.request as urllib2 17 | import http.client as httplib 18 | 19 | import socks # $ pip install PySocks 20 | 21 | def merge_dict(a, b): 22 | d = a.copy() 23 | d.update(b) 24 | return d 25 | 26 | class SocksiPyConnection(httplib.HTTPConnection): 27 | def __init__(self, proxytype, proxyaddr, proxyport=None, rdns=True, username=None, password=None, *args, **kwargs): 28 | self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password) 29 | httplib.HTTPConnection.__init__(self, *args, **kwargs) 30 | 31 | def connect(self): 32 | self.sock = socks.socksocket() 33 | self.sock.setproxy(*self.proxyargs) 34 | if type(self.timeout) in (int, float): 35 | self.sock.settimeout(self.timeout) 36 | self.sock.connect((self.host, self.port)) 37 | 38 | class SocksiPyConnectionS(httplib.HTTPSConnection): 39 | def __init__(self, proxytype, proxyaddr, proxyport=None, rdns=True, username=None, password=None, *args, **kwargs): 40 | self.proxyargs = (proxytype, proxyaddr, proxyport, rdns, username, password) 41 | httplib.HTTPSConnection.__init__(self, *args, **kwargs) 42 | 43 | def connect(self): 44 | sock = socks.socksocket() 45 | sock.setproxy(*self.proxyargs) 46 | if type(self.timeout) in (int, float): 47 | sock.settimeout(self.timeout) 48 | sock.connect((self.host, self.port)) 49 | self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file) 50 | 51 | class SocksiPyHandler(urllib2.HTTPHandler, urllib2.HTTPSHandler): 52 | def __init__(self, *args, **kwargs): 53 | self.args = args 54 | self.kw = kwargs 55 | urllib2.HTTPHandler.__init__(self) 56 | 57 | def http_open(self, req): 58 | def build(host, port=None, timeout=0, **kwargs): 59 | kw = merge_dict(self.kw, kwargs) 60 | conn = SocksiPyConnection(*self.args, host=host, port=port, timeout=timeout, **kw) 61 | return conn 62 | return self.do_open(build, req) 63 | 64 | def https_open(self, req): 65 | def build(host, port=None, timeout=0, **kwargs): 66 | kw = merge_dict(self.kw, kwargs) 67 | conn = SocksiPyConnectionS(*self.args, host=host, port=port, timeout=timeout, **kw) 68 | return conn 69 | return self.do_open(build, req) 70 | 71 | if __name__ == "__main__": 72 | import sys 73 | try: 74 | port = int(sys.argv[1]) 75 | except (ValueError, IndexError): 76 
| port = 9050 77 | opener = urllib2.build_opener(SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, "localhost", port)) 78 | print("HTTP: " + opener.open("http://httpbin.org/ip").read().decode()) 79 | print("HTTPS: " + opener.open("https://httpbin.org/ip").read().decode()) 80 | -------------------------------------------------------------------------------- /core/scan/scanners/sqlmap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | from __future__ import unicode_literals 14 | import sys 15 | import time 16 | import re 17 | import json 18 | import base64 19 | import uuid 20 | import getopt 21 | 22 | import threading 23 | 24 | from urlparse import urlparse, urljoin, parse_qs, parse_qsl, urlsplit 25 | 26 | from core.lib.exception import * 27 | from core.lib.cookie import Cookie 28 | 29 | from core.scan.base_scanner import BaseScanner 30 | from core.lib.utils import * 31 | 32 | 33 | 34 | class Sqlmap(BaseScanner): 35 | 36 | def init(self, argv): 37 | self.skip_duplicates = True 38 | 39 | try: 40 | opts, args = getopt.getopt(argv, 'hs') 41 | except getopt.GetoptError as err: 42 | print str(err) 43 | self.exit(1) 44 | 45 | for o, v in opts: 46 | if o == '-h': 47 | self.usage() 48 | self.exit(0) 49 | elif o == '-s': 50 | self.skip_duplicates = False 51 | 52 | 53 | 54 | def usage(self): 55 | print ( "htcap sqlmap module\nusage: scan sqlmap [options]\n" 56 | "Options are:\n" 57 | " -h this help\n" 58 | " -s do not skip duplicated urls\n" 59 | ) 60 | 61 | def get_settings(self): 62 | return dict( 63 | request_types = "xhr,link,form,jsonp,redirect", 64 | num_threads = 5, 65 | process_timeout = 300, 66 | scanner_exe = "/usr/share/sqlmap/sqlmap.py" 67 | ) 68 | 69 | # return False to skip current request 70 | def get_cmd(self, request, tmp_dir): 71 | 72 | if self.skip_duplicates and self.is_request_duplicated(request): 73 | return False 74 | 75 | if request.method == "GET": 76 | purl = urlsplit(request.url) 77 | if not purl.query: 78 | return False 79 | 80 | #print request.url 81 | 82 | out_dir = tmp_dir + "/tmp" 83 | if not os.path.exists(out_dir): 84 | os.makedirs(out_dir, 0700) 85 | 86 | cookie_file = tmp_dir + "/cookies.json" 87 | with open(cookie_file,'w') as cf: 88 | for c in request.cookies: 89 | cf.write(c.get_as_netscape() + "\n") 90 | 91 | cmd = [ 92 | "--batch", 93 | "-u", request.url, 94 | "-v", "0", 95 | "--disable-coloring", 96 | "--text-only", 97 | "--purge-output", 98 | "-o", 99 | "--crawl=0", 100 | "--output-dir", out_dir 101 | ] 102 | 103 | if request.referer: 104 | cmd.extend(("--referer", request.referer)) 105 | 106 | if len(request.cookies) > 0: 107 | cmd.extend(("--load-cookies", cookie_file)) 108 | 109 | if request.method == "POST": 110 | cmd.extend(("--method","POST")) 111 | if request.data: 112 | cmd.extend(("--data",request.data)) 113 | 114 | 115 | return cmd 116 | 117 | def scanner_executed(self, request, out, err, tmp_dir, cmd): 118 | # print cmd_to_str(cmd) 119 | if not out:return 120 | 121 | descr = "C O M M A N D\n\n%s\n\nD E T A I L S\n\n" % cmd_to_str(cmd) 122 | report = re.findall(r'---([^]]*)---', out) 123 | if len(report) == 0: return 124 | for vuln in report: 
125 | descr += vuln + "\n"
126 | 
127 | self.save_vulnerability(request, "sqli", descr)
128 | 
129 | 
--------------------------------------------------------------------------------
/core/scan/scanner.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from __future__ import unicode_literals
14 | import sys
15 | import time
16 | import re
17 | import json
18 | import base64
19 | import uuid
20 | import getopt
21 | import os
22 | import glob
23 | import importlib
24 | 
25 | from core.lib.exception import *
26 | from core.lib.cookie import Cookie
27 | from core.lib.utils import *
28 | from base_scanner import BaseScanner
29 | 
30 | 
31 | class Scanner:
32 | 
33 | def __init__(self, argv):
34 | scanner_files = glob.glob(os.path.join("%s%sscanners" % (getrealdir(__file__), os.sep) , "*.py"))
35 | self.scanners = [os.path.basename(m).split(".")[0] for m in scanner_files if not m.endswith("__.py")]
36 | 
37 | num_threads = None
38 | request_types = None
39 | process_timeout = None
40 | display_progress = True
41 | scanner_exe = None
42 | 
43 | try:
44 | opts, args = getopt.getopt(argv, 'hn:t:r:qe:')
45 | except getopt.GetoptError as err:
46 | print str(err)
47 | sys.exit(1)
48 | 
49 | 
50 | if len(args) < 2:
51 | self.usage()
52 | sys.exit(1)
53 | 
54 | 
55 | for o, v in opts:
56 | if o == '-h':
57 | self.usage()
58 | sys.exit(0)
59 | elif o == '-n':
60 | num_threads = int(v)
61 | elif o == '-t':
62 | process_timeout = int(v)
63 | elif o == '-e':
64 | scanner_exe = v
65 | elif o == '-q':
66 | display_progress = False
67 | elif o == '-r':
68 | request_types = v
69 | 
70 | self.scanner = args[0]
71 | self.db_file = args[1]
72 | 
73 | scanner_argv = args[2:]
74 | 
75 | if not self.scanner in self.scanners:
76 | print "Available scanners are: %s" % ", ".join(self.scanners)
77 | sys.exit(1)
78 | 
79 | if not os.path.exists(self.db_file):
80 | print "No such file %s" % self.db_file
81 | sys.exit(1)
82 | 
83 | 
84 | mod = importlib.import_module("core.scan.scanners.%s" % self.scanner)
85 | run = getattr(mod, self.scanner.title())
86 | run(self.db_file, num_threads, request_types, process_timeout, scanner_exe, display_progress, scanner_argv)
87 | 
88 | print "Scan finished"
89 | 
90 | 
91 | 
92 | def usage(self):
93 | print (
94 | "\n"
95 | "Usage: scan [options] <scanner> <db_file> [scanner_options]\n"
96 | "Options: \n"
97 | " -h this help\n"
98 | " -n THREADS number of parallel threads\n"
99 | " -r REQUEST_TYPES comma separated list of request types to pass to the scanner\n"
100 | " -t TIMEOUT process timeout in seconds\n"
101 | " -e PATH path to scanner executable\n"
102 | "\n"
103 | "Scanner Options: \n"
104 | " these are scanner-specific options (if available); try -h ..\n"
105 | "\n"
106 | "Available scanners are:\n"
107 | " - " + "\n - ".join(self.scanners) + "\n"
108 | "\n"
109 | "Available request types are:\n"
110 | " - xhr (ajax)\n"
111 | " - link (anchors href)\n"
112 | " - redirect (url from redirect)\n"
113 | " - form\n"
114 | " - jsonp\n"
115 | " - websocket\n"
116 | )
117 | 
118 | 
--------------------------------------------------------------------------------
/core/scan/scanners/wapiti.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from __future__ import unicode_literals
14 | import sys
15 | import os
16 | import time
17 | import re
18 | import json
19 | import base64
20 | import uuid
21 | 
22 | from core.lib.exception import *
23 | from core.lib.cookie import Cookie
24 | from core.lib.utils import *
25 | from core.scan.base_scanner import BaseScanner
26 | 
27 | class Wapiti(BaseScanner):
28 | 
29 | 
30 | def init(self, argv):
31 | return True
32 | 
33 | def get_settings(self):
34 | return dict(
35 | scanner_name = "wapiti",
36 | request_types = "xhr,link,form,jsonp,redirect",
37 | num_threads = 10,
38 | process_timeout = 180,
39 | scanner_exe = "python /usr/local/bin/wapiti"
40 | )
41 | 
42 | # return False to skip current request
43 | def get_cmd(self, request, tmp_dir):
44 | url = request.url
45 | # skip check of XSS via POST since they should be considered CSRF
46 | if request.method == "POST" and request.data:
47 | url += "?" + request.data
48 | 
49 | 
50 | out_file = tmp_dir + "/output.json"
51 | 
52 | cookie_file = tmp_dir + "/cookies.json"
53 | with open(cookie_file,'w') as cf:
54 | jsn = self.convert_cookies(request.cookies)
55 | cf.write(jsn)
56 | 
57 | 
58 | cmd = [
59 | url,
60 | "--timeout", "30",
61 | # Set the modules (and HTTP methods for each module) to use for attacks.
62 | # Prefix a module name with a dash to deactivate the related module.
63 | # To only browse the target (without sending any payloads), deactivate every module with -m "-all".
64 | # If you don't specify the HTTP methods, GET and POST will be used.
65 | # Example: -m "-all,xss:get,exec:post"
66 | "--module", "-all,xss:get",
67 | "--scope", "page",
68 | "--format", "json",
69 | "--output", out_file,
70 | "--verify-ssl", "0"
71 | ]
72 | 
73 | # ! no option to set referer ?
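# For orientation, a hedged example of the full command line this method builds
# (BaseScanner prepends scanner_exe from get_settings; the target URL and tmp
# paths below are illustrative, not captured output):
#
#   python /usr/local/bin/wapiti 'http://target.local/page?id=1' --timeout 30 \
#       --module '-all,xss:get' --scope page --format json \
#       --output <tmp_dir>/output.json --verify-ssl 0 [--cookie <tmp_dir>/cookies.json]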
74 | 
75 | if len(request.cookies) > 0:
76 | cmd.extend(("--cookie", cookie_file))
77 | 
78 | # print cmd_to_str(cmd)
79 | # self.exit(1)
80 | # return False
81 | return cmd
82 | 
83 | def scanner_executed(self, request, out, err, tmp_dir, cmd):
84 | out_file = tmp_dir + "/output.json"
85 | 
86 | if not os.path.exists(out_file):
87 | return
88 | 
89 | with open(out_file,'r') as fil:
90 | jsn = fil.read()
91 | 
92 | report = []
93 | try:
94 | report = json.loads(jsn)['vulnerabilities']['Cross Site Scripting']
95 | except Exception as e:
96 | print e
97 | 
98 | for vuln in report:
99 | self.save_vulnerability(request, "XSS", json.dumps(vuln))
100 | 
101 | 
102 | 
103 | # convert cookies to wapiti format
104 | def convert_cookies(self, cookies):
105 | wcookies = {}
106 | for cookie in cookies:
107 | domain = cookie.domain
108 | if domain:
109 | if not domain.startswith("."): domain = ".%s" % domain
110 | else:
111 | domain = cookie.setter.hostname
112 | 
113 | if not domain in wcookies.keys():
114 | wcookies[domain] = {}
115 | 
116 | if not cookie.path in wcookies[domain].keys():
117 | wcookies[domain][cookie.path] = {}
118 | 
119 | wcookies[domain][cookie.path][cookie.name] = dict(
120 | version = 0,
121 | expires = cookie.expires,
122 | secure = cookie.secure,
123 | value = cookie.value,
124 | port = None
125 | )
126 | 
127 | return json.dumps(wcookies)
--------------------------------------------------------------------------------
/tests/crawl_tests/urlfinder_tests.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | 
3 | from core.crawl.lib.urlfinder import UrlFinder
4 | 
5 | 
6 | class UrlFinderTest(unittest.TestCase):
7 | def test_empty_html(self):
8 | html_sample = ""
9 | finder = UrlFinder(html_sample)
10 | 
11 | self.assertEqual(finder.get_urls(), [])
12 | 
13 | def test_basic_html(self):
14 | html_sample = """
15 | <html>
16 | <head>
17 | <meta charset="utf-8">
18 | </head>
19 | <body></body>
20 | </html>
21 | """
22 | finder = UrlFinder(html_sample)
23 | 
24 | self.assertEqual(finder.get_urls(), [])
25 | 
26 | def test_with_relative_link(self):
27 | html_sample = '<a href="test.html">test</a>'
28 | finder = UrlFinder(html_sample)
29 | 
30 | self.assertEqual(finder.get_urls(), ["test.html"])
31 | 
32 | def test_with_relative_link_and_absolute_base_href(self):
33 | html_sample = """
34 | <html>
35 | <head>
36 | <base href="http://somewhere.else/someWeirdPath/">
37 | </head>
38 | <body>
39 | <a href="test.html">test</a>
40 | </body></html>
41 | """
42 | finder = UrlFinder(html_sample)
43 | self.assertEqual(finder.get_urls(), ["http://somewhere.else/someWeirdPath/test.html"])
44 | 
45 | html_sample = """
46 | <html>
47 | <head>
48 | <base href="http://somewhere.else/someWeirdPath/somePage.html">
49 | </head>
50 | <body>
51 | <a href="test.html">test</a>
52 | </body></html>
53 | """
54 | finder = UrlFinder(html_sample)
55 | self.assertEqual(finder.get_urls(), ["http://somewhere.else/someWeirdPath/test.html"])
56 | 
57 | def test_with_relative_link_and_relative_base_href(self):
58 | html_sample = """
59 | <html>
60 | <head>
61 | <base href="/someWeirdPath/">
62 | </head>
63 | <body>
64 | <a href="test.html">test</a>
65 | </body></html>
66 | """
67 | finder = UrlFinder(html_sample)
68 | self.assertEqual(finder.get_urls(), ["/someWeirdPath/test.html"])
69 | 
70 | html_sample = """
71 | <html>
72 | <head>
73 | <base href="someWeirdPath/">
74 | </head>
75 | <body>
76 | <a href="test.html">test</a>
77 | </body></html>
78 | """
79 | finder = UrlFinder(html_sample)
80 | self.assertEqual(finder.get_urls(), ["someWeirdPath/test.html"])
81 | 
82 | def test_with_anchor_link(self):
83 | html_sample = '<a href="#someAnchor">test</a>'
84 | finder = UrlFinder(html_sample)
85 | 
86 | self.assertEqual(finder.get_urls(), [])
87 | 
88 | def test_with_no_http_link(self):
89 | html_sample = '<a href="ftp://test.lan">test</a>'
90 | 
91 | finder = UrlFinder(html_sample)
92 | 
93 | self.assertEqual(finder.get_urls(), [])
94 | 
95 | def test_with_http_absolute_link(self):
96 | html_sample = '<a href="http://test.lan">test</a>'
97 | 
98 | finder = UrlFinder(html_sample)
99 | 
100 | 
self.assertEqual(finder.get_urls(), ["http://test.lan"])
101 | 
102 | def test_with_https_absolute_link(self):
103 | html_sample = '<a href="https://test.lan">test</a>'
104 | 
105 | finder = UrlFinder(html_sample)
106 | 
107 | self.assertEqual(finder.get_urls(), ["https://test.lan"])
108 | 
--------------------------------------------------------------------------------
/core/util/utilities/login.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - htcap.org
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | import sys
14 | import sqlite3
15 | import json
16 | import getopt
17 | import os
18 | import getpass
19 | 
20 | from core.lib.utils import *
21 | from core.lib.shell import CommandExecutor
22 | from core.util.base_util import BaseUtil
23 | from core.lib.cookie import Cookie
24 | 
25 | reload(sys)
26 | sys.setdefaultencoding('utf8')
27 | 
28 | class Login(BaseUtil):
29 | 
30 | @staticmethod
31 | def get_settings():
32 | return dict(
33 | descr = "Login to a webapp to get session cookies and logout urls",
34 | optargs = 'p:HJhAcl',
35 | minargs = 2
36 | )
37 | 
38 | def usage(self):
39 | return (
40 | "%s\n"
41 | "usage: %s [options] <url> <username> [<button text>]\n"
42 | "Options:\n"
43 | " -h This help\n"
44 | " -p PASSWD Set login password\n"
45 | " -c Do not output cookies\n"
46 | " -l Do not output logout urls\n"
47 | " -H Format output as htcap arguments\n"
48 | " -J Format output as json (for cookies only)\n"
49 | " -A Format output as general command line arguments\n"
50 | % (self.get_settings()['descr'], self.utilname)
51 | )
52 | 
53 | 
54 | def main(self, args, opts):
55 | passw = None
56 | format = None
57 | out_cookies = True
58 | out_logouts = True
59 | for o,v in opts:
60 | if o == "-h":
61 | print self.usage()
62 | sys.exit(0)
63 | elif o == "-p":
64 | passw = v
65 | elif o == "-c":
66 | out_cookies = False
67 | elif o == "-l":
68 | out_logouts = False
69 | elif o in ("-H", "-J", "-A"):
70 | format = o
71 | 
72 | if not passw:
73 | print "The password is hidden here BUT it will be passed to phantomjs via commandline ..."
74 | try:
75 | passw = getpass.getpass()
76 | except KeyboardInterrupt:
77 | print "\nAbort..."
78 | sys.exit(0)
79 | 
80 | jspath = "%s%s%s%s" % (getrealdir(__file__), "login", os.sep, "login.js")
81 | cmd = get_phantomjs_cmd() + [jspath, args[0], args[1], passw]
82 | if len(args) > 2: cmd.append(args[2])
83 | #print cmd_to_str(cmd)
84 | exe = CommandExecutor(cmd, True)
85 | out, err = exe.execute(20)
86 | if err:
87 | print "Unable to login"
88 | sys.exit(1)
89 | 
90 | try:
91 | ret = json.loads(out)
92 | except ValueError as e:
93 | print e
94 | sys.exit(1)
95 | allcookies, logouts = ret
96 | cookies = []
97 | if out_cookies:
98 | for c in reversed(allcookies):
99 | cookie = Cookie(c)
100 | if not cookie in cookies: cookies.append(cookie)
101 | if not out_logouts:
102 | logouts = []
103 | 
104 | if not format:
105 | print "Cookies:"
106 | for c in cookies:
107 | print " %s=%s" % (c.name, c.value)
108 | print "Logout urls:"
109 | for u in logouts:
110 | print " %s" % u
111 | elif format == "-A":
112 | for c in cookies:
113 | print cmd_to_str([c.name, c.value])
114 | for u in logouts:
115 | print cmd_to_str([u])
116 | elif format == "-H":
117 | args = []
118 | if len(cookies) > 0:
119 | args = ["-c", ";".join(["%s=%s" % (c.name, c.value) for c in cookies])]
120 | if len(logouts) > 0:
121 | args.extend(["-x", ",".join(logouts)])
122 | if len(args) > 0:
123 | print cmd_to_str(args)
124 | elif format == "-J":
125 | cd = []
126 | for c in cookies:
127 | cd.append(c.get_dict())
128 | if out_cookies:
129 | print json.dumps(cd)
130 | 
131 | 
132 | 
--------------------------------------------------------------------------------
/core/crawl/probe/src/constants.js:
--------------------------------------------------------------------------------
1 | (function() {
2 | 'use strict';
3 | 
4 | exports.constants = {
5 | XHRTimeout: 5000,
6 | 
7 | eventLoopConfig: {
8 | messageEvent: {
9 | from: 'javascript-probe',
10 | name: 'event-loop-ready',
11 | },
12 | 
13 | /**
14 | * number of event loop cycles between every new action processed in the eventLoop
15 | * lower is better for speed
16 | * higher is better for discovery
17 | */
18 | bufferCycleSize: 150,
19 | 
20 | /**
21 | * in milliseconds,
22 | * after triggering an event, time to wait before requesting another eventLoop cycle
23 | * lower is better for speed
24 | */
25 | afterEventTriggeredTimeout: 10,
26 | 
27 | /**
28 | * in milliseconds,
29 | * after an XHR completes, time to wait before requesting another eventLoop cycle
30 | */
31 | afterDoneXHRTimeout: 10,
32 | 
33 | /**
34 | * in milliseconds,
35 | * time to wait before closing the event loop manager (when everything seems to be done)
36 | */
37 | beforeClosingEventLoopManagerTimeout: 500,
38 | },
39 | 
40 | // see: https://developer.mozilla.org/en-US/docs/Web/Events
41 | mappableEvents: [
42 | 'abort', 'blur', 'canplay', 'canplaythrough', 'change', 'click', 'close', 'contextmenu', 'copy',
43 | 'cut', 'dblclick', 'drag', 'dragend', 'dragenter', 'dragleave', 'dragover', 'dragstart', 'drop',
44 | 'durationchange', 'emptied', 'ended', 'error', 'focus', 'fullscreenchange', 'fullscreenerror',
45 | 'input', 'invalid', 'keydown', 'keypress', 'keyup', 'load', 'loadeddata', 'loadedmetadata',
46 | 'loadstart', 'mousedown', 'mouseenter', 'mouseleave', 'mousemove', 'mouseout', 'mouseover',
47 | 'mouseup', 'paste', 'pause', 'play', 'playing', 'progress', 'ratechange', 'reset', 'resize',
48 | 'scroll', 'seeked', 'seeking', 'select', 'show', 'stalled', 'submit', 'suspend', 'timeupdate',
49 | 'volumechange', 'waiting', 'wheel',
50 | ],
51 | 
52 | /**
53 | * always trigger these events on the given element
54 | */
55 | 
triggerableEvents: {
56 | 'button': ['click', 'dblclick', 'keyup', 'keydown', 'mouseup', 'mousedown'],
57 | 'select': ['change', 'click', 'keyup', 'keydown', 'mouseup', 'mousedown'],
58 | 'input': ['change', 'click', 'blur', 'focus', 'keyup', 'keydown', 'mouseup', 'mousedown'],
59 | 'a': ['click', 'dblclick', 'keyup', 'keydown', 'mouseup', 'mousedown'],
60 | 'textarea': ['change', 'click', 'blur', 'focus', 'keyup', 'keydown', 'mouseup', 'mousedown'],
61 | 'span': ['click', 'mouseup', 'mousedown'],
62 | 'td': ['click', 'mouseup', 'mousedown'],
63 | 'tr': ['click', 'mouseup', 'mousedown'],
64 | 'div': ['click', 'mouseup', 'mousedown'],
65 | },
66 | 
67 | // map input names to string generators. see generateRandomValues to see all available generators
68 | inputNameMatchValue: [ // regexps NEED to be string to get passed to the page
69 | {name: 'mail', value: 'email'},
70 | {name: '((number)|(phone))|(^tel)', value: 'number'},
71 | {name: '(date)|(birth)', value: 'humandate'},
72 | {name: '((month)|(day))|(^mon$)', value: 'month'},
73 | {name: 'year', value: 'year'},
74 | {name: 'url', value: 'url'},
75 | {name: 'firstname', value: 'firstname'},
76 | {name: '(surname)|(lastname)', value: 'surname'},
77 | ],
78 | 
79 | /**
80 | * in pixels,
81 | * viewport size of the browser
82 | */
83 | viewport: {
84 | width: 1920,
85 | height: 1080,
86 | },
87 | };
88 | 
89 | })();
90 | 
--------------------------------------------------------------------------------
/core/util/utilities/htmlreport/report.html:
--------------------------------------------------------------------------------
[report.html is an HTML template whose markup was stripped during text extraction; only its visible strings survive. Recoverable structure: a "Htcap Report" page header; a summary table with the fields Target, Crawl date, Pages crawled, Crawl duration, Out of scope (with an "open" link), Non HTML (with an "open" link) and Command; a "Show:" filter bar with "Hide Urls:" and "Hide Results:" inputs; and the toolbar actions open notes, open marked, open trash, collapse all, expand visibles and save session. The template is styled by style.css in the same directory.]
--------------------------------------------------------------------------------
/core/crawl/probe/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "eslint:recommended",
3 | "env": {
4 | "browser": true,
5 | "node": true,
6 | "es6": true
7 | },
8 | "globals": {
9 | "describe": true,
10 | "afterEach": true,
11 | "beforeEach": true,
12 | "it": true,
13 | "expect": true,
14 | "by": true,
15 | "spyOn": true,
16 | "WebKitMutationObserver": true,
17 | "chrome": true,
18 | "PointerEvent": true
19 | },
20 | "rules": {
21 | "semi": "error",
22 | "no-bitwise": "error",
23 | "camelcase": "error",
24 | "curly": "error",
25 | "eqeqeq": "error",
26 | "object-curly-spacing": "warn",
27 | "object-curly-newline": [
28 | "error",
29 | {
30 | "consistent": true
31 | }
32 | ],
33 | "comma-spacing": "warn",
34 | "newline-per-chained-call": [
35 | "error",
36 | {
37 | "ignoreChainWithDepth": 2
38 | }
39 | ],
40 | "wrap-iife": [
41 | "error",
42 | "any"
43 | ],
44 | "indent": [
45 | "error",
46 | 4,
47 | {
48 | "SwitchCase": 1,
49 | "MemberExpression": "off",
50 | "FunctionDeclaration": {
51 | "parameters": "first"
52 | },
53 | "FunctionExpression": {
54 | "parameters": "first"
55 | }
56 | }
57 | ],
58 | "no-use-before-define": [
59 | "error",
60 | {
61 | "functions": false,
62 | "classes": true
63 | }
64 | ],
65 | "new-cap": "error",
66 | "no-caller": "error",
67 | "quotes": [
68 | "error",
69 | "single"
70 | ],
71 | "no-undef": "error",
72 | "no-unused-vars": "error",
73 | "strict": [
74 | "error",
75 | "function"
76 | ],
77 | "no-multi-str": "error",
78 | "operator-linebreak": [
79 | "error",
80 | "after"
81 | ],
82 | "max-len": [
83 | "error",
84 | 160
85 | ],
86 | "no-with": "error",
87 | "brace-style": "error",
88 | "no-mixed-spaces-and-tabs": "error",
89 | "key-spacing": [
90 | "error",
91 | {
92 | "beforeColon": false,
93 | "afterColon": true
94 | }
95 | ],
96 | "space-unary-ops": [
97 | "error",
98 | {
99 | "words": false,
100 | "nonwords": false
101 | }
102 | ],
103 | "space-before-function-paren": [
104 | "error",
105 | "never"
106 | ],
107 | "no-spaced-func": "error",
108 | "array-bracket-spacing": [
109 | "error",
110 | "never"
111 | ],
112 | "keyword-spacing": [
113 | "error",
114 | {
115 | "overrides": {
116 | "else": {
117 | "before": true,
118 | "after": true
119 | },
120 | "while": {
121 | "before": true,
122 | "after": true
123 | },
124 | "catch": {
125 | "before": true,
126 | "after": true
127 | },
128 | "if": {
129 | "after": true
130 | },
131 | "for": {
132 | "after": true
133 | },
134 | "do": {
135 | "after": true
136 | },
137 | "switch": {
138 | "after": true
139 | },
140 | "return": {
141 | "after": true
142 | },
143 | "try": {
144 | "after": true
145 | }
146 | }
147 | }
148 | ],
149 | "space-in-parens": [
150 | "error",
151 | "never"
152 | ],
153 | "comma-dangle": [
154 | "error",
155 | "always-multiline"
156 | ],
157 | "no-trailing-spaces": "error",
158 | "comma-style": [
159 | "error",
160 | "last"
161 | ],
162 | "eol-last": "error",
163 | "space-infix-ops": "error",
164 | "space-before-blocks": [
165 | "error",
166 | "always"
167 | ],
168 | "linebreak-style": [
169 | "error",
170 | "unix"
171 | ]
172 | }
173 | }
174 | 
--------------------------------------------------------------------------------
/core/crawl/probe/chrome_extension/background.js:
--------------------------------------------------------------------------------
1 | /*eslint no-console: off */
2 | (function() {
3 | 'use strict';
4 | 
5 | // keep track of all the opened tabs
6 | let tabs = {};
7 | 
8 | // store the probe starting tab (the first tab navigated with success)
9 | let startingTabId = undefined;
10 | 
11 | // Get all existing tabs
12 | chrome.tabs.query({}, function(results) {
13 | results.forEach(function(tab) {
14 | tabs[tab.id] = tab;
15 | });
16 | });
17 | 
18 | function onCreatedListener(tab) {
19 | tabs[tab.id] = tab;
20 | tabs[tab.id].haveBeenNavigated = false;
21 | }
22 | 
23 | // Create tab event listeners
24 | function onUpdatedListener(tabId, changeInfo, tab) {
25 | if (tab.url.startsWith('http')) {
26 | if (tab.url.startsWith('http') && changeInfo.status === 'complete') {
27 | startingTabId = tabId;
28 | tabs[startingTabId].haveBeenNavigated = true;
29 | }
30 | }
31 | }
32 | 
33 | function onRemovedListener(tabId) {
34 | delete tabs[tabId];
35 | }
36 | 
37 | /**
38 | * if the request url differs from the current tab url, block it
39 | * @param details
40 | * @return {{redirectUrl: string}}
41 | */
42 | function onBeforeRequestListener(details) {
43 | let result, currentTab = tabs[details.tabId];
44 | 
45 | // if the current tab exists (sometimes the request is issued before the tab exists)
46 | if (currentTab) {
47 | 
48 | // DEBUG:
49 | console.group();
50 | console.log('currentTab', currentTab);
51 | console.log('details', details);
52 | console.log('startingTabId', startingTabId);
53 | console.log(details.url.startsWith('http') && details.type === 'sub_frame');
54 | console.groupEnd();
55 | 
56 | // create the message link with the probe
57 | chrome.tabs.executeScript(startingTabId || currentTab.id, {file: 'content.js', runAt: 'document_start'});
58 | 
59 | // if the content is loaded from a sub-frame
60 | if (details.url.startsWith('http') && details.type === 'sub_frame') {
61 | 
62 | _notifyProbe(details.url, startingTabId || currentTab.id);
63 | 
64 | // redirect the navigation to nowhere
65 | result = {redirectUrl: 'javascript:void(0)'};
66 | 
67 | } else if (startingTabId) {
68 | 
69 | if (currentTab.id !== startingTabId) { // if the current tab is a new tab
70 | _notifyProbe(details.url, startingTabId);
71 | 
72 | // redirect the navigation to nowhere
73 | result = {redirectUrl: 'javascript:void(0)'};
74 | 
75 | // close the tab
76 | chrome.tabs.remove(currentTab.id);
77 | 
78 | } else if (tabs[startingTabId].haveBeenNavigated) { // if the starting tab has already been navigated
79 | 
80 | _notifyProbe(details.url, startingTabId);
81 | 
82 | // redirect the navigation to nowhere
83 | result = {redirectUrl: 'javascript:void(0)'};
84 | }
85 | }
86 | }
87 | return result;
88 | }
89 | 
90 | // Subscribe to tab events to track opened tabs
91 | chrome.tabs.onCreated.addListener(onCreatedListener);
92 | chrome.tabs.onUpdated.addListener(onUpdatedListener);
93 | chrome.tabs.onRemoved.addListener(onRemovedListener);
94 | 
95 | chrome.webRequest.onBeforeRequest.addListener(onBeforeRequestListener, {
96 | urls: ['<all_urls>'],
97 | types: ['main_frame', 'sub_frame'], // only watching for "frame" type request
98 | }, ['blocking']);
99 | 
100 | function _notifyProbe(url, tabId) {
101 | 
102 | // DEBUG:
103 | console.warn(`Navigation to ${url} blocked.`);
104 | 
105 | // sending message to the probe
106 | chrome.tabs.sendMessage(tabId, {url: url});
107 | }
108 | 
109 | })();
110 | 
--------------------------------------------------------------------------------
/core/lib/request_pattern.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | import json
14 | import xml.etree.ElementTree as ET
15 | from urlparse import parse_qs, urlsplit
16 | 
17 | 
18 | class RequestPattern:
19 | def __init__(self, request):
20 | self.request = request
21 | self.pattern = None
22 | 
23 | self.set_pattern()
24 | 
25 | def set_pattern(self):
26 | """
27 | sets request pattern for comparison
28 | """
29 | 
30 | # pattern[0] = url_pattern, pattern[1] = data_pattern
31 | self.pattern = [self.get_url_pattern(self.request.url), None]
32 | 
33 | if self.request.method == "GET" or not self.request.data:
34 | return
35 | 
36 | # try xml
37 | try:
38 | root = ET.fromstring(self.request.data)
39 | self.pattern[1] = self.get_xml_pattern(root)
40 | except Exception as e:
41 | # try json
42 | try:
43 | self.pattern[1] = self.get_json_pattern(self.request.data)
44 | except Exception as e:
45 | # try url-encoded
46 | try:
47 | self.pattern[1] = self.get_urlencoded_pattern(self.request.data, False)
48 | except Exception as e:
49 | # print "! UNKNOWN POST DATA FORMAT"
50 | pass
51 | 
52 | def get_url_pattern(self, url):
53 | """
54 | returns url pattern for comparison (query and data parameters are sorted and without values)
55 | """
56 | purl = urlsplit(url)
57 | patt = [purl.scheme, purl.netloc, purl.path, self.get_urlencoded_pattern(purl.query)]
58 | 
59 | return patt
60 | 
61 | def get_xml_pattern(self, node):
62 | """
63 | returns the xml tree as an array without values, example:
64 | 
65 | <root>
66 | <node foo="1" bar="2"/>
67 | <elements index="1">
68 | <z>1</z>
69 | <z>1</z>
70 | <a type="int">123</a>
71 | </elements>
72 | </root>
73 | 
74 | ['root', [ <--- child nodes sorted ('elements' comes before 'node')
75 | ['elements', 'index', [
76 | ['a', 'type'], ['z'], ['z']
77 | ]],
78 | ['node', 'bar', 'foo'] <--- properties sorted ('bar' comes before 'foo')
79 | ]]
80 | """
81 | 
82 | # describe a tag as its name plus the name of its properties (sorted)
83 | patt = [node.tag] + [x for x in sorted(node.attrib.keys())]
84 | 
85 | # collect child nodes in the form of "node array" (tagname+props)
86 | ch = []
87 | for child in node:
88 | ch.append(self.get_xml_pattern(child))
89 | 
90 | if ch:
91 | # sort using the tagname as the key
92 | ch.sort(key=lambda x: x[0])
93 | patt.append(ch)
94 | 
95 | return patt
96 | 
97 | def get_json_pattern(self, data):
98 | """
99 | returns an object with values set to zero (sorting of keys is not needed), example:
100 | {"user": "foo", "ids": [1, 2]} -> {"user": 0, "ids": [0, 0]}
101 | """
102 | patt = json.loads(data)
103 | self.nullify_object_values(patt)
104 | 
105 | return patt
106 | 
107 | def nullify_object_values(self, obj):
108 | """
109 | sets to 0 all object values
110 | """
111 | keys = obj.keys() if isinstance(obj, dict) else range(0, len(obj))
112 | 
113 | for k in keys:
114 | if not hasattr(obj[k], '__iter__'):
115 | obj[k] = 0
116 | else:
117 | self.nullify_object_values(obj[k])
118 | 
119 | def get_urlencoded_pattern(self, data, ignoreErrors=True):
120 | """
121 | returns query parameters sorted and without values
122 | """
123 | # parse_qs(qs[, keep_blank_values[, strict_parsing]])
124 | query = parse_qs(data, True, not ignoreErrors)
125 | return sorted(query.keys())
126 | 
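For orientation, a minimal sketch of the pattern this class produces (FakeRequest is a hypothetical stand-in for core.lib.request.Request, which is what RequestPattern normally receives; example.local is an illustrative host):

from core.lib.request_pattern import RequestPattern

class FakeRequest(object):
    def __init__(self, url, method="GET", data=None):
        self.url = url
        self.method = method
        self.data = data

# query string values are dropped and parameter names sorted, so these two
# requests collapse to the same pattern and are treated as duplicates
p1 = RequestPattern(FakeRequest("http://example.local/a?x=1&y=2")).pattern
p2 = RequestPattern(FakeRequest("http://example.local/a?y=9&x=8")).pattern
assert p1 == p2  # both: [['http', 'example.local', '/a', ['x', 'y']], None]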
-------------------------------------------------------------------------------- /core/crawl/probe/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @todo (blocked): 3 | * - make possible to send custom headers (set a referer) on page.goto() see: {@link https://github.com/GoogleChrome/puppeteer/issues/1062} 4 | * and {@link https://github.com/GoogleChrome/puppeteer/issues/686} 5 | * 6 | * @todo (nice to have): 7 | * - store headers for every request (mainly cookies and referrer) to enable a better "replay" 8 | * - also analyse the error pages (40x and 50x) 9 | */ 10 | 11 | (function() { 12 | 'use strict'; 13 | 14 | const process = require('process'); 15 | 16 | const logger = require('./logger').debug; 17 | const output = require('./logger').output; 18 | const puppeteer = require('puppeteer'); 19 | 20 | const constants = require('./src/constants').constants; 21 | const utils = require('./src/utils'); 22 | 23 | const pageHandler = require('./src/page-handler'); 24 | 25 | let options = utils.getOptionsFromArgs(), 26 | result = [], 27 | browser, 28 | handler; 29 | 30 | let startTime = Date.now(); 31 | 32 | // handling SIGTERM signal 33 | process.on('SIGTERM', () => { 34 | if (options.verbosity >= 1) { 35 | logger.info('SIGTERM signal received'); 36 | } 37 | result.push({'status': 'error', 'code': 'interruptReceived'}); 38 | _requestJobEnd(); 39 | }); 40 | 41 | function _requestJobEnd(exitCode) { 42 | 43 | if (options.verbosity >= 1) { 44 | logger.info('closing Node process'); 45 | logger.info('debug', `got results in ${(Date.now() - startTime) / 1000} sec : ${JSON.stringify(result)}`); 46 | } 47 | output.log('info', `${JSON.stringify(result)}`); 48 | 49 | if (!options.debug) { // keep the browser open for debug 50 | if (browser) { 51 | browser.close() 52 | .then(() => { 53 | process.exit(exitCode); 54 | }); 55 | } else { 56 | process.exit(exitCode); 57 | } 58 | } 59 | } 60 | 61 | function run([newBrowser, newPage]) { 62 | 63 | browser = newBrowser; 64 | 65 | handler = new pageHandler.Handler(newPage, constants, options); 66 | 67 | handler.on('finished', (exitCode, status) => { 68 | result.push(status); 69 | _requestJobEnd(exitCode); 70 | }); 71 | 72 | handler.on('probeResult', (newResult) => { 73 | result.push(newResult); 74 | }); 75 | 76 | handler.initialize() 77 | .then((page) => { 78 | if (options.verbosity >= 1) { 79 | logger.info(`starting navigation to ${options.startUrl.href}`); 80 | } 81 | page.goto(options.startUrl.href, {waitUntil: 'networkidle2'}) 82 | .then(response => { 83 | 84 | if (response.ok()) { 85 | // checking if it's some HTML document 86 | if (response.headers()['content-type'] 87 | .toLowerCase() 88 | .includes('text/html')) { 89 | 90 | handler.getCookies() 91 | .then(cookies => { 92 | result.push(['cookies', cookies]); 93 | }); 94 | 95 | if (options.verbosity >= 1) { 96 | logger.info('starting the probe'); 97 | } 98 | // start analysis on the page 99 | handler.startProbe(); 100 | } else { 101 | result.push({'status': 'error', 'code': 'contentType', 'message': `content type is ${response.headers()['content-type']}`}); 102 | _requestJobEnd(); 103 | } 104 | } else { 105 | result.push({'status': 'error', 'code': 'responseCode', 'message': `${response.status()}`}); 106 | _requestJobEnd(1); 107 | } 108 | }, 109 | (error) => { 110 | if (options.verbosity >= 1) { 111 | logger.error(`Error during goto: ${error}`); 112 | } 113 | result.push({'status': 'error', 'code': 'load', 'message': `${error}`}); 114 | _requestJobEnd(1); 
115 | });
116 | }, (error) => {
117 | if (options.verbosity >= 1) {
118 | logger.error(`Error during initialisation: ${error}`);
119 | }
120 | result.push({'status': 'error', 'code': 'probeError', 'message': `${error}`});
121 | _requestJobEnd(1);
122 | });
123 | }
124 | 
125 | pageHandler.getBrowserAndPage(puppeteer, options.proxyAddress, options.debug)
126 | .then(run);
127 | 
128 | })();
129 | 
--------------------------------------------------------------------------------
/core/scan/scanners/arachni.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 | 
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 | 
13 | from __future__ import unicode_literals
14 | import sys
15 | import os
16 | import time
17 | import re
18 | import json
19 | import base64
20 | import uuid
21 | import urllib
22 | import getopt
23 | import datetime
24 | 
25 | from core.lib.exception import *
26 | from core.lib.cookie import Cookie
27 | 
28 | from core.scan.base_scanner import BaseScanner
29 | from core.lib.shell import CommandExecutor
30 | from core.lib.utils import *
31 | 
32 | class Arachni(BaseScanner):
33 | 
34 | def init(self, argv):
35 | # scanner_exe is converted to array to handle something like "python /usr/bin/scanner"
36 | self.reporter = "%s%sarachni_reporter" % (os.path.dirname(self.settings['scanner_exe'][-1]), os.sep)
37 | if not os.path.exists(self.reporter):
38 | print "Error finding arachni_reporter: %s" % self.reporter
39 | self.exit(1)
40 | 
41 | self.skip_duplicates = True
42 | self.execute_command = True
43 | self.audit_both_methods = False
44 | 
45 | try:
46 | opts, args = getopt.getopt(argv, 'hspb')
47 | except getopt.GetoptError as err:
48 | print str(err)
49 | self.exit(1)
50 | 
51 | for o, v in opts:
52 | if o == '-h':
53 | self.usage()
54 | self.exit(0)
55 | elif o == '-s':
56 | self.skip_duplicates = False
57 | elif o == '-p':
58 | self.execute_command = False
59 | elif o == '-b':
60 | self.audit_both_methods = True
61 | 
62 | 
63 | 
64 | def usage(self):
65 | print ( "htcap arachni module\nusage: scan arachni [options]\n"
66 | "Options are:\n"
67 | " -h this help\n"
68 | " -s do not skip duplicated urls\n"
69 | " -p print first command and exit\n"
70 | " -b set audit-with-both-methods arachni option\n"
71 | )
72 | 
73 | def get_settings(self):
74 | return dict(
75 | request_types = "xhr,link,form,jsonp,redirect",
76 | num_threads = 5,
77 | process_timeout = 180,
78 | scanner_exe = "/usr/share/arachni/bin/arachni"
79 | #scanner_exe = "/usr/bin/arachni"
80 | )
81 | 
82 | # return False to skip current request
83 | def get_cmd(self, request, tmp_dir):
84 | out_file = tmp_dir + "/report"
85 | 
86 | if self.skip_duplicates and self.is_request_duplicated(request):
87 | return False
88 | 
89 | timeout = str(datetime.timedelta(seconds=(self.settings['process_timeout']-5)))
90 | 
91 | cmd = [
92 | #"--checks", "sql_injection*",
93 | "--checks", "code_injection*,file_inclusion*,path_traversal*,rfi*,xss*,xxe*", # "xss*",
94 | "--output-only-positives",
95 | "--http-request-concurrency", "1",
96 | "--http-request-timeout", "10000",
97 | "--timeout", timeout, #"00:03:00",
98 | "--scope-dom-depth-limit", "0",
99 | "--scope-directory-depth-limit", "0",
100 | 
"--scope-page-limit", "1", 101 | "--report-save-path", out_file, 102 | "--snapshot-save-path", "/dev/null", 103 | #"--http-proxy-type", "socks5", 104 | #"--http-proxy","127.0.0.1:9150" 105 | ] 106 | 107 | if self.audit_both_methods: 108 | cmd.append("--audit-with-both-methods") 109 | 110 | if request.referer: 111 | cmd.extend(['--http-request-header', 'Referer=%s' % request.referer]) 112 | 113 | if len(request.cookies) > 0: 114 | cmd.extend(["--http-cookie-string", "; ".join(["%s=%s" % (c.name,c.value) for c in request.cookies])]) 115 | 116 | cmd.append(request.url) 117 | 118 | if not self.execute_command: 119 | print cmd_to_str(self.settings['scanner_exe'] + cmd) 120 | self.exit(0) 121 | return False 122 | 123 | return cmd 124 | 125 | def scanner_executed(self, request, out, err, tmp_dir, cmd): 126 | out_file = tmp_dir + "/report" 127 | 128 | if not os.path.isfile(out_file): 129 | return 130 | 131 | json_file = tmp_dir + "/report.json" 132 | 133 | cmd = [self.reporter, "--reporter", "json:outfile=%s" % json_file, out_file] 134 | exe = CommandExecutor(cmd, True) 135 | out, err = exe.execute(30) 136 | 137 | if err: 138 | print ">>> error exporting arachni to json: %s %s" % (err, request.url) 139 | return 140 | 141 | if not os.path.isfile(json_file): 142 | return 143 | 144 | with open(json_file,'r') as fil: 145 | jsn = fil.read() 146 | 147 | report = [] 148 | try: 149 | report = json.loads(jsn) 150 | except Exception as e: 151 | print err 152 | 153 | issues = report['issues'] 154 | 155 | for i in issues: 156 | ref = i['references']['OWASP'] if i['references'] and 'OWASP' in i['references'] else "N/A" 157 | req = "N/A" 158 | req = None 159 | 160 | if 'request' in i: 161 | req = i['request'] 162 | elif 'variations' in i and len(i['variations']) > 0: 163 | req = i['variations'][0]['request'] 164 | 165 | 166 | fields = (i['name'], ref, i['severity'], req['headers_string'] if req else "N/A") 167 | descr = "D E T A I L S\n\nName: %s\nReference: %s\nSeverity: %s\n\n\nR E Q U E S T\n\n%s" % fields 168 | 169 | if req and req['method'] == "post": 170 | descr += "%s" % urllib.urlencode(req['body']) 171 | 172 | self.save_vulnerability(request, i['check']['shortname'], descr) 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /core/lib/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | import os 14 | import pipes 15 | import posixpath 16 | import re 17 | import sys 18 | import time 19 | from urlparse import urlsplit, parse_qsl 20 | 21 | 22 | def get_program_infos(): 23 | infos = { 24 | "version": "1.0.1 - dev", 25 | "author_name": "Filippo Cavallarin", 26 | "author_email": "filippo.cavallarin@wearesegment.com" 27 | } 28 | 29 | return infos 30 | 31 | 32 | def generate_filename(name, ext=None, out_file_overwrite=False, ask_out_file_overwrite=False): 33 | def fname(): 34 | return ".".join([f for f in ft if f]) 35 | 36 | ft = name.split(".") 37 | 38 | if not ext and len(ft) > 1: 39 | ext = ft[-1] 40 | 41 | # remove extension if present in name and equal to ext 42 | if ft[-1] == ext: ft.pop() 43 | 44 | # always append ext, even if None 45 | ft.append(ext) 46 | 47 | if ask_out_file_overwrite and os.path.exists(fname()): 48 | try: 49 | sys.stdout.write("File %s already exists. Overwrite [y/N]: " % fname()) 50 | out_file_overwrite = sys.stdin.read(1) == "y" 51 | except KeyboardInterrupt: 52 | print "\nAborted" 53 | sys.exit(0) 54 | 55 | if not out_file_overwrite: 56 | bn = ft[-2] 57 | i = 1 58 | while os.path.exists(fname()): 59 | ft[-2] = "%s-%d" % (bn, i) 60 | i += 1 61 | 62 | return fname() 63 | 64 | 65 | def cmd_to_str(cmd): 66 | ecmd = [pipes.quote(o) for o in cmd] 67 | return " ".join(ecmd) 68 | 69 | 70 | def stdoutw(str): 71 | sys.stdout.write(str) 72 | sys.stdout.flush() 73 | 74 | 75 | def getrealdir(path): 76 | return os.path.dirname(os.path.realpath(path)) + os.sep 77 | 78 | 79 | def print_progressbar(tot, scanned, start_time, label): 80 | perc = (scanned * 33) / (tot if tot > 0 else 1) 81 | sys.stdout.write("\b" * 150) 82 | out = "[%s%s] %d of %d %s in %d minutes" % ( 83 | "=" * perc, " " * (33 - perc), scanned, tot, label, int(time.time() - start_time) / 60) 84 | stdoutw(out) 85 | 86 | 87 | def join_qsl(qs): 88 | """ 89 | join a list returned by parse_qsl 90 | do not use urlencode since it will encode values and not just join tuples 91 | """ 92 | return "&".join(["%s=%s" % (k, v) for k, v in qs]) 93 | 94 | 95 | # ?a=1&a=2&a=3 -> ?a=3, ?a[]=1&a[]=2&a[]=3 -> UNCHANGED 96 | def group_qs_params(url): 97 | purl = urlsplit(url) 98 | qs = parse_qsl(purl.query) 99 | nqs = list() 100 | 101 | for t in reversed(qs): 102 | if t[0].endswith("[]") or t[0] not in [f for f, _ in nqs]: 103 | nqs.append(t) 104 | 105 | purl = purl._replace(query=join_qsl(reversed(nqs))) 106 | 107 | return purl.geturl(); 108 | 109 | 110 | def normalize_url(url): 111 | # add http if scheme is not present 112 | # if an url like 'test.com:80//path' is passed to urlsplit the result is: 113 | # (scheme='test.com', path='80//path', ...) 114 | if not re.match("^[a-z]+://", url, re.I): 115 | url = "http://%s" % url 116 | 117 | purl = urlsplit(url) 118 | 119 | # no path and no query_string .. 
just ensure url ends with / 120 | if not purl.path: 121 | return "%s/" % purl.geturl() 122 | 123 | # group multiple / (path//to///file -> path/to/file) 124 | new_path = re.sub(r"/+", "/", purl.path) 125 | # normalize ../../../ 126 | new_path = posixpath.normpath(new_path) 127 | if purl.path.endswith('/') and not new_path.endswith('/'): 128 | new_path += '/' 129 | 130 | purl = purl._replace(path=new_path) 131 | 132 | return purl.geturl() 133 | 134 | 135 | def extract_http_auth(url): 136 | """ 137 | returns a tuple with httpauth string and the original url with http auth removed 138 | http://foo:bar@example.local -> (foo:bar, http://example.local) 139 | """ 140 | 141 | purl = urlsplit(url) 142 | if not purl.netloc: 143 | return (None, url) 144 | try: 145 | auth, netloc = purl.netloc.split("@", 1) 146 | except: 147 | return (None, url) 148 | 149 | purl = purl._replace(netloc=netloc) 150 | 151 | return (auth, purl.geturl()) 152 | 153 | 154 | def remove_tokens(query): 155 | """ 156 | tries to detect and remove tokens from a query string 157 | used to compare request ignoring, for example, CSRF tokens 158 | """ 159 | 160 | qs = parse_qsl(query) 161 | nqs = [] 162 | for k, v in qs: 163 | if len(v) < 32 or not re.match(r'^[a-z0-9\-_\.:=]+$', v, re.I): 164 | nqs.append((k, v)) 165 | 166 | return join_qsl(nqs) 167 | 168 | 169 | def get_probe_cmd(probe): 170 | standard_paths = [os.getcwd()] 171 | envpath = os.environ['PATH'].split(os.pathsep) 172 | exe_name = probe 173 | 174 | if sys.platform != "win32": 175 | # force check to standard paths in case $PATH is not set (ie crontab) 176 | standard_paths.extend(["/usr/bin", "/usr/local/bin", "/usr/share/bin"]) 177 | else: 178 | exe_name = "%s.exe" % exe_name 179 | 180 | exe_paths = ["%s%s%s" % (p, os.sep, exe_name) for p in standard_paths + envpath] 181 | 182 | for exe in exe_paths: 183 | if os.path.isfile(exe): 184 | return [exe, "--ignore-ssl-errors=yes", "--web-security=false", "--ssl-protocol=any", "--debug=false"] 185 | 186 | return None 187 | -------------------------------------------------------------------------------- /core/util/utilities/login/login.js: -------------------------------------------------------------------------------- 1 | /* 2 | HTCAP - htcap.org 3 | Author: filippo.cavallarin@wearesegment.com 4 | 5 | This program is free software; you can redistribute it and/or modify it under 6 | the terms of the GNU General Public License as published by the Free Software 7 | Foundation; either version 2 of the License, or (at your option) any later 8 | version. 9 | */ 10 | 11 | var system = require('system'); 12 | var fs = require('fs'); 13 | 14 | // ctrl-c from probe/function.js .. maybe I should create some common.js and put it somewhere .. but where?.. really.. where?? 15 | function getCookies(headers, url){ 16 | var a, b, c, ret = []; 17 | var purl = document.createElement('a'); 18 | purl.href = url; 19 | var domain = purl.hostname; 20 | 21 | for(a = 0; a < headers.length; a++){ 22 | //console.log(JSON.stringify(headers[a])) 23 | if(headers[a].name.toLowerCase() == "set-cookie"){ 24 | var cookies = headers[a].value.split("\n"); // phantomjs stores multiple cookies in this way .. 
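// A hedged illustration of the folded value the split above handles, e.g.:
//   "SESSID=abc123; Path=/; HttpOnly\nlang=en; Max-Age=3600"
// one Set-Cookie header per line; each line is then split on "; " into its
// name=value pair and attributes by the loop below.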
25 | for(b = 0; b < cookies.length; b++){
26 | var ck = cookies[b].split(/; */);
27 | var cookie = {domain: domain, path: "/", secure: false, httponly:false};
28 | for(c = 0; c < ck.length; c++){
29 | var kv = ck[c].split("=");
30 | if(c == 0){
31 | cookie.name = kv[0];
32 | cookie.value = decodeURIComponent(kv[1]);
33 | continue;
34 | }
35 | switch(kv[0].toLowerCase()){
36 | case "expires":
37 | if(!("expires" in cookie))
38 | cookie.expires = parseInt((new Date(kv[1])).getTime() / 1000);
39 | break;
40 | case "max-age":
41 | cookie.expires = parseInt(((new Date()).getTime() / 1000) + parseInt(kv[1]));
42 | break;
43 | case "domain":
44 | case "path":
45 | cookie[kv[0]] = kv[1];
46 | break;
47 | case "httponly":
48 | case "secure":
49 | cookie[kv[0]] = true;
50 | break;
51 | }
52 | }
53 | ret.push(cookie);
54 | }
55 | }
56 | }
57 | return ret;
58 | };
59 | 
60 | 
61 | 
62 | function print(mess){
63 | output.push(mess)
64 | }
65 | 
66 | if(system.args.length < 4){
67 | console.log("usage: login.js <url> <username> <password> [<button text>]");
68 | phantom.exit(0);
69 | }
70 | 
71 | 
72 | var step = 0;
73 | var allCookies = [];
74 | var output = []
75 | 
76 | var url = system.args[1];
77 | var login = system.args[2];
78 | var password = system.args[3];
79 | var buttonTxt = system.args[4] || null;
80 | 
81 | 
82 | var page = require('webpage').create();
83 | 
84 | page.onConsoleMessage = function(msg, lineNum, sourceId) {
85 | 
86 | //console.log("console: " + msg);
87 | }
88 | 
89 | page.onResourceReceived = function(response) {
90 | var cookies = getCookies(response.headers, url)
91 | for(var a = 0; a < cookies.length; a++){
92 | allCookies.push(cookies[a])
93 | }
94 | };
95 | 
96 | phantom.onError = function(msg, trace) {}
97 | page.onError = function(msg, trace) {}
98 | 
99 | page.onCallback = function(data) {
100 | switch(data.cmd){
101 | case "next":
102 | step = 2;
103 | //page.render("010101.png")
104 | return;
105 | case "end":
106 | console.log(JSON.stringify(output))
107 | phantom.exit(0);
108 | case "print":
109 | print(JSON.parse(data.par))
110 | return;
111 | }
112 | }
113 | 
114 | page.onLoadFinished = function(status) {
115 | if(status != 'success'){
116 | console.log("error loading page " + url);
117 | phantom.exit(1);
118 | }
119 | 
120 | if(step < 2)return;
121 | 
122 | if(allCookies.length > 0){
123 | print(allCookies);
124 | } else {
125 | print([]);
126 | }
127 | 
128 | page.evaluate(function(){
129 | var els = document.getElementsByTagName("a");
130 | var lu = [];
131 | var re = /.*(log|sign)(_|\-| )?(out|off).*/gi;
132 | for(var a = 0; a < els.length; a++){
133 | if(els[a].href.match(re) || els[a].innerText.match(re)){
134 | lu.push(els[a].href)
135 | }
136 | }
137 | if(lu.length > 0){
138 | callPhantom({cmd:"print",par:JSON.stringify(lu)});
139 | } else {
140 | callPhantom({cmd:"print",par:'[]'});
141 | }
142 | 
143 | callPhantom({cmd:"end"});
144 | })
145 | }
146 | 
147 | 
148 | 
149 | page.settings.loadImages = false;
150 | 
151 | page.open(url, {}, function(status){
152 | page.evaluate(function(login, password,buttonTxt){
153 | function trigger(el, evname){
154 | if ('createEvent' in document) {
155 | var evt = document.createEvent('HTMLEvents');
156 | evt.initEvent(evname, true, false);
157 | el.dispatchEvent(evt);
158 | } else {
159 | evname = 'on' + evname;
160 | if( evname in el && typeof el[evname] == "function"){
161 | el[evname]();
162 | }
163 | }
164 | };
165 | 
166 | function getAdiacent(cont, selector){
167 | var ad = null;
168 | 
169 | while(!ad && cont){
170 | ad = cont.querySelector(selector)
171 | cont =
cont.parentNode 172 | } 173 | return ad 174 | } 175 | 176 | var passw_el = document.querySelector("input[type=password]"); 177 | passw_el.value = password; 178 | var login_el = getAdiacent(passw_el, "input[type=text],input[type=email],input:not([type])"); 179 | var button_el = null; 180 | if(buttonTxt){ 181 | var els = document.getElementsByTagName("*"); 182 | for(var a = 0; a < els.length; a++){ 183 | for(var ch = els[a].firstChild; ch; ch = ch.nextSibling){ 184 | if(ch.nodeType != 3)continue; // skip non textNodes 185 | if(ch.nodeValue.toLowerCase().trim() == buttonTxt.toLowerCase()){ 186 | button_el = els[a]; 187 | break; 188 | } 189 | } 190 | } 191 | } else { 192 | button_el = getAdiacent(passw_el, "input[type=submit],button"); 193 | if(!button_el){ 194 | button_el = getAdiacent(passw_el, "a"); 195 | } 196 | } 197 | if(!login_el || ! button_el){ 198 | console.log("error") 199 | } 200 | login_el.value = login; 201 | trigger(login_el, "blur") 202 | trigger(login_el, "change") 203 | trigger(passw_el, "blur") 204 | trigger(passw_el, "change") 205 | 206 | setTimeout(function(){ 207 | trigger(button_el, "click"); 208 | callPhantom({cmd:"next"}); 209 | },50); 210 | 211 | },login, password, buttonTxt); 212 | }); 213 | 214 | -------------------------------------------------------------------------------- /core/lib/cookie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - www.htcap.org 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | import cookielib 14 | import time 15 | from urllib import quote 16 | from urlparse import urlparse 17 | 18 | 19 | class Cookie: 20 | """ 21 | RFC 6265 22 | """ 23 | 24 | def __init__(self, cookie, setter=None): 25 | self.name = (str(cookie['name']) if 'name' in cookie and cookie['name'] else None) 26 | self.domain = (str(cookie['domain']) if 'domain' in cookie and cookie['domain'] else None) 27 | self.path = (str(cookie['path']) if 'path' in cookie and cookie['path'] else "/") 28 | 29 | # setter is the url that set this cookie, it's used to handle cookies with domain=None 30 | # if both domain and setter are None then no domain restrictions are applied (used when cookied are loaded from db) 31 | self.setter = urlparse(setter) if setter else None 32 | 33 | # if self.domain[0] != ".": 34 | # self.domain = "." 
+ self.domain 35 | 36 | self.update(cookie) 37 | 38 | def update(self, cookie): 39 | self.value = (quote(str(cookie['value'])) if 'value' in cookie and cookie['value'] else None) 40 | self.expires = (cookie['expires'] if 'expires' in cookie else None) 41 | self.secure = (cookie['secure'] if 'secure' in cookie else False) 42 | self.httponly = (cookie['httponly'] if 'httponly' in cookie else False) 43 | 44 | def __eq__(self, other): 45 | return (other 46 | and self.name == other.name 47 | and self.path == other.path 48 | and (self.domain == None or other.domain == None or self.domain == other.domain) 49 | ) 50 | 51 | def get_string(self): 52 | return "%s=%s; path = %s" % (self.name, self.value, self.path) # self.setter) 53 | 54 | # if domain is set it is valid for all subdomains 55 | # if domain is not set it is valid only for the setter's domain 56 | def is_valid_for_url(self, url): 57 | purl = urlparse(url) 58 | # the preceding dot in domain is optional so .foo.com and foo.com are the same 59 | if self.domain is None: 60 | # domain is considered the domain of setter 61 | # the cookie is valid if url's domain is EQUAL to setter's domain 62 | # if setter is None, no domain restrictions are applied (ie when loading cookies from db) 63 | if self.setter and purl.hostname != self.setter.hostname: return False 64 | else: 65 | if not purl.hostname: return False 66 | # url is valid ALSO if it is a subdomain of self.domain 67 | sh = [t for t in self.domain.split(".")[::-1] if t] # skip empty vals (in case of .foo.bar) 68 | uh = purl.hostname.split(".")[::-1] 69 | # @TODO DO NOT trust self.domain blindely .. check if = to setter... 70 | if uh[:len(sh)] != sh: return False 71 | 72 | if self.path: 73 | # check if url's path is equal or subfolder of self.path 74 | if not purl.path: return False 75 | sp = [t for t in self.path.split("/") if t] 76 | up = [t for t in purl.path.split("/") if t] 77 | if up[:len(sp)] != sp: return False 78 | 79 | # @TODO!!! 80 | if self.expires: 81 | pass 82 | 83 | # print "%s is valid for %s" % (self.get_string(), url) 84 | 85 | return True 86 | 87 | # def get_json(self): 88 | # return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True) 89 | 90 | def get_dict(self): 91 | return dict( 92 | name=self.name, 93 | value=self.value, 94 | domain=self.domain, 95 | path=self.path, 96 | secure=self.secure, 97 | expires=self.expires, 98 | httponly=self.httponly 99 | ) 100 | 101 | def get_cookielib_cookie(self): 102 | return cookielib.Cookie( 103 | version=0, 104 | name=self.name, 105 | value=self.value, 106 | port=None, 107 | port_specified=False, 108 | domain=self.domain if self.domain else "", # is this ok? 109 | domain_specified=True, 110 | domain_initial_dot=False, 111 | path=self.path, 112 | path_specified=True, 113 | secure=self.secure, 114 | expires=self.expires, 115 | discard=True, 116 | comment=None, 117 | comment_url=None, 118 | rest=None 119 | ) 120 | 121 | def get_as_netscape(self): 122 | """ 123 | 7 tab delimited properties: 124 | domain - The domain that created AND that can read the variable. 125 | flag - A TRUE/FALSE value indicating if all machines within a given domain can access 126 | the variable. This value is set automatically by the browser, depending on the value you set for domain. 127 | path - The path within the domain that the variable is valid for. 128 | secure - A TRUE/FALSE value indicating if a secure connection with the domain is needed to access the variable. 129 | expiration - The UNIX time that the variable will expire on. 
UNIX time is defined as the number of seconds since Jan 1, 1970 00:00:00 GMT.
130 | name - The name of the variable.
131 | value - The value of the variable.
132 | """
133 | 
134 | domain = self.domain
135 | if domain:
136 | if not domain.startswith("."): domain = ".%s" % domain
137 | else:
138 | domain = self.setter.hostname if self.setter else "."
139 | 
140 | # @TODO figure out how and whether to set 'flag'
141 | flag = "TRUE"
142 | # @TODO it's unclear what to set when the cookie has no expire date .. for now set it in the future and move on
143 | expiry = self.expires if self.expires else (time.time() + (3600 * 24 * 7))
144 | values = (domain, flag, self.path, ("TRUE" if self.secure else "FALSE"), expiry, self.name, self.value)
145 | return "%s\t%s\t%s\t%s\t%d\t%s\t%s" % values
146 | 
147 | def __str__(self):
148 | return "Cookie: %s=%s" % (self.name, self.value)
149 | 
--------------------------------------------------------------------------------
/core/lib/thirdparty/simhash/__init__.py:
--------------------------------------------------------------------------------
1 | # Created by 1e0n in 2013
2 | from __future__ import division, unicode_literals
3 | 
4 | import sys
5 | import re
6 | import hashlib
7 | import logging
8 | import collections
9 | from itertools import groupby
10 | 
11 | if sys.version_info[0] >= 3:
12 | basestring = str
13 | unicode = str
14 | long = int
15 | else:
16 | range = xrange
17 | 
18 | 
19 | class Simhash(object):
20 | 
21 | def __init__(self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=None):
22 | """
23 | `f` is the dimensions of fingerprints
24 | 
25 | `reg` is meaningful only when `value` is basestring and describes
26 | what is considered to be a letter inside parsed string. Regexp
27 | object can also be specified (some attempt to handle any letters
28 | is to specify reg=re.compile(r'\w', re.UNICODE))
29 | 
30 | `hashfunc` accepts a utf-8 encoded string and returns a unsigned
31 | integer in at least `f` bits.
32 | """
33 | 
34 | self.f = f
35 | self.reg = reg
36 | self.value = None
37 | 
38 | if hashfunc is None:
39 | def _hashfunc(x):
40 | return int(hashlib.md5(x).hexdigest(), 16)
41 | 
42 | self.hashfunc = _hashfunc
43 | else:
44 | self.hashfunc = hashfunc
45 | 
46 | if isinstance(value, Simhash):
47 | self.value = value.value
48 | elif isinstance(value, basestring):
49 | self.build_by_text(unicode(value))
50 | elif isinstance(value, collections.Iterable):
51 | self.build_by_features(value)
52 | elif isinstance(value, long):
53 | self.value = value
54 | else:
55 | raise Exception('Bad parameter with type {}'.format(type(value)))
56 | 
57 | def _slide(self, content, width=4):
58 | return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))]
59 | 
60 | def _tokenize(self, content):
61 | content = content.lower()
62 | content = ''.join(re.findall(self.reg, content))
63 | ans = self._slide(content)
64 | return ans
65 | 
66 | def build_by_text(self, content):
67 | features = self._tokenize(content)
68 | features = {k:sum(1 for _ in g) for k, g in groupby(sorted(features))}
69 | return self.build_by_features(features)
70 | 
71 | def build_by_features(self, features):
72 | """
73 | `features` might be a list of unweighted tokens (a weight of 1
74 | will be assumed), a list of (token, weight) tuples or
75 | a token -> weight dict.
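e.g. ['abcd', 'bcde'], [('abcd', 3), ('bcde', 1)] or {'abcd': 3, 'bcde': 1}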
76 | """ 77 | v = [0] * self.f 78 | masks = [1 << i for i in range(self.f)] 79 | if isinstance(features, dict): 80 | features = features.items() 81 | for f in features: 82 | if isinstance(f, basestring): 83 | h = self.hashfunc(f.encode('utf-8')) 84 | w = 1 85 | else: 86 | assert isinstance(f, collections.Iterable) 87 | h = self.hashfunc(f[0].encode('utf-8')) 88 | w = f[1] 89 | for i in range(self.f): 90 | v[i] += w if h & masks[i] else -w 91 | ans = 0 92 | for i in range(self.f): 93 | if v[i] >= 0: 94 | ans |= masks[i] 95 | self.value = ans 96 | 97 | def distance(self, another): 98 | assert self.f == another.f 99 | x = (self.value ^ another.value) & ((1 << self.f) - 1) 100 | ans = 0 101 | while x: 102 | ans += 1 103 | x &= x - 1 104 | return ans 105 | 106 | 107 | class SimhashIndex(object): 108 | 109 | def __init__(self, objs, f=64, k=2): 110 | """ 111 | `objs` is a list of (obj_id, simhash) 112 | obj_id is a string, simhash is an instance of Simhash 113 | `f` is the same with the one for Simhash 114 | `k` is the tolerance 115 | """ 116 | self.k = k 117 | self.f = f 118 | count = len(objs) 119 | logging.info('Initializing %s data.', count) 120 | 121 | self.bucket = collections.defaultdict(set) 122 | 123 | for i, q in enumerate(objs): 124 | if i % 10000 == 0 or i == count - 1: 125 | logging.info('%s/%s', i + 1, count) 126 | 127 | self.add(*q) 128 | 129 | def get_near_dups(self, simhash): 130 | """ 131 | `simhash` is an instance of Simhash 132 | return a list of obj_id, which is in type of str 133 | """ 134 | assert simhash.f == self.f 135 | 136 | ans = set() 137 | 138 | for key in self.get_keys(simhash): 139 | dups = self.bucket[key] 140 | logging.debug('key:%s', key) 141 | if len(dups) > 200: 142 | logging.warning('Big bucket found. key:%s, len:%s', key, len(dups)) 143 | 144 | for dup in dups: 145 | sim2, obj_id = dup.split(',', 1) 146 | sim2 = Simhash(long(sim2, 16), self.f) 147 | 148 | d = simhash.distance(sim2) 149 | if d <= self.k: 150 | ans.add(obj_id) 151 | return list(ans) 152 | 153 | def add(self, obj_id, simhash): 154 | """ 155 | `obj_id` is a string 156 | `simhash` is an instance of Simhash 157 | """ 158 | assert simhash.f == self.f 159 | 160 | for key in self.get_keys(simhash): 161 | v = '%x,%s' % (simhash.value, obj_id) 162 | self.bucket[key].add(v) 163 | 164 | def delete(self, obj_id, simhash): 165 | """ 166 | `obj_id` is a string 167 | `simhash` is an instance of Simhash 168 | """ 169 | assert simhash.f == self.f 170 | 171 | for key in self.get_keys(simhash): 172 | v = '%x,%s' % (simhash.value, obj_id) 173 | if v in self.bucket[key]: 174 | self.bucket[key].remove(v) 175 | 176 | @property 177 | def offsets(self): 178 | """ 179 | You may optimize this method according to 180 | """ 181 | return [self.f // (self.k + 1) * i for i in range(self.k + 1)] 182 | 183 | def get_keys(self, simhash): 184 | for i, offset in enumerate(self.offsets): 185 | if i == (len(self.offsets) - 1): 186 | m = 2 ** (self.f - offset) - 1 187 | else: 188 | m = 2 ** (self.offsets[i + 1] - offset) - 1 189 | c = simhash.value >> offset & m 190 | yield '%x:%x' % (c, i) 191 | 192 | def bucket_size(self): 193 | return len(self.bucket) 194 | -------------------------------------------------------------------------------- /core/scan/base_scanner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it 
under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | from __future__ import unicode_literals 14 | import sys 15 | import time 16 | import re 17 | import json 18 | import urllib 19 | import cookielib 20 | import threading 21 | import base64 22 | import posixpath 23 | import tempfile 24 | import os 25 | import uuid 26 | import urllib2 27 | import shutil 28 | import datetime 29 | 30 | from urlparse import urlparse, urlsplit, urljoin, parse_qs 31 | 32 | import core.lib.thirdparty.pysocks.socks as socks 33 | from core.lib.thirdparty.pysocks.sockshandler import SocksiPyHandler 34 | 35 | from core.lib.exception import * 36 | from core.crawl.lib.shared import * 37 | 38 | 39 | from core.lib.request import Request 40 | 41 | from core.lib.cookie import Cookie 42 | 43 | 44 | from core.lib.shell import CommandExecutor 45 | from core.lib.database import Database 46 | 47 | from core.lib.utils import * 48 | from core.constants import * 49 | 50 | from core.lib.request_pattern import RequestPattern 51 | 52 | 53 | 54 | class BaseScanner: 55 | def __init__(self, db_file, num_threads, request_types, process_timeout, scanner_exe, display_progress, scanner_argv): 56 | self.scan_start_time = int(time.time()) 57 | self.threads = [] 58 | self._th_lock = threading.Lock() 59 | self._th_lock_db = threading.Lock() 60 | self.performed_requests = 0 61 | self._urlpatterns = [] 62 | self._exitcode = 0 63 | self.scanner_name = self.__class__.__name__.lower() 64 | self._running = False 65 | self.settings = self.get_settings() 66 | 67 | #override default settings 68 | if num_threads: self.settings['num_threads'] = num_threads 69 | if request_types: self.settings['request_types'] = request_types 70 | if process_timeout: self.settings['process_timeout'] = process_timeout 71 | if scanner_exe: self.settings['scanner_exe'] = scanner_exe 72 | self.settings['scanner_exe'] = self.settings['scanner_exe'].split(" ") 73 | 74 | 75 | 76 | self.db = Database(db_file) 77 | self.id_assessment = self.db.create_assessment(self.scanner_name, int(time.time())) 78 | self.pending_requests = self.db.get_requests(self.settings['request_types']) 79 | self.tot_requests = len(self.pending_requests) 80 | self._duplicated_requests = [] 81 | 82 | urlpatterns = [] 83 | for req in self.pending_requests: 84 | patt = RequestPattern(req).pattern 85 | if patt in urlpatterns: 86 | self._duplicated_requests.append(req.db_id) 87 | else: 88 | urlpatterns.append(patt) 89 | 90 | init = self.init(scanner_argv if scanner_argv else []) 91 | 92 | self._running = True 93 | print "Scanner %s started with %d threads" % (self.scanner_name, self.settings['num_threads']) 94 | 95 | for n in range(0, self.settings['num_threads']): 96 | thread = self.Executor(self) 97 | self.threads.append(thread) 98 | thread.start() 99 | 100 | try: 101 | self.wait_executor(self.threads, display_progress) 102 | except KeyboardInterrupt: 103 | print "\nTerminated by user" 104 | self.kill_threads() 105 | 106 | self.save_assessment() 107 | sys.exit(self._exitcode) 108 | 109 | 110 | def get_settings(self): 111 | return dict( 112 | request_types = "xhr,link,redirect,form,json", 113 | num_threads = 10, 114 | process_timeout = 120, 115 | scanner_exe = "" 116 | ) 117 | 118 | 119 | def get_cmd(self, url, outfile): 120 | cmd = [] 121 | return cmd 122 | 123 | 124 | def scanner_executed(self, id_parent, out, err, out_file): 125 | return 126 | 127 | 128 | def 
wait_executor(self, threads, display_progress): 129 | executor_done = False 130 | while not executor_done: 131 | executor_done = True 132 | for th in threads: 133 | if th.isAlive(): 134 | executor_done = False 135 | th.join(1) 136 | 137 | if display_progress: 138 | self._th_lock.acquire() 139 | scanned = self.performed_requests 140 | pending = len(self.pending_requests) 141 | tot = self.tot_requests 142 | self._th_lock.release() 143 | 144 | print_progressbar(tot, scanned, self.scan_start_time, "requests scanned") 145 | if display_progress: 146 | print "" 147 | 148 | 149 | def kill_threads(self): 150 | self._th_lock.acquire() 151 | for th in self.threads: 152 | if th.isAlive(): th.exit = True 153 | self._th_lock.release() 154 | 155 | 156 | def exit(self, code): 157 | if self._running: 158 | self._th_lock.acquire() 159 | self._exitcode = code 160 | self._th_lock.release() 161 | self.kill_threads() 162 | print "kill thread" 163 | print "" 164 | else : 165 | sys.exit(code) 166 | 167 | 168 | def save_vulnerability(self, request, type, description): 169 | self._th_lock_db.acquire() 170 | self.db.insert_vulnerability(self.id_assessment, request.db_id, type, description) 171 | self._th_lock_db.release() 172 | 173 | 174 | def save_assessment(self): 175 | self._th_lock_db.acquire() 176 | self.db.save_assessment(self.id_assessment, int(time.time())) 177 | self._th_lock_db.release() 178 | 179 | 180 | def is_request_duplicated(self, request): 181 | return request.db_id in self._duplicated_requests 182 | 183 | 184 | class Executor(threading.Thread): 185 | 186 | def __init__(self, scanner): 187 | threading.Thread.__init__(self) 188 | self.scanner = scanner 189 | self.exit = False 190 | self.thread_uuid = uuid.uuid4() 191 | self.tmp_dir = "%s%shtcap_tempdir-%s" % (tempfile.gettempdir(), os.sep, self.thread_uuid) 192 | os.makedirs(self.tmp_dir, 0700) 193 | 194 | def inc_counter(self): 195 | self.scanner._th_lock.acquire() 196 | self.scanner.performed_requests += 1 197 | self.scanner._th_lock.release() 198 | 199 | def run(self): 200 | req = None 201 | while True: 202 | 203 | self.scanner._th_lock.acquire() 204 | if self.exit == True or len(self.scanner.pending_requests) == 0: 205 | self.scanner._th_lock.release() 206 | shutil.rmtree(self.tmp_dir) 207 | return 208 | 209 | req = self.scanner.pending_requests.pop() 210 | 211 | self.scanner._th_lock.release() 212 | 213 | 214 | cmd_options = self.scanner.get_cmd(req, self.tmp_dir) 215 | if cmd_options == False: 216 | self.inc_counter() 217 | continue 218 | 219 | cmd = self.scanner.settings['scanner_exe'] + cmd_options 220 | 221 | 222 | exe = CommandExecutor(cmd, True) 223 | out, err = exe.execute(self.scanner.settings['process_timeout']) 224 | # if err: print "\nError: \n%s\n%s\n%s\n" % (err," ".join(cmd),out) 225 | 226 | self.inc_counter() 227 | 228 | self.scanner.scanner_executed(req, out,err, self.tmp_dir, cmd) 229 | 230 | -------------------------------------------------------------------------------- /core/util/utilities/report.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | HTCAP - beta 1 6 | Author: filippo.cavallarin@wearesegment.com 7 | 8 | This program is free software; you can redistribute it and/or modify it under 9 | the terms of the GNU General Public License as published by the Free Software 10 | Foundation; either version 2 of the License, or (at your option) any later 11 | version. 
12 | """ 13 | 14 | import sys 15 | import os 16 | import sqlite3 17 | import json 18 | from urlparse import urlsplit 19 | from core.util.base_util import BaseUtil 20 | from core.lib.utils import * 21 | reload(sys) 22 | sys.setdefaultencoding('utf8') 23 | 24 | 25 | 26 | class Report(BaseUtil): 27 | 28 | def dict_from_row(self, row): 29 | return dict(zip(row.keys(), row)) 30 | 31 | @staticmethod 32 | def get_settings(): 33 | return dict( 34 | descr = "Generate the html report", 35 | optargs = '', 36 | minargs = 2 37 | ) 38 | 39 | def usage(self): 40 | return ( 41 | "%s\n" 42 | "usage: %s \n" 43 | % (self.get_settings()['descr'], self.utilname) 44 | ) 45 | 46 | 47 | def get_report(self, cur): 48 | report = [] 49 | qry = """ 50 | SELECT r.type,r.id,r.url,r.method,r.data,r.http_auth,r.referer,r.out_of_scope, ri.trigger, r.crawler_errors, 51 | (ri.id is not null) AS has_requests, ri.type AS req_type,ri.method AS req_method,ri.url AS req_url,ri.data AS req_data 52 | FROM request r 53 | LEFT JOIN request_child rc ON r.id=rc.id_request 54 | LEFT JOIN request ri ON ri.id = rc.id_child 55 | WHERE 56 | r.type IN ('link', 'redirect','form') 57 | and (has_requests=0 OR req_type IN ('xhr','form','websocket') OR (req_type='jsonp' AND ri.trigger <> '')) 58 | """ 59 | try: 60 | cur.execute(qry) 61 | for r in cur.fetchall(): 62 | report.append(self.dict_from_row(r)) 63 | except Exception as e: 64 | print str(e) 65 | 66 | return report 67 | 68 | def get_assessment_vulnerabilities(self, cur, id_request): 69 | report = [] 70 | qry = """ 71 | SELECT type, description FROM vulnerability WHERE id_request IN ( 72 | SELECT id FROM request WHERE ( 73 | id=? AND type IN ('link','redirect')) OR 74 | (id_parent=? AND type IN ('xhr','jsonp','form','websocket') 75 | ) 76 | ) 77 | """ 78 | 79 | try: 80 | 81 | cur.execute(qry, (id_request,id_request)) 82 | for r in cur.fetchall(): 83 | report.append(json.dumps({"type":r['type'], "description":r['description']})) 84 | except Exception as e: 85 | print str(e) 86 | 87 | 88 | return report 89 | 90 | 91 | def get_crawl_info(self, cur): 92 | crawl = None 93 | qry = """ 94 | SELECT *, 95 | (SELECT htcap_version FROM crawl_info) AS htcap_version, 96 | (SELECT COUNT(*) FROM request WHERE crawled=1) AS pages_crawled 97 | FROM crawl_info 98 | """ 99 | 100 | try: 101 | 102 | cur.execute(qry) 103 | crawl = self.dict_from_row(cur.fetchone()) 104 | except Exception as e: 105 | print str(e) 106 | 107 | return crawl 108 | 109 | def get_request_cmp_tuple(self, row): 110 | # http_auth in included in the url 111 | return (row['url'], row['method'], row['data']) 112 | 113 | def add_http_auth(self, url, auth): 114 | purl = urlsplit(url) 115 | return purl._replace(netloc="%s@%s" % (auth, purl.netloc)).geturl() 116 | 117 | def get_json(self, cur): 118 | report = self.get_report(cur) 119 | infos = self.get_crawl_info(cur) 120 | 121 | 122 | ret = dict( 123 | infos= infos, 124 | results = [] 125 | ) 126 | 127 | for row in report: 128 | if row['http_auth']: 129 | row['url'] = self.add_http_auth(row['url'], row['http_auth']) 130 | 131 | if self.get_request_cmp_tuple(row) in [self.get_request_cmp_tuple(r) for r in ret['results']]: continue 132 | d = dict( 133 | id = row['id'], 134 | url = row['url'], 135 | method = row['method'], 136 | data = row['data'], 137 | referer = row['referer'], 138 | xhr = [], 139 | jsonp = [], 140 | websockets = [], 141 | forms = [], 142 | errors = json.loads(row['crawler_errors']) if row['crawler_errors'] else [], 143 | vulnerabilities = 
145 | if row['out_of_scope']: d['out_of_scope'] = True 146 | 147 | if row['has_requests']: 148 | for r in report: 149 | if r['id'] != row['id']: continue 150 | req_obj = {} 151 | 152 | trigger = json.loads(r['trigger']) if 'trigger' in r and r['trigger'] else None # {'event':'ready','element':'[document]'} 153 | req_obj['trigger'] = "%s.%s()" % (trigger['element'], trigger['event']) if trigger else "" 154 | 155 | if r['req_type']=='xhr': 156 | req_obj['request'] = ["%s %s" % (r['req_method'], r['req_url'])] 157 | if r['req_data']: req_obj['request'].append(r['req_data']) 158 | d['xhr'].append(req_obj) 159 | 160 | elif r['req_type']=='jsonp': 161 | req_obj['request'] = r['req_url'] 162 | d['jsonp'].append(req_obj) 163 | 164 | elif r['req_type']=='websocket': 165 | req_obj['request'] = r['req_url'] 166 | d['websockets'].append(req_obj) 167 | 168 | elif r['req_type']=='form': 169 | #req_obj['request'] = "%s %s data:%s" % (r['req_method'], r['req_url'], r['req_data']) 170 | req_obj['request'] = ["%s %s" % (r['req_method'], r['req_url'])] 171 | if r['req_data']: req_obj['request'].append(r['req_data']) 172 | d['forms'].append(req_obj) 173 | 174 | 175 | if row['has_requests'] or row['out_of_scope'] or len(d['errors']) > 0 or len(d['vulnerabilities']) > 0: 176 | ret['results'].append(d) 177 | 178 | return json.dumps(ret) 179 | 180 | 181 | 182 | 183 | 184 | 185 | def main(self, args, opts): 186 | 187 | base_dir = os.path.dirname(os.path.realpath(__file__)) + os.sep + "htmlreport" + os.sep 188 | 189 | # if len(args) < 3: 190 | # print "usage: %s <dbfile> <outfile>" % args[0] 191 | # sys.exit(1) 192 | 193 | dbfile = args[0] 194 | outfile = args[1] 195 | 196 | if not os.path.exists(dbfile): 197 | print "No such file: %s" % dbfile 198 | sys.exit(1) 199 | 200 | if os.path.exists(outfile): 201 | sys.stdout.write("File %s already exists. Overwrite [y/N]: " % outfile) 202 | if sys.stdin.read(1) != "y": 203 | sys.exit(1) 204 | os.remove(outfile) 205 | 206 | conn = sqlite3.connect(dbfile) 207 | conn.row_factory = sqlite3.Row 208 | cur = conn.cursor() 209 | 210 | base_html = ( 211 | "<html>\n" 212 | "<head>\n" 213 | "<style>%s</style>\n" 214 | "<script>%s</script>\n" 215 | "<script>%s</script>\n" 216 | "</head>\n" 217 | "%s\n" 218 | "</html>\n" 219 | ) 220 | 221 | 222 | jsn = "var report = %s;\n" % self.get_json(cur) 223 | 224 | with open("%sreport.html" % base_dir) as html, open("%sreport.js" % base_dir) as js, open("%sstyle.css" % base_dir) as css: 225 | html = base_html % (css.read(), jsn, js.read(), html.read()) 226 | 227 | with open(outfile,'w') as out: 228 | out.write(html) 229 | 230 | print "Report saved to %s" % outfile 231 | 232 | -------------------------------------------------------------------------------- /core/crawl/crawler_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version.
11 | """ 12 | 13 | from __future__ import unicode_literals 14 | 15 | import json 16 | import os 17 | import tempfile 18 | import threading 19 | import uuid 20 | from time import sleep 21 | 22 | from core.constants import * 23 | from core.crawl.lib.crawl_result import CrawlResult 24 | from core.crawl.lib.probe import Probe 25 | from core.crawl.lib.shared import Shared 26 | from core.crawl.lib.utils import adjust_requests 27 | from core.lib.exception import ThreadExitRequestException 28 | from core.lib.http_get import HttpGet 29 | from core.lib.shell import CommandExecutor 30 | 31 | 32 | # TODO: use NamedTemporaryFile for self._cookie_file 33 | # from core.lib.utils import cmd_to_str 34 | 35 | 36 | class CrawlerThread(threading.Thread): 37 | _PROCESS_RETRIES_INTERVAL = 0.5 38 | _PROCESS_RETRIES = 2 39 | 40 | def __init__(self): 41 | threading.Thread.__init__(self) 42 | 43 | self.status = THSTAT_RUNNING 44 | self.exit = False 45 | 46 | self._thread_uuid = uuid.uuid4() 47 | self._cookie_file = "%s%shtcap_cookiefile-%s.json" % (tempfile.gettempdir(), os.sep, self._thread_uuid) 48 | 49 | def run(self): 50 | self._crawl() 51 | 52 | def _crawl(self): 53 | 54 | while True: 55 | requests = [] 56 | errors = [] 57 | 58 | try: 59 | request = self._wait_request() 60 | except ThreadExitRequestException: 61 | if os.path.exists(self._cookie_file): 62 | os.remove(self._cookie_file) 63 | return 64 | except Exception as e: 65 | print("-->" + str(e)) 66 | continue 67 | 68 | probe = self._send_probe(request, errors) 69 | 70 | if probe: 71 | if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO: 72 | 73 | requests = probe.requests 74 | if len(probe.user_output) > 0: 75 | request.user_output = probe.user_output 76 | 77 | # if the probe return some cookies set it has the last one 78 | if probe.cookies: 79 | Shared.end_cookies = probe.cookies 80 | 81 | else: 82 | errors.append(ERROR_PROBEFAILURE) 83 | # get urls with python to continue crawling 84 | if not Shared.options['use_urllib_onerror']: 85 | continue 86 | try: 87 | hr = HttpGet(request, Shared.options['process_timeout'], CrawlerThread._PROCESS_RETRIES, 88 | Shared.options['user_agent'], Shared.options['proxy']) 89 | requests = hr.get_requests() 90 | except Exception as e: 91 | errors.append(str(e)) 92 | 93 | # set out_of_scope, apply user-supplied filters to urls (ie group_qs) 94 | adjust_requests(requests) 95 | 96 | Shared.main_condition.acquire() 97 | res = CrawlResult(request, requests, errors) 98 | Shared.crawl_results.append(res) 99 | Shared.main_condition.notify() 100 | Shared.main_condition.release() 101 | 102 | def _wait_request(self): 103 | Shared.th_condition.acquire() 104 | while True: 105 | if self.exit: 106 | Shared.th_condition.notifyAll() 107 | Shared.th_condition.release() 108 | raise ThreadExitRequestException("exit request received") 109 | 110 | if Shared.requests_index >= len(Shared.requests): 111 | self.status = THSTAT_WAITING 112 | # The wait method releases the lock, blocks the current thread until another thread calls notify 113 | Shared.th_condition.wait() 114 | continue 115 | 116 | request = Shared.requests[Shared.requests_index] 117 | Shared.requests_index += 1 118 | 119 | break 120 | 121 | Shared.th_condition.release() 122 | 123 | self.status = THSTAT_RUNNING 124 | 125 | return request 126 | 127 | def _set_probe_params(self, request): 128 | params = [] 129 | cookies = [] 130 | url = request.url 131 | 132 | if request.method == "POST": 133 | params.append("-P") 134 | if request.data: 135 | params.extend(("-D", request.data)) 
136 | 137 | if len(request.cookies) > 0: 138 | for cookie in request.cookies: 139 | cookies.append(cookie.get_dict()) 140 | params.extend(("-c", json.dumps(cookies))) 141 | 142 | if request.http_auth: 143 | params.extend(("-p", request.http_auth)) 144 | 145 | if Shared.options['set_referer'] and request.referer: 146 | params.extend(("-r", request.referer)) 147 | 148 | # DEBUG: 149 | # params.append("-vv") 150 | params.append(url) 151 | 152 | return params 153 | 154 | def _send_probe(self, request, errors): 155 | 156 | probe = None 157 | retries = CrawlerThread._PROCESS_RETRIES 158 | params = self._set_probe_params(request) 159 | 160 | while retries: 161 | # DEBUG: 162 | # print("### INPUT: %s" % repr(Shared.probe_cmd + params)) 163 | cmd = CommandExecutor(Shared.probe_cmd + params) 164 | jsn = cmd.execute(Shared.options['process_timeout'] + 2) 165 | 166 | # DEBUG: 167 | # print("### OUTPUT: %s" % repr(jsn)) 168 | 169 | if jsn is None: 170 | errors.append(ERROR_PROBEKILLED) 171 | sleep(CrawlerThread._PROCESS_RETRIES_INTERVAL) # ... ??? 172 | retries -= 1 173 | continue 174 | else: 175 | probe_array = self._load_probe_json(jsn) 176 | 177 | if probe_array: 178 | probe = Probe(probe_array, request) 179 | 180 | if probe.status == "ok": 181 | break 182 | 183 | errors.append(probe.errcode) 184 | 185 | if probe.errcode in (ERROR_CONTENTTYPE, ERROR_PROBE_TO, ERROR_FORCE_STOP): 186 | break 187 | 188 | sleep(CrawlerThread._PROCESS_RETRIES_INTERVAL) 189 | retries -= 1 190 | return probe 191 | 192 | @staticmethod 193 | def _load_probe_json(jsn): 194 | 195 | if isinstance(jsn, tuple): 196 | jsn = jsn[0] 197 | 198 | try: 199 | data = json.loads(jsn) 200 | return data 201 | except ValueError: 202 | print "-- JSON DECODE ERROR %s" % jsn 203 | except Exception: 204 | raise 205 | -------------------------------------------------------------------------------- /core/lib/request.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | import json 14 | import re 15 | from urlparse import urljoin, urlsplit 16 | 17 | from core.constants import * 18 | from core.lib.cookie import Cookie 19 | from core.lib.thirdparty.simhash import Simhash 20 | from core.lib.utils import extract_http_auth, normalize_url, remove_tokens 21 | 22 | 23 | class Request(object): 24 | def __init__(self, type, method, url, parent=None, referer=None, data=None, trigger=None, json_cookies=None, 25 | set_cookie=None, http_auth=None, db_id=None, parent_db_id=None, out_of_scope=None): 26 | self.type = type 27 | self.method = method 28 | self._html = None 29 | self._html_hash = None 30 | self.user_output = [] 31 | url = url.strip() 32 | 33 | try: 34 | url = url.decode("utf-8") 35 | except: 36 | try: 37 | url = url.decode("latin-1") 38 | except Exception as e: 39 | raise AssertionError("unable to decode " + url) 40 | 41 | if type != REQTYPE_UNKNOWN: 42 | # extract http auth if present in url 43 | # if credentials are present in url, the url IS absolute so we can do this before urljoin 44 | # (foo:bar@example.local is NOT A VALID URL) 45 | auth, nurl = extract_http_auth(url) 46 | if auth: 47 | if not http_auth: 48 | http_auth = auth 49 | url = nurl 50 | 51 | self.url = normalize_url(urljoin(parent.url, url) if parent else url) 52 | else: 53 | self.url = url 54 | 55 | # parent is the parent request that can be a redirect, referer is the referer page (ahead of redirects) 56 | self._parent = parent 57 | 58 | self.data = data if data else "" 59 | self.trigger = trigger 60 | self.db_id = db_id 61 | self.parent_db_id = parent_db_id 62 | self.out_of_scope = out_of_scope 63 | self.cookies = [] 64 | 65 | self.http_auth = parent.http_auth if not http_auth and parent else http_auth 66 | 67 | self.redirects = parent.redirects + 1 if type == REQTYPE_REDIRECT and parent else 0 68 | 69 | if not referer and parent: 70 | self.referer = parent.url if type != REQTYPE_REDIRECT else parent.referer 71 | else: 72 | self.referer = referer 73 | 74 | # if type == "unknown": 75 | # return 76 | 77 | if json_cookies: 78 | self.all_cookies = self.cookies_from_json(json_cookies) 79 | else: 80 | set_cookie = set_cookie if set_cookie else [] 81 | self.all_cookies = self.merge_cookies(set_cookie, parent.all_cookies) if parent else set_cookie 82 | 83 | self.cookies = [c for c in self.all_cookies if c.is_valid_for_url(self.url)] 84 | 85 | @property 86 | def parent(self): 87 | if not self._parent and self.parent_db_id: 88 | # fetch from db 89 | pass 90 | return self._parent 91 | 92 | @parent.setter 93 | def parent(self, value): 94 | self._parent = value 95 | 96 | @property 97 | def html(self): 98 | return self._html 99 | 100 | @html.setter 101 | def html(self, value): 102 | self._html = value 103 | self._html_hash = Simhash(value) 104 | 105 | def get_dict(self): 106 | return dict( 107 | type=self.type, 108 | method=self.method, 109 | url=self.url, 110 | referer=self.referer, 111 | data=self.data, 112 | trigger=self.trigger, 113 | cookies=self.cookies, 114 | db_id=self.db_id, 115 | parent_db_id=self.parent_db_id, 116 | out_of_scope=self.out_of_scope 117 | ) 118 | 119 | def cookies_from_json(self, cookies): 120 | # return [Cookie(c, self.parent.url) for c in json.loads(cookies)] 121 | 122 | # create Cookie without "setter" because cookies loaded from db are always valid (no domain restrictions) 123 | # see Cookie.py 124 | return [Cookie(c) for c in json.loads(cookies)] 125 | 126 | def get_cookies_as_json(self): 127 | cookies = [c.get_dict() for c in self.cookies] 128 | return 
json.dumps(cookies) 129 | 130 | def merge_cookies(self, cookies1, cookies2): 131 | cookies = list(cookies2) 132 | for parent_cookie in cookies1: 133 | if parent_cookie not in cookies: 134 | cookies.append(parent_cookie) 135 | else: 136 | for cookie in cookies: 137 | if parent_cookie == cookie: 138 | cookie.update(parent_cookie.__dict__) 139 | 140 | return cookies 141 | 142 | def get_full_url(self): 143 | """ 144 | returns the url with http credentials 145 | """ 146 | if not self.http_auth: 147 | return self.url 148 | 149 | purl = urlsplit(self.url) 150 | netloc = "%s@%s" % (self.http_auth, purl.netloc) 151 | purl = purl._replace(netloc=netloc) 152 | 153 | return purl.geturl() 154 | 155 | # UNUSED 156 | def tokenize_request(self, request): 157 | """ 158 | returns an array of url components 159 | """ 160 | purl = urlsplit(request.url) 161 | 162 | tokens = [purl.scheme, purl.netloc] 163 | 164 | if purl.path: 165 | tokens.extend(purl.path.split("/")) 166 | 167 | data = [purl.query] if purl.query else [] 168 | 169 | if request.data: 170 | data.append(request.data) 171 | 172 | for d in data: 173 | qtokens = re.split(r'(?:&|&amp;)', d) 174 | for qt in qtokens: 175 | tokens.extend(qt.split("=", 1)) 176 | 177 | # print tokens 178 | return tokens 179 | 180 | # UNUSED 181 | def is_similar(self, other): 182 | # is equal .. so not similar 183 | if self == other: return False 184 | 185 | ot = self.tokenize_request(other) 186 | st = self.tokenize_request(self) 187 | 188 | if len(ot) != len(st): return False 189 | diff = 0 190 | for i in range(0, len(st)): 191 | if st[i] != ot[i]: diff += 1 192 | 193 | if diff > 1: return False 194 | 195 | return True 196 | 197 | def __eq__(self, other): 198 | if other is None: return False 199 | data = self.data 200 | odata = other.data 201 | if self.method == "POST": 202 | data = remove_tokens(data) 203 | odata = remove_tokens(odata) 204 | 205 | return (self.method, self.url, self.type, self.http_auth, data) == ( 206 | other.method, other.url, other.type, other.http_auth, odata) 207 | 208 | def __repr__(self): 209 | return "DEBUG " + self.__str__() 210 | 211 | def __str__(self): 212 | return "%s %s %s %s" % (self.type, self.method, self.get_full_url(), self.data) 213 | -------------------------------------------------------------------------------- /core/lib/http_get.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version.
11 | """ 12 | 13 | from __future__ import unicode_literals 14 | 15 | import base64 16 | import cookielib 17 | import ssl 18 | import time 19 | import urllib2 20 | 21 | import core.lib.thirdparty.pysocks.socks as socks 22 | from core.constants import * 23 | from core.crawl.lib.urlfinder import UrlFinder 24 | from core.lib.cookie import Cookie 25 | from core.lib.exception import RedirectException, NotHtmlException 26 | from core.lib.request import Request 27 | from core.lib.thirdparty.pysocks.sockshandler import SocksiPyHandler 28 | 29 | 30 | class HttpGet: 31 | def __init__(self, request, timeout, retries=None, user_agent=None, proxy=None): 32 | self.request = request 33 | self.timeout = timeout 34 | self.retries = retries if retries else 1 35 | self.proxy = proxy 36 | self.retries_interval = 0.5 37 | self.user_agent = user_agent 38 | 39 | def urllib2_opener(self, request, jar_response, follow_redirect=None): 40 | url = request.url 41 | headers = [] 42 | 43 | class RedirectHandler(urllib2.HTTPRedirectHandler): 44 | def http_error_302(self, req, fp, code, msg, headers): 45 | raise RedirectException(headers['Location']) 46 | 47 | http_error_301 = http_error_303 = http_error_307 = http_error_302 48 | 49 | try: 50 | handlers = [urllib2.HTTPCookieProcessor(jar_response)] 51 | 52 | # SSLContext is available from python 2.7.9 53 | if hasattr(ssl, "SSLContext"): 54 | handlers.append(urllib2.HTTPSHandler(context=ssl.SSLContext(ssl.PROTOCOL_SSLv23))) 55 | 56 | if not follow_redirect: 57 | handlers.append(RedirectHandler) 58 | 59 | if self.proxy: 60 | if self.proxy['proto'] == "socks5": 61 | # dns queries WONT go thru self.proxy .. consider "monkey patching"... 62 | socksh = SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, self.proxy['host'], int(self.proxy['port'])) 63 | handlers.append(socksh) 64 | elif self.proxy['proto'] == "http": 65 | proxy_string = "http://%s:%s" % (self.proxy['host'], self.proxy['port']) 66 | httproxy = urllib2.ProxyHandler({'http': proxy_string, 'https': proxy_string}) 67 | handlers.append(httproxy) 68 | 69 | if self.user_agent: 70 | headers.append(('User-agent', self.user_agent)) 71 | 72 | if request.http_auth: 73 | auths = base64.b64encode(request.http_auth) 74 | headers.append(("Authorization", "Basic %s" % auths)) 75 | 76 | if request.referer: 77 | headers.append(("Referer", request.referer)) 78 | 79 | opener = urllib2.build_opener(*handlers) 80 | opener.addheaders = headers 81 | 82 | return opener 83 | 84 | 85 | except RedirectException as e: 86 | raise 87 | except Exception as e: 88 | print "\n--->" + url + " " + str(e) 89 | raise 90 | 91 | def get_requests(self): # Shared.options['process_timeout'] 92 | 93 | if self.request.method == "POST": 94 | raise Exception("POST method with urllib is not supported yet") 95 | 96 | # parent = self.request.parent.url if self.request.parent else "" 97 | 98 | self.retries_interval = 0.5 99 | 100 | jar_response = cookielib.LWPCookieJar() 101 | jar_request = cookielib.LWPCookieJar() 102 | 103 | html = "" 104 | set_cookie = [] 105 | 106 | requests = [] 107 | 108 | while True: 109 | try: 110 | # Shared.th_lock.acquire() 111 | 112 | for cookie in self.request.cookies: 113 | jar_request.set_cookie(cookie.get_cookielib_cookie()) 114 | 115 | # Shared.th_lock.release() 116 | 117 | opener = self.urllib2_opener(self.request, jar_response) 118 | req = urllib2.Request(url=self.request.url) 119 | jar_request.add_cookie_header(req) 120 | 121 | res = opener.open(req, None, self.timeout) 122 | 123 | for cookie in jar_response: 124 | 
set_cookie.append(Cookie(cookie.__dict__, self.request.url)) 125 | 126 | ctype = res.info()['Content-Type'] 127 | if ctype is not None: 128 | if ctype.lower().split(";")[0] != "text/html": 129 | opener.close() 130 | raise NotHtmlException(ERROR_CONTENTTYPE) 131 | 132 | html = res.read() 133 | opener.close() 134 | 135 | if html: 136 | finder = UrlFinder(html) 137 | try: 138 | urls = finder.get_urls() 139 | except Exception as e: 140 | raise 141 | 142 | for url in urls: 143 | # @TODO handle FORMS 144 | requests.append(Request(REQTYPE_LINK, "GET", url, parent=self.request, set_cookie=set_cookie, 145 | parent_db_id=self.request.db_id)) 146 | 147 | break 148 | 149 | except RedirectException as e: 150 | set_cookie = [] 151 | for cookie in jar_response: 152 | set_cookie.append(Cookie(cookie.__dict__, self.request.url)) 153 | 154 | r = Request(REQTYPE_REDIRECT, "GET", str(e), parent=self.request, set_cookie=set_cookie, 155 | parent_db_id=self.request.db_id) 156 | requests.append(r) 157 | break 158 | except NotHtmlException: 159 | raise 160 | except Exception as e: 161 | self.retries -= 1 162 | if self.retries == 0: raise 163 | time.sleep(self.retries_interval) 164 | 165 | return requests 166 | 167 | def get_file(self): # Shared.options['process_timeout'] 168 | 169 | if self.request.method == "POST": 170 | raise Exception("get_file: POST method with urllib is not supported yet") 171 | 172 | jar_request = cookielib.LWPCookieJar() 173 | 174 | cont = "" 175 | while True: 176 | try: 177 | 178 | for cookie in self.request.cookies: 179 | jar_request.set_cookie(cookie.get_cookielib_cookie()) 180 | 181 | opener = self.urllib2_opener(self.request, None, True) 182 | req = urllib2.Request(url=self.request.url) 183 | jar_request.add_cookie_header(req) 184 | res = opener.open(req, None, self.timeout) 185 | 186 | cont = res.read() 187 | opener.close() 188 | 189 | break 190 | 191 | except Exception as e: 192 | self.retries -= 1 193 | if self.retries == 0: raise 194 | time.sleep(self.retries_interval) 195 | 196 | return cont 197 | -------------------------------------------------------------------------------- /core/crawl/probe/src/utils.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 'use strict'; 3 | 4 | const url = require('url'); 5 | 6 | const ArgsParse = require('../node_modules/argparse').ArgumentParser; 7 | 8 | 9 | exports.getOptionsFromArgs = function() { 10 | 11 | let argumentParser = new ArgsParse(); 12 | 13 | _getArguments(argumentParser); 14 | 15 | let args = argumentParser.parseArgs(); 16 | 17 | return _getOptions(args); 18 | }; 19 | 20 | function _getArguments(argumentParser) { 21 | 22 | let args; 23 | 24 | argumentParser.addArgument( 25 | '-A', 26 | { 27 | help: 'user agent', 28 | dest: 'userAgent', 29 | defaultValue: 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36', 30 | }, 31 | ); 32 | argumentParser.addArgument( 33 | '-R', 34 | { 35 | help: 'random string used to generate random values - the same random string will generate the same random values', 36 | dest: 'random', 37 | defaultValue: 'IsHOulDb34RaNd0MsTR1ngbUt1mN0t', 38 | }, 39 | ); 40 | argumentParser.addArgument( 41 | '-f', 42 | { 43 | help: 'do NOT fill values in forms', 44 | dest: 'fillValues', 45 | defaultValue: true, 46 | nargs: 0, 47 | action: 'storeFalse', 48 | }, 49 | ); 50 | argumentParser.addArgument( 51 | '-t', 52 | { 53 | help: 'do NOT trigger events (onload only)', 54 | dest: 'triggerEvents', 55 | 
defaultValue: true, 56 | nargs: 0, 57 | action: 'storeFalse', 58 | }, 59 | ); 60 | argumentParser.addArgument( 61 | '-X', 62 | { 63 | help: 'comma separated list of excluded urls', 64 | dest: 'excludedUrls', 65 | defaultValue: '', 66 | }, 67 | ); 68 | argumentParser.addArgument( 69 | '-O', 70 | { 71 | help: 'do NOT override timeout functions', 72 | dest: 'overrideTimeoutFunctions', 73 | defaultValue: true, 74 | nargs: 0, 75 | action: 'storeFalse', 76 | }, 77 | ); 78 | argumentParser.addArgument( 79 | '-c', 80 | { 81 | help: 'set cookies (json format)', 82 | dest: 'cookies', 83 | defaultValue: '', 84 | }, 85 | ); 86 | argumentParser.addArgument( 87 | '-r', 88 | { 89 | help: 'url referer', 90 | dest: 'referer', 91 | defaultValue: '', 92 | }, 93 | ); 94 | 95 | argumentParser.addArgument( 96 | '-p', 97 | { 98 | help: 'http auth (user:pass)', 99 | dest: 'httpAuth', 100 | defaultValue: '', 101 | }, 102 | ); 103 | argumentParser.addArgument( 104 | '-P', 105 | { 106 | help: 'load page with POST', 107 | dest: 'sendPOST', 108 | defaultValue: false, 109 | nargs: 0, 110 | action: 'storeTrue', 111 | }, 112 | ); 113 | argumentParser.addArgument( 114 | '-D', 115 | { 116 | help: 'POST data', 117 | dest: 'POSTData', 118 | }, 119 | ); 120 | argumentParser.addArgument( 121 | '--proxy', 122 | { 123 | help: 'Proxy address in format "proxy-scheme://proxy-ip:proxy-port"', 124 | dest: 'proxyAddress', 125 | defaultValue: '', 126 | }, 127 | ); 128 | 129 | argumentParser.addArgument( 130 | '-v', 131 | { 132 | help: 'verbosity level', 133 | dest: 'verbosity', 134 | action: 'count', 135 | defaultValue: 0, 136 | }, 137 | ); 138 | 139 | argumentParser.addArgument( 140 | '--debug', 141 | { 142 | help: 'activate debug mode', 143 | dest: 'debug', 144 | defaultValue: false, 145 | nargs: 0, 146 | action: 'storeTrue', 147 | }, 148 | ); 149 | 150 | argumentParser.addArgument( 151 | 'startUrl', 152 | { 153 | help: 'starting url', 154 | }, 155 | ); 156 | 157 | args = argumentParser.parseArgs(); 158 | 159 | if (!args.startUrl.startsWith('http')) { 160 | argumentParser.error('invalid starting url: "' + args.startUrl + '"'); 161 | } 162 | 163 | return args; 164 | } 165 | 166 | function _getOptions(args) { 167 | let options = {}; 168 | 169 | options.userAgent = args.userAgent; 170 | options.random = args.random; 171 | options.fillValues = args.fillValues; 172 | options.triggerEvents = args.triggerEvents; 173 | options.excludedUrls = args.excludedUrls !== '' ? 
args.excludedUrls.split(',') : []; 174 | options.overrideTimeoutFunctions = args.overrideTimeoutFunctions; 175 | options.verbosity = args.verbosity; 176 | options.debug = args.debug; 177 | 178 | options.inputValues = _generateRandomValues(options.random); 179 | 180 | if (args.cookies !== '') { 181 | options.cookies = JSON.parse(args.cookies); 182 | } else { 183 | options.cookies = []; 184 | } 185 | 186 | if (args.referer !== '') { 187 | options.referer = args.referer; 188 | } 189 | 190 | if (args.httpAuth !== '') { 191 | let a = args.httpAuth.split(':'); 192 | options.httpAuth = { 193 | username: a[0], 194 | password: a[1], 195 | }; 196 | } 197 | 198 | if (args.sendPOST) { 199 | options.sendPOST = args.sendPOST; 200 | options.POSTData = args.POSTData; 201 | } 202 | 203 | if (args.proxyAddress !== '') { 204 | options.proxyAddress = args.proxyAddress; 205 | } 206 | 207 | options.startUrl = url.parse(args.startUrl); 208 | 209 | return options; 210 | 211 | } 212 | 213 | 214 | /** 215 | * generate a static map of random values using a "static" seed for input fields 216 | * the same seed generates the same values 217 | * generated values MUST be the same for all runs of the probe otherwise the same form will look different 218 | * for example if a page sends a form to itself with input=random1, 219 | * the same form on the same page (after first post) will become input=random2 220 | * => form.data1 != form.data2 => form.data2 is considered a different request and it'll be crawled. 221 | * this process will lead to an infinite loop! 222 | * @param seed String 223 | * @return {{}} 224 | * @private 225 | */ 226 | function _generateRandomValues(seed) { 227 | let values = {}, 228 | letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', 229 | numbers = '0123456789', 230 | symbols = '!#&^;.,?%$*', 231 | months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], 232 | years = ['1982', '1989', '1990', '1994', '1995', '1996'], 233 | names = ['james', 'john', 'robert', 'michael', 'william', 'david', 'richard', 'charles', 'joseph', 'thomas', 'christopher', 'daniel', 'paul', 'mark', 'donald', 'george', 'kenneth'], 234 | surnames = ['anderson', 'thomas', 'jackson', 'white', 'harris', 'martin', 'thompson', 'garcia', 'martinez', 'robinson', 'clark', 'rodriguez', 'lewis', 'lee', 'walker', 'hall'], 235 | domains = ['.com', '.org', '.net', '.it', '.tv', '.de', '.fr']; 236 | 237 | let randoms = [], 238 | randoms_i = 0; 239 | 240 | for (let a = 0; a < seed.length; a++) { 241 | randoms.push(seed[a].charCodeAt(0)); 242 | } 243 | 244 | const rand = function(max) { 245 | let i = randoms[randoms_i] % max; 246 | randoms_i = (randoms_i + 1) % randoms.length; 247 | return i; 248 | }; 249 | 250 | const randomizeArray = function(arr, len) { 251 | let r, ret = ''; 252 | for (let a = 0; a < len; a++) { 253 | r = rand(arr.length - 1); 254 | ret += arr[r]; 255 | } 256 | return ret; 257 | }; 258 | 259 | let generators = { 260 | string: function() { 261 | return randomizeArray(letters, 8); 262 | }, 263 | number: function() { 264 | return randomizeArray(numbers, 3); 265 | }, 266 | month: function() { 267 | return randomizeArray(months, 1); 268 | }, 269 | year: function() { 270 | return randomizeArray(years, 1); 271 | }, 272 | date: function() { 273 | return generators.year() + '-' + generators.month() + '-' + generators.month(); 274 | }, 275 | color: function() { 276 | return '#' + randomizeArray(numbers, 6); 277 | }, 278 | week: function() { 279 | return generators.year() + '-W' +
randomizeArray(months.slice(0, 6), 1); 280 | }, 281 | time: function() { 282 | return generators.month() + ':' + generators.month(); 283 | }, 284 | datetimeLocal: function() { 285 | return generators.date() + 'T' + generators.time(); 286 | }, 287 | domain: function() { 288 | return randomizeArray(letters, 12) 289 | .toLowerCase() + randomizeArray(domains, 1); 290 | }, 291 | email: function() { 292 | return randomizeArray(names, 1) + '.' + generators.surname() + '@' + generators.domain(); 293 | }, 294 | url: function() { 295 | return 'http://www.' + generators.domain(); 296 | }, 297 | humandate: function() { 298 | return generators.month() + '/' + generators.month() + '/' + generators.year(); 299 | }, 300 | password: function() { 301 | return randomizeArray(letters, 3) + randomizeArray(symbols, 1) + randomizeArray(letters, 2) + randomizeArray(numbers, 3) + randomizeArray(symbols, 2); 302 | }, 303 | surname: function() { 304 | return randomizeArray(surnames, 1); 305 | }, 306 | firstname: function() { 307 | return randomizeArray(names, 1); 308 | }, 309 | tel: function() { 310 | return '+' + randomizeArray(numbers, 1) + ' ' + randomizeArray(numbers, 10); 311 | }, 312 | }; 313 | 314 | for (let type in generators) { 315 | values[type] = generators[type](); 316 | } 317 | 318 | return values; 319 | 320 | } 321 | 322 | 323 | })(); 324 | -------------------------------------------------------------------------------- /core/crawl/probe/src/page-handler.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 'use strict'; 3 | 4 | const EventEmitter = require('events'); 5 | 6 | const logger = require('../logger').debug; 7 | const probe = require('./probe'); 8 | 9 | /** 10 | * 11 | * @param {Puppeteer} puppeteer 12 | * @param {String} proxy - in format: `hostname:port` 13 | * @param {boolean} debug - activate debug mode 14 | * @return {Promise.|*} 15 | */ 16 | exports.getBrowserAndPage = function(puppeteer, proxy, debug) { 17 | let browserArgs = [ 18 | '--no-sandbox', // in docker 19 | '--disable-setuid-sandbox', // in docker 20 | '--disable-gpu', // headless 21 | '--hide-scrollbars', // headless 22 | '--mute-audio', // headless 23 | '--ignore-certificate-errors', // no security 24 | '--ignore-certificate-errors-spki-list ', // no security 25 | '--ssl-version-max=tls1.3', // no security 26 | '--ssl-version-min=tls1', // no security 27 | '--disable-web-security', // no security 28 | '--allow-running-insecure-content', // no security 29 | `--load-extension=${__dirname}/../chrome_extension/`, // load extension 30 | `--disable-extensions-except=${__dirname}/../chrome_extension/`, // load extension 31 | ]; 32 | 33 | 34 | if (proxy) { 35 | browserArgs.push(`--proxy-server=${proxy}`); 36 | } 37 | 38 | let launchParams = { 39 | headless: false, 40 | ignoreHTTPSErrors: true, 41 | args: browserArgs, 42 | }; 43 | 44 | if (debug) { 45 | launchParams['dumpio'] = true; 46 | launchParams['devtools'] = true; 47 | } 48 | 49 | return puppeteer.launch(launchParams) 50 | .then(createdBrowser => { 51 | return createdBrowser.newPage() 52 | .then(createdPage => { 53 | return [createdBrowser, createdPage]; 54 | }); 55 | }); 56 | }; 57 | 58 | class Handler extends EventEmitter { 59 | /** 60 | * @constructor 61 | */ 62 | constructor(page, constants, options) { 63 | super(); 64 | this._page = page; 65 | this._constants = constants; 66 | this._options = options; 67 | this._lastRedirectResponse = undefined; 68 | this._reformatFirstRequest = (options.referer || options.sendPOST); 69 
| } 70 | 71 | initialize() { 72 | this._page.on('request', interceptedRequest => { 73 | if (this._options.verbosity >= 2) { 74 | logger.debug(`intercepted request: ${interceptedRequest.resourceType()} ${interceptedRequest.url()}`); 75 | } 76 | 77 | // block image loading 78 | if (interceptedRequest.resourceType() === 'image') { 79 | if (this._options.verbosity >= 2) { 80 | logger.debug(`abort request: ${interceptedRequest.resourceType()} ${interceptedRequest.url()}`); 81 | } 82 | interceptedRequest.abort(); 83 | 84 | // Block redirect 85 | // Since no option exist in puppeteer, this is the workaround proposed here: 86 | // https://github.com/GoogleChrome/puppeteer/issues/1132#issuecomment-339420642 87 | } else if (this._lastRedirectResponse && this._lastRedirectResponse.headers().location === interceptedRequest.url()) { 88 | this.getCookies() 89 | .then(cookies => { 90 | 91 | let cookiesResult = ['cookies', cookies], 92 | status = {'status': 'ok', 'redirect': interceptedRequest.url()}; 93 | this.emit(Handler.Events.ProbeResult, cookiesResult); 94 | this.emit(Handler.Events.Finished, 0, status); 95 | 96 | if (this._options.verbosity >= 3) { 97 | logger.debug(`abort request: ${interceptedRequest.resourceType()} ${interceptedRequest.url()}`); 98 | } 99 | 100 | interceptedRequest.abort(); 101 | }); 102 | // Set the first request as POST or/and Headers 103 | // Since the feature is missing, handling it here. 104 | // https://github.com/GoogleChrome/puppeteer/issues/1062 105 | } else if (this._reformatFirstRequest) { 106 | 107 | let overrides = {headers: interceptedRequest.headers()}; 108 | 109 | if (this._options.sendPOST) { 110 | overrides.method = 'POST'; 111 | overrides.postData = this._options.POSTData || undefined; 112 | } 113 | 114 | if (this._options.referer) { 115 | overrides.headers['Referer'] = this._options.referer; 116 | } 117 | 118 | if (this._options.verbosity >= 2) { 119 | logger.debug(`accept request with overrides: ${interceptedRequest.resourceType()} ${interceptedRequest.url()}`); 120 | } 121 | 122 | interceptedRequest.continue(overrides) 123 | .then(() => { 124 | this._reformatFirstRequest = false; 125 | }); 126 | 127 | } else { 128 | 129 | if (this._options.verbosity >= 2) { 130 | logger.debug(`accept request: ${interceptedRequest.resourceType()} ${interceptedRequest.url()}`); 131 | } 132 | interceptedRequest.continue(); 133 | } 134 | 135 | }); 136 | 137 | this._page.on('response', response => { 138 | if (_isRedirect(response)) { 139 | this._lastRedirectResponse = response; 140 | } 141 | }); 142 | 143 | this._page.on('dialog', dialog => { 144 | if (this._options.verbosity >= 3) { 145 | logger.debug(`Page dialog, type "${dialog.type()}": "${dialog.message()}"`); 146 | } 147 | dialog.accept(); 148 | }); 149 | 150 | this._page.on('error', error => { 151 | if (this._options.verbosity >= 1) { 152 | logger.error(`Page crash: "${error.code}", "${error.message()}"`); 153 | } 154 | let status = {'status': 'error', 'code': 'pageCrash', 'message': `Page crash with: "${error.code}", "${error.message()}"`}; 155 | this.emit(Handler.Events.Finished, 1, status); 156 | }); 157 | 158 | this._page.on('framenavigated', frameTo => { 159 | if (this._options.verbosity >= 2) { 160 | logger.debug(`framenavigated to ${frameTo.url()}`); 161 | } 162 | }); 163 | 164 | this._page.on('console', consoleMessage => { 165 | if (this._options.verbosity >= 1) { 166 | if (['error', 'warning', 'trace'].includes(consoleMessage.type())) { 167 | logger.warn(`Page console error message : "${consoleMessage.text()}"`); 
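// (the branches below map console message type to a minimum verbosity level:
// error/warning/trace -> 1, info -> 2, log -> 3, any other type -> 4)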
168 | } else if (consoleMessage.type() === 'info' && this._options.verbosity >= 2) { 169 | logger.info(`Page console message : ${consoleMessage.text()}`); 170 | } else if (consoleMessage.type() === 'log' && this._options.verbosity >= 3) { 171 | logger.debug(`Page console message : "${consoleMessage.text()}"`); 172 | } else if (this._options.verbosity >= 4) { 173 | logger.debug(`Page console message, type ${consoleMessage.type()} : "${consoleMessage.text()}"`); 174 | } 175 | } 176 | }); 177 | 178 | this._page.on('frameattached', frameTo => { 179 | if (this._options.verbosity >= 2) { 180 | logger.debug(`frameattached to ${frameTo.url()}`); 181 | } 182 | }); 183 | 184 | this._page.on('requestfailed', failedRequest => { 185 | if (this._options.verbosity >= 2) { 186 | logger.debug(`requestfailed: ${failedRequest.url()}`); 187 | } 188 | }); 189 | 190 | this._page.on('requestfinished', finishedRequest => { 191 | if (this._options.verbosity >= 2) { 192 | logger.debug(`requestfinished: ${finishedRequest.response() 193 | .status()}, ${finishedRequest.method} ${finishedRequest.url()}`); 194 | } 195 | }); 196 | 197 | this._page.on('load', () => { 198 | if (this._options.verbosity >= 1) { 199 | logger.info('load done'); 200 | } 201 | }); 202 | 203 | 204 | // set function to return value from probe 205 | this._page.exposeFunction('__PROBE_FN_RETURN_REQUEST__', (request) => { 206 | if (this._options.verbosity >= 2) { 207 | logger.info(`Found request: ${JSON.stringify(request[1])}`); 208 | } 209 | this.emit(Handler.Events.ProbeResult, request); 210 | }); 211 | 212 | // set function to request end from probe 213 | this._page.exposeFunction('__PROBE_FN_REQUEST_END__', () => { 214 | if (this._options.verbosity >= 1) { 215 | logger.info('Probe finished'); 216 | } 217 | let status = {'status': 'ok'}; 218 | this.emit(Handler.Events.Finished, 0, status); 219 | }); 220 | 221 | return Promise.all([ 222 | this._page.setUserAgent(this._options.userAgent), 223 | this._page.setCookie(...this._options.cookies), 224 | this._page.setViewport(this._constants.viewport), 225 | this._page.setRequestInterception(true), 226 | this._page.authenticate(this._options.httpAuth), 227 | ]) 228 | .then(() => { 229 | this._setProbe(); 230 | return this._page; 231 | }); 232 | } 233 | 234 | _setProbe() { 235 | // on every new document, initializing the probe into the page context 236 | this._page.evaluateOnNewDocument(probe.setProbe, ...[this._options, this._constants]); 237 | } 238 | 239 | startProbe() { 240 | this._page.evaluate(() => { 241 | window.__PROBE__.startAnalysis(); 242 | }); 243 | } 244 | 245 | /** 246 | * @return {Promise|Cookie} 247 | */ 248 | getCookies() { 249 | return this._page.cookies(); 250 | } 251 | } 252 | 253 | Handler.Events = { 254 | Finished: 'finished', 255 | ProbeResult: 'probeResult', 256 | }; 257 | 258 | function _isRedirect(response) { 259 | return [301, 302, 303, 307, 308].includes(response.status()) && response.request() 260 | .resourceType() === 'document'; 261 | } 262 | 263 | exports.Handler = Handler; 264 | 265 | })(); 266 | -------------------------------------------------------------------------------- /core/util/utilities/htmlreport/style.css: -------------------------------------------------------------------------------- 1 | body{ 2 | font-family:Helvetica; 3 | margin:0; 4 | padding:0; 5 | overflow:auto; 6 | } 7 | html{ 8 | height:100%; 9 | } 10 | a{ 11 | text-decoration:none; 12 | outline: 0; 13 | color:#000; 14 | } 15 | a:active{ 16 | color:#000; 17 | } 18 | hr{ 19 | border: 0; 20 | height: 
1px; 21 | background-image: linear-gradient(to right, #000000 1%,#ffffff 100%); 22 | } 23 | label{ 24 | cursor:pointer; 25 | } 26 | /*.row{ 27 | 28 | 29 | }*/ 30 | 31 | 32 | .accordion{ 33 | 34 | } 35 | 36 | .accordion-closed{ 37 | height:0px; 38 | overflow:hidden; 39 | } 40 | 41 | 42 | /*.accordion-open{ 43 | overflow:visible; 44 | }*/ 45 | 46 | 47 | 48 | .parent-url{ 49 | margin:0; 50 | color:gray; 51 | font-size:11px; 52 | } 53 | 54 | .url-post-data, .result-post-data{ 55 | margin:0; 56 | color:#5A5A60; 57 | font-size:12px; 58 | } 59 | 60 | .mainAccordion{ 61 | margin:0px 0px 0px 20px; 62 | border-left:1px dotted #047288; 63 | padding-left:6px; 64 | } 65 | 66 | .result-accordion-hdr{ 67 | margin:3px 3px 3px -3px; 68 | font-weight:bold; 69 | cursor:pointer; 70 | } 71 | 72 | 73 | .result-accordion-hdr.hdr-accordion-open:before{ 74 | line-height:24px; 75 | color:gray; 76 | font-size:10px; 77 | content: "▼ "; 78 | } 79 | 80 | .result-accordion-hdr.hdr-accordion-closed:before{ 81 | line-height:24px; 82 | color:gray; 83 | font-size:10px; 84 | content: "► "; 85 | } 86 | 87 | /* must have margin and padding = 0 to be collapsed on scroll */ 88 | #collapse_top{ 89 | position:relative; 90 | overflow: hidden; 91 | padding:0px; 92 | margin:0px; 93 | } 94 | 95 | #top_header{ 96 | margin:10px 0 0 10px; 97 | } 98 | 99 | #title{ 100 | margin:0 0 10px 0; 101 | padding:0; 102 | font-weight:bold; 103 | } 104 | 105 | #top{ 106 | position:fixed; 107 | background-color:#fff; 108 | z-index:777; 109 | top:0; 110 | left:0; 111 | margin:0; 112 | min-width:900px; 113 | width:100%; 114 | } 115 | #top hr{ 116 | max-width:1200px; 117 | margin:0; 118 | } 119 | 120 | #filtersbox{ 121 | margin:10px 0 0 10px; 122 | } 123 | 124 | .hidden{ display: none !important} 125 | 126 | .icon { 127 | display: inline-block; 128 | 129 | background-color:#047288; 130 | padding:4px; 131 | padding-top:2px; 132 | padding-bottom:2px; 133 | border-radius:2px; 134 | color:#fff; 135 | margin-right:4px; 136 | margin-left:1px; 137 | cursor:default; 138 | font-size:10px; 139 | } 140 | 141 | section{ 142 | margin:0px 0px 10px 10px; 143 | padding:0; 144 | white-space: nowrap; 145 | } 146 | 147 | section.marked{ 148 | border-left:4px solid #00F500; 149 | margin-left:6px; 150 | } 151 | 152 | section.marked .mark-button:before{ 153 | content:"un"; 154 | } 155 | 156 | .icon.icon-filtered{ 157 | opacity:0.5; 158 | } 159 | .open-new-win{ 160 | display:inline-block; 161 | background-image: 
url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAKQWlDQ1BJQ0MgUHJvZmlsZQAASA2dlndUU9kWh8+9N73QEiIgJfQaegkg0jtIFQRRiUmAUAKGhCZ2RAVGFBEpVmRUwAFHhyJjRRQLg4Ji1wnyEFDGwVFEReXdjGsJ7601896a/cdZ39nnt9fZZ+9917oAUPyCBMJ0WAGANKFYFO7rwVwSE8vE9wIYEAEOWAHA4WZmBEf4RALU/L09mZmoSMaz9u4ugGS72yy/UCZz1v9/kSI3QyQGAApF1TY8fiYX5QKUU7PFGTL/BMr0lSkyhjEyFqEJoqwi48SvbPan5iu7yZiXJuShGlnOGbw0noy7UN6aJeGjjAShXJgl4GejfAdlvVRJmgDl9yjT0/icTAAwFJlfzOcmoWyJMkUUGe6J8gIACJTEObxyDov5OWieAHimZ+SKBIlJYqYR15hp5ejIZvrxs1P5YjErlMNN4Yh4TM/0tAyOMBeAr2+WRQElWW2ZaJHtrRzt7VnW5mj5v9nfHn5T/T3IevtV8Sbsz55BjJ5Z32zsrC+9FgD2JFqbHbO+lVUAtG0GQOXhrE/vIADyBQC03pzzHoZsXpLE4gwnC4vs7GxzAZ9rLivoN/ufgm/Kv4Y595nL7vtWO6YXP4EjSRUzZUXlpqemS0TMzAwOl89k/fcQ/+PAOWnNycMsnJ/AF/GF6FVR6JQJhIlou4U8gViQLmQKhH/V4X8YNicHGX6daxRodV8AfYU5ULhJB8hvPQBDIwMkbj96An3rWxAxCsi+vGitka9zjzJ6/uf6Hwtcim7hTEEiU+b2DI9kciWiLBmj34RswQISkAd0oAo0gS4wAixgDRyAM3AD3iAAhIBIEAOWAy5IAmlABLJBPtgACkEx2AF2g2pwANSBetAEToI2cAZcBFfADXALDIBHQAqGwUswAd6BaQiC8BAVokGqkBakD5lC1hAbWgh5Q0FQOBQDxUOJkBCSQPnQJqgYKoOqoUNQPfQjdBq6CF2D+qAH0CA0Bv0BfYQRmALTYQ3YALaA2bA7HAhHwsvgRHgVnAcXwNvhSrgWPg63whfhG/AALIVfwpMIQMgIA9FGWAgb8URCkFgkAREha5EipAKpRZqQDqQbuY1IkXHkAwaHoWGYGBbGGeOHWYzhYlZh1mJKMNWYY5hWTBfmNmYQM4H5gqVi1bGmWCesP3YJNhGbjS3EVmCPYFuwl7ED2GHsOxwOx8AZ4hxwfrgYXDJuNa4Etw/XjLuA68MN4SbxeLwq3hTvgg/Bc/BifCG+Cn8cfx7fjx/GvyeQCVoEa4IPIZYgJGwkVBAaCOcI/YQRwjRRgahPdCKGEHnEXGIpsY7YQbxJHCZOkxRJhiQXUiQpmbSBVElqIl0mPSa9IZPJOmRHchhZQF5PriSfIF8lD5I/UJQoJhRPShxFQtlOOUq5QHlAeUOlUg2obtRYqpi6nVpPvUR9Sn0vR5Mzl/OX48mtk6uRa5Xrl3slT5TXl3eXXy6fJ18hf0r+pvy4AlHBQMFTgaOwVqFG4bTCPYVJRZqilWKIYppiiWKD4jXFUSW8koGStxJPqUDpsNIlpSEaQtOledK4tE20Otpl2jAdRzek+9OT6cX0H+i99AllJWVb5SjlHOUa5bPKUgbCMGD4M1IZpYyTjLuMj/M05rnP48/bNq9pXv+8KZX5Km4qfJUilWaVAZWPqkxVb9UU1Z2qbapP1DBqJmphatlq+9Uuq43Pp893ns+dXzT/5PyH6rC6iXq4+mr1w+o96pMamhq+GhkaVRqXNMY1GZpumsma5ZrnNMe0aFoLtQRa5VrntV4wlZnuzFRmJbOLOaGtru2nLdE+pN2rPa1jqLNYZ6NOs84TXZIuWzdBt1y3U3dCT0svWC9fr1HvoT5Rn62fpL9Hv1t/ysDQINpgi0GbwaihiqG/YZ5ho+FjI6qRq9Eqo1qjO8Y4Y7ZxivE+41smsImdSZJJjclNU9jU3lRgus+0zwxr5mgmNKs1u8eisNxZWaxG1qA5wzzIfKN5m/krCz2LWIudFt0WXyztLFMt6ywfWSlZBVhttOqw+sPaxJprXWN9x4Zq42Ozzqbd5rWtqS3fdr/tfTuaXbDdFrtOu8/2DvYi+yb7MQc9h3iHvQ732HR2KLuEfdUR6+jhuM7xjOMHJ3snsdNJp9+dWc4pzg3OowsMF/AX1C0YctFx4bgccpEuZC6MX3hwodRV25XjWuv6zE3Xjed2xG3E3dg92f24+ysPSw+RR4vHlKeT5xrPC16Il69XkVevt5L3Yu9q76c+Oj6JPo0+E752vqt9L/hh/QL9dvrd89fw5/rX+08EOASsCegKpARGBFYHPgsyCRIFdQTDwQHBu4IfL9JfJFzUFgJC/EN2hTwJNQxdFfpzGC4sNKwm7Hm4VXh+eHcELWJFREPEu0iPyNLIR4uNFksWd0bJR8VF1UdNRXtFl0VLl1gsWbPkRoxajCCmPRYfGxV7JHZyqffS3UuH4+ziCuPuLjNclrPs2nK15anLz66QX8FZcSoeGx8d3xD/iRPCqeVMrvRfuXflBNeTu4f7kufGK+eN8V34ZfyRBJeEsoTRRJfEXYljSa5JFUnjAk9BteB1sl/ygeSplJCUoykzqdGpzWmEtPi000IlYYqwK10zPSe9L8M0ozBDuspp1e5VE6JA0ZFMKHNZZruYjv5M9UiMJJslg1kLs2qy3mdHZZ/KUcwR5vTkmuRuyx3J88n7fjVmNXd1Z752/ob8wTXuaw6thdauXNu5Tnddwbrh9b7rj20gbUjZ8MtGy41lG99uit7UUaBRsL5gaLPv5sZCuUJR4b0tzlsObMVsFWzt3WazrWrblyJe0fViy+KK4k8l3JLr31l9V/ndzPaE7b2l9qX7d+B2CHfc3em681iZYlle2dCu4F2t5czyovK3u1fsvlZhW3FgD2mPZI+0MqiyvUqvakfVp+qk6oEaj5rmvep7t+2d2sfb17/fbX/TAY0DxQc+HhQcvH/I91BrrUFtxWHc4azDz+ui6rq/Z39ff0TtSPGRz0eFR6XHwo911TvU1zeoN5Q2wo2SxrHjccdv/eD1Q3sTq+lQM6O5+AQ4ITnx4sf4H++eDDzZeYp9qukn/Z/2ttBailqh1tzWibakNml7THvf6YDTnR3OHS0/m/989Iz2mZqzymdLz5HOFZybOZ93fvJCxoXxi4kXhzpXdD66tOTSna6wrt7LgZevXvG5cqnbvfv8VZerZ645XTt9nX297Yb9jdYeu56WX+x+aem172296XCz/ZbjrY6+BX3n+l37L972un3ljv+dGwOLBvruLr57/17cPel93v3RB6kPXj/Mejj9aP1j7OOiJwpPKp6qP6391fjXZqm99Oyg12DPs4hnj4a4Qy//lfmvT8MFz6nPK0a0RupHrUfPjPmM3Xqx9MXwy4yX0+OFvyn+tveV0auffnf7vWdiycTwa9HrmT9K3qi+OfrW9m3nZOjk03dp76anit6rvj/2gf2h+2P0x5Hp7E/4T
5WfjT93fAn88ngmbWbm3/eE8/syOll+AAAACXBIWXMAAAsTAAALEwEAmpwYAAABh0lEQVQ4EY2Svy4EURTGZ+6dRbIRRKFR0NoHEI3EU3gExYbEJhIKBQ1WIzyGKLReQisKofQAbIKZ6/ft3DNmk5nESb4595zv/LvnTpokSQo8aJIiOqV7zrlt9A8YAV8UxV3GIUQnqlE6eAvv/Tr6OITwnqbpkiIp2FX3OcgtdH0KFe3kef6IfgYOyBdIOqTAGeeEYieqskOB0IQsy64UiMzoQ2xfcegheAWXcg5w6l49sAxWwUrUXbS6K24vJp/L5jwCNyL2OXzh0z4ahZjdmHxhAFj6TLhhE3xDzEdShbQPLU/8QUweykamwN++CNAVVGBWLKLFChIP/wZ/XZqVX6YauLaxtXFJzluvoT/GVlnwCN8t9hNw4wVFsk0p2UZe4AlPmWgzBvv/FNB1bCLl6a/U0iWhfgW7t+kypPyaT1pNzZ54ujxm2P8fzbEyznTFaQKNJ8KmMV8VxEFd9VLiJhqYY5rFvEBqtGo8znVRI7uC7STNeJJ73noRUj9IW3K90Cc5D9GR/wJQ+1+DjtkA4QAAAABJRU5ErkJggg==); 162 | width:12px; 163 | height:12px; 164 | margin-left:6px; 165 | margin-right:2px; 166 | background-size: 100% auto; 167 | } 168 | 169 | #urlhider, #reshider{ 170 | width:300px; 171 | height:40px; 172 | } 173 | 174 | 175 | .modal-bar{ 176 | background-color:#E1EBF7; 177 | padding:3px; 178 | margin:0 0 6px 0; 179 | border:1px solid #fff; 180 | border-bottom:1px solid #C6DCF5; 181 | } 182 | 183 | .modal-content{ 184 | position:relative; 185 | margin:0; 186 | padding:0; 187 | height:calc(100% - 40px); 188 | width:100%; 189 | overflow:auto; 190 | } 191 | 192 | 193 | /*#trash section, #marked section{ 194 | display:block !important; 195 | } 196 | */ 197 | 198 | .modal{ 199 | position:fixed; 200 | top:10px; 201 | left:10px; 202 | border:1px solid #047288; 203 | background-color:#fff; 204 | width:calc(100% - 20px); 205 | height:calc(100% - 20px); 206 | overflow:auto; 207 | z-index:888; 208 | box-shadow: 5px 5px 2px #ddd; 209 | } 210 | 211 | #buttons{ 212 | position:relative; 213 | margin:10px 0 5px 10px; 214 | padding:0; 215 | } 216 | 217 | .button,.button-spacer{ 218 | display:inline-block; 219 | height:16px; 220 | padding:0 4px 1px 4px; 221 | font-size:13px; 222 | -webkit-user-select: none; 223 | -moz-user-select: none; 224 | -ms-user-select: none; 225 | } 226 | 227 | 228 | .button-spacer{ 229 | border-left:1px dotted gray; 230 | width:1px; 231 | } 232 | 233 | .button{ 234 | border:1px solid gray; 235 | min-width:60px; 236 | cursor:pointer; 237 | text-align:center; 238 | margin-right:10px; 239 | } 240 | 241 | .button:active{ 242 | opacity:0.6; 243 | } 244 | 245 | /*#save_status{ 246 | display:inline-block; 247 | padding:0; 248 | margin:0; 249 | } 250 | #save_status input{ 251 | display:inline-block; 252 | height:16px; 253 | padding:0 4px 1px 4px; 254 | border: 1px dashed gray; 255 | } 256 | */ 257 | .result-button{ 258 | margin-top:10px; 259 | } 260 | 261 | /*@keyframes button-fadein { 262 | from {opacity:0.5;} 263 | to {opacity:1;} 264 | }*/ 265 | 266 | .button-delayed{ 267 | 268 | -webkit-transition: opacity 0.6s ease-in; 269 | -moz-transition: opacity 0.6s ease-in; 270 | -ms-transition: opacity 0.6s ease-in; 271 | -o-transition: opacity 0.6s ease-in; 272 | transition: opacity 0.6s ease-in; 273 | opacity:0.5; 274 | } 275 | .button-delayed:hover{ 276 | opacity:1; 277 | 278 | } 279 | /*#trash .trash-button{ 280 | opacity:1; 281 | } 282 | */ 283 | #trash .trash-button:before{ 284 | content:"un"; 285 | } 286 | 287 | 288 | #trash-close, #marked-close, #outofscope-close, #vulnerability-close, #nonhtml-close, #notes-close{ 289 | position:absolute; 290 | right:10px; 291 | top:0px; 292 | font-size:22px; 293 | cursor:pointer; 294 | } 295 | #trash-close:hover, #marked-close:hover, #outofscope-close:hover, #vulnerability-close:hover, #nonhtml-close:hover, #notes-close:hover{ 296 | color:#F04158; 297 | } 298 | 299 | 300 | .resbuttons{ 301 | margin-top:10px; 302 | } 303 | 304 | #infos{ 305 | 306 | } 307 | .addregexp{ 308 | 
margin:2px 0 0 0; 309 | font-size:12px; 310 | cursor:pointer; 311 | text-decoration:underline; 312 | } 313 | 314 | .url{ 315 | cursor:pointer; 316 | display:inline-block; 317 | vertical-align: bottom; 318 | 319 | 320 | white-space:nowrap; 321 | 322 | padding:0; 323 | margin:0; 324 | 325 | /*border-right-width: 13px; 326 | border-right-style: solid; 327 | border-image-slice: 1; 328 | border-image:linear-gradient(to right, rgba(255,255,255,0) 0%,rgba(0,0,0,1) 100%);;*/ 329 | } 330 | 331 | #error_container{ 332 | font-size:12px; 333 | color:red; 334 | } 335 | 336 | .url-error{ 337 | color:red; 338 | } 339 | 340 | .url-outofscope{ 341 | color:gray; 342 | } 343 | 344 | 345 | .icon-hidden{ 346 | opacity:0.05 !important; 347 | } 348 | 349 | .result{ 350 | font-size:14px; 351 | } 352 | 353 | 354 | #outofscope p{ 355 | margin: 2px 0 6px 4px; 356 | } 357 | #outofscope p .parent-url{ 358 | padding-left:6px; 359 | } 360 | 361 | #nonhtml p{ 362 | margin: 2px 0 6px 4px; 363 | } 364 | #nonhtml p .parent-url{ 365 | padding-left:6px; 366 | } 367 | 368 | .result-counter{ 369 | display: inline-block; 370 | font-weight:normal; 371 | background-color:#fff; 372 | margin:0; 373 | font-size:9px; 374 | margin:0px 0 -2px -1px; 375 | border:1px solid #047288; 376 | border-radius:6px; 377 | padding:0px 2px 0px 2px; 378 | vertical-align: bottom; 379 | } 380 | 381 | .results{ 382 | margin-bottom:8px; 383 | } 384 | 385 | /*.accordion-open + p{ 386 | color:red; 387 | }*/ 388 | 389 | #vulnerability pre{ 390 | padding:5px; 391 | } 392 | 393 | .vuln-name{ 394 | cursor:pointer; 395 | } 396 | 397 | 398 | span.trigger{ 399 | display: inline-block; 400 | margin:0 4px 0 0; 401 | padding: 1px; 402 | 403 | min-width:12px; 404 | height:12px; 405 | cursor:pointer; 406 | text-align:center; 407 | font-size:13px; 408 | line-height:14px; 409 | color:#02582F; 410 | } 411 | 412 | span.trigger.empty{ 413 | opacity:.1; 414 | cursor:default; 415 | } 416 | 417 | 418 | 419 | #notes textarea{ 420 | display:block; 421 | margin:0 auto; 422 | width:95%; 423 | height:100%; 424 | font-size:14px; 425 | padding:10px; 426 | } -------------------------------------------------------------------------------- /core/lib/database.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | import json 13 | import sqlite3 14 | 15 | from core.lib.request import Request 16 | 17 | 18 | class Database: 19 | def __init__(self, dbname): 20 | """ 21 | constructor 22 | 23 | :param dbname: name of the database 24 | """ 25 | self.dbname = dbname 26 | self.conn = None 27 | 28 | def __str__(self): 29 | return self.dbname 30 | 31 | def connect(self): 32 | """ 33 | open connection 34 | """ 35 | self.conn = sqlite3.connect(self.dbname) 36 | self.conn.row_factory = sqlite3.Row 37 | 38 | def close(self): 39 | """ 40 | close connection 41 | """ 42 | self.conn.close() 43 | 44 | def begin(self): 45 | """ 46 | send a "BEGIN TRANSACTION" command 47 | """ 48 | self.conn.isolation_level = None 49 | self.conn.execute(_BEGIN_TRANSACTION_QUERY) 50 | 51 | def commit(self): 52 | """ 53 | commit transaction(s) to the current database 54 | """ 55 | self.conn.commit() 56 | 57 | def initialize(self): 58 | """ 59 | connect, create the base structure, then close the connection 60 | """ 61 | 62 | self.connect() 63 | 64 | cur = self.conn.cursor() 65 | cur.execute(_CREATE_CRAWL_INFO_TABLE_QUERY) 66 | cur.execute(_CREATE_REQUEST_TABLE_QUERY) 67 | cur.execute(_CREATE_REQUEST_INDEX_QUERY) 68 | cur.execute(_CREATE_REQUEST_CHILD_TABLE_QUERY) 69 | cur.execute(_CREATE_REQUEST_CHILD_INDEX_QUERY) 70 | cur.execute(_CREATE_ASSESSMENT_TABLE_QUERY) 71 | cur.execute(_CREATE_VULNERABILITY_TABLE_QUERY) 72 | 73 | self.commit() 74 | self.close() 75 | 76 | def save_crawl_info(self, 77 | htcap_version=None, target=None, start_date=None, commandline=None, 78 | user_agent=None, start_cookies=None): 79 | """ 80 | connect, save the provided crawl info then close the connection 81 | 82 | :param start_cookies: start cookies provided by the user (None means no cookies) 83 | :param htcap_version: version of the running instance of htcap 84 | :param target: start url of the crawl 85 | :param start_date: start date of the crawl 86 | :param commandline: parameter given to htcap for the crawl 87 | :param user_agent: user defined agent 88 | :return: the id of the crawl 89 | """ 90 | values = [htcap_version, target, start_date, commandline, user_agent, 91 | json.dumps([c.get_dict() for c in (start_cookies or [])])] 92 | 93 | insert_query = "INSERT INTO crawl_info (htcap_version,target,start_date,commandline,user_agent,start_cookies) VALUES (?,?,?,?,?,?)" 94 | 95 | self.connect() 96 | cur = self.conn.cursor() 97 | cur.execute(insert_query, values) 98 | cur.execute("SELECT last_insert_rowid() AS id") # retrieve its id 99 | crawl_id = cur.fetchone()['id'] 100 | self.commit() 101 | self.close() 102 | 103 | return crawl_id 104 | 105 | def update_crawl_info(self, crawl_id, crawl_end_date, random_seed, end_cookies): 106 | """ 107 | connect, save the end date, random seed and end cookies, then close the connection 108 | :param crawl_id: 109 | :param crawl_end_date: 110 | :param random_seed: 111 | :param end_cookies: 112 | """ 113 | update_crawl_query = "UPDATE crawl_info SET end_date = ?, random_seed = ?, end_cookies = ? WHERE rowid = ?" 114 | 115 | self.connect() 116 | cur = self.conn.cursor() 117 | cur.execute(update_crawl_query, 118 | [crawl_end_date, random_seed, json.dumps([c.get_dict() for c in end_cookies]), crawl_id]) 119 | self.commit() 120 | self.close() 121 | 122 | def save_request(self, request): 123 | """ 124 | save the given request (do NOT open or close the connection) 125 | 126 | if it is a new request (does not exist in the db), it is inserted. 
127 | if it has a parent request, it is bound to it 128 | 129 | :param request: request to be saved 130 | """ 131 | 132 | insert_values = ( 133 | request.parent_db_id, 134 | request.type, 135 | request.method, 136 | request.url, 137 | request.referer, 138 | request.redirects, 139 | request.data, 140 | json.dumps([r.get_dict() for r in request.cookies]), 141 | request.http_auth if request.http_auth else "", 142 | 1 if request.out_of_scope else 0, 143 | json.dumps(request.trigger) if request.trigger else "", 144 | json.dumps(request.user_output) if len(request.user_output) > 0 else "" 145 | ) 146 | insert_query = "INSERT INTO request (id_parent, type, method, url, referer, redirects, data, cookies, http_auth, out_of_scope, trigger, user_output) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)" 147 | 148 | # referer and cookies are ignored when matching existing requests.. correct? 149 | select_values = ( 150 | request.type, 151 | request.method, 152 | request.url, 153 | request.http_auth if request.http_auth else "", 154 | request.data, 155 | json.dumps(request.trigger) if request.trigger else "" 156 | ) 157 | 158 | # include trigger in query to save the same request with different triggers 159 | # (normally requests are compared using type,method,url and data only) 160 | select_query = "SELECT * FROM request WHERE type=? AND method=? AND url=? AND http_auth=? AND data=? AND trigger=?" 161 | 162 | cur = self.conn.cursor() 163 | cur.execute(select_query, select_values) 164 | existing_req = cur.fetchone() 165 | 166 | if not existing_req: # if no existing request 167 | cur.execute(insert_query, insert_values) # insert the new request 168 | cur.execute("SELECT last_insert_rowid() AS id") # retrieve its id 169 | request.db_id = cur.fetchone()['id'] # complete the request with the db_id 170 | else: 171 | request.db_id = existing_req['id'] # set the db_id for the request 172 | 173 | req_id = request.db_id 174 | 175 | # set the parent-child relationships 176 | if request.parent_db_id: 177 | qry_child = "INSERT INTO request_child (id_request, id_child) VALUES (?,?)" 178 | cur.execute(qry_child, (request.parent_db_id, req_id)) 179 | 180 | def save_crawl_result(self, result, crawled): 181 | """ 182 | save the given result, i.e. update an existing request with the result (do NOT open or close the connection) 183 | 184 | :param result: result to save 185 | :param crawled: (boolean) whether the request has been crawled 186 | """ 187 | qry = "UPDATE request SET crawled=?, crawler_errors=?, user_output=? WHERE id=?" 188 | values = ( 189 | 1 if crawled else 0, 190 | json.dumps(result.errors), 191 | json.dumps(result.request.user_output) if len(result.request.user_output) > 0 else "", 192 | result.request.db_id 193 | ) 194 | 195 | cur = self.conn.cursor() 196 | cur.execute(qry, values) 197 | 198 | def make_request_crawlable(self, request): 199 | """ 200 | mark the request as in scope and not yet crawled 201 | 202 | :param request: 203 | """ 204 | qry = "UPDATE request SET crawled=0, out_of_scope=0 WHERE id=:id" 205 | values = {"id": request.db_id} 206 | 207 | cur = self.conn.cursor() 208 | cur.execute(qry, values) 209 | 210 | def get_requests(self, types="xhr"): 211 | """ 212 | return a list of requests matching the given types 213 | 214 | connect, retrieve the requests list then close the connection 215 | 216 | :param types: string of types (comma separated) 217 | :return: list of matching requests 218 | """ 219 | types = types.split(",") 220 | ret = [] 221 | qry = "SELECT * FROM request WHERE out_of_scope=0 AND type IN (%s)" % ",".join("?" 
* len(types)) 222 | 223 | self.connect() 224 | cur = self.conn.cursor() 225 | cur.execute(qry, types) # nosemgrep 3393760109 226 | for r in cur.fetchall(): 227 | # !! parent must be null (or unset) 228 | req = Request( 229 | r['type'], r['method'], r['url'], referer=r['referer'], data=r['data'], 230 | json_cookies=r['cookies'], db_id=r['id'], parent_db_id=r['id_parent'] 231 | ) 232 | ret.append(req) 233 | self.close() 234 | 235 | return ret 236 | 237 | def create_assessment(self, scanner, date): 238 | """ 239 | connect, create a new assessment then close the connection 240 | :param scanner: 241 | :param date: 242 | :return: id of the newly created assessment 243 | """ 244 | 245 | qry = "INSERT INTO assessment (scanner, start_date) VALUES (?,?)" 246 | 247 | self.connect() 248 | 249 | cur = self.conn.cursor() 250 | 251 | cur.execute(qry, (scanner, date)) 252 | cur.execute("SELECT last_insert_rowid() as id") 253 | assessment_id = cur.fetchone()['id'] 254 | self.commit() 255 | self.close() 256 | return assessment_id 257 | 258 | def save_assessment(self, id_assessment, end_date): 259 | """ 260 | connect, update the existing assessment with the given end date 261 | 262 | :param id_assessment: 263 | :param end_date: 264 | """ 265 | qry = "UPDATE assessment SET end_date=? WHERE id=?" 266 | 267 | self.connect() 268 | cur = self.conn.cursor() 269 | cur.execute(qry, (end_date, id_assessment)) 270 | self.commit() 271 | self.close() 272 | 273 | def insert_vulnerability(self, id_assessment, id_request, type, description, error=""): 274 | """ 275 | connect, create a vulnerability then close the connection 276 | 277 | :param id_assessment: 278 | :param id_request: 279 | :param type: 280 | :param description: 281 | :param error: default="" 282 | """ 283 | qry = "INSERT INTO vulnerability (id_assessment, id_request, type, description, error) VALUES (?,?,?,?,?)" 284 | 285 | self.connect() 286 | 287 | cur = self.conn.cursor() 288 | 289 | cur.execute(qry, (id_assessment, id_request, type, description, error)) 290 | self.commit() 291 | self.close() 292 | 293 | def get_crawled_request(self): 294 | """ 295 | connect, retrieve the already-crawled requests, then close the connection 296 | :return: list of requests 297 | """ 298 | requests = [] 299 | query = "SELECT * FROM request WHERE crawled=1" 300 | 301 | self.connect() 302 | cur = self.conn.cursor() 303 | cur.execute(query) 304 | for request in cur.fetchall(): 305 | req = Request( 306 | request['type'], request['method'], request['url'], referer=request['referer'], data=request['data'], 307 | json_cookies=request['cookies'], db_id=request['id'], parent_db_id=request['id_parent'] 308 | ) 309 | requests.append(req) 310 | self.close() 311 | 312 | return requests 313 | 314 | def get_not_crawled_request(self): 315 | """ 316 | connect, retrieve the not-yet-crawled requests, then close the connection 317 | :return: list of requests 318 | """ 319 | requests = [] 320 | query = "SELECT * FROM request WHERE crawled=0 AND out_of_scope=0" 321 | 322 | self.connect() 323 | cur = self.conn.cursor() 324 | cur.execute(query) 325 | for request in cur.fetchall(): 326 | req = Request( 327 | request['type'], request['method'], request['url'], referer=request['referer'], data=request['data'], 328 | json_cookies=request['cookies'], db_id=request['id'], parent_db_id=request['id_parent'] 329 | ) 330 | requests.append(req) 331 | self.close() 332 | 333 | return requests 334 | 335 | def retrieve_crawl_info(self, crawl_id): 336 | """ 337 | return the information stored for the given crawl 338 | :param 
crawl_id: 339 | :return: tuple (random_seed, end_cookies) 340 | """ 341 | query = "SELECT random_seed, end_cookies FROM crawl_info WHERE rowid=?" 342 | 343 | self.connect() 344 | cur = self.conn.cursor() 345 | cur.execute(query, [crawl_id]) 346 | result = cur.fetchone() 347 | self.close() 348 | 349 | return result["random_seed"], result["end_cookies"] 350 | 351 | 352 | _CREATE_CRAWL_INFO_TABLE_QUERY = """ 353 | CREATE TABLE crawl_info ( 354 | htcap_version TEXT, 355 | target TEXT, 356 | start_date INTEGER, 357 | end_date INTEGER, 358 | commandline TEXT, 359 | user_agent TEXT, 360 | random_seed TEXT, 361 | start_cookies TEXT, 362 | end_cookies TEXT 363 | ) 364 | """ 365 | 366 | _CREATE_REQUEST_TABLE_QUERY = """ 367 | CREATE TABLE request ( 368 | id INTEGER PRIMARY KEY, 369 | id_parent INTEGER, 370 | type TEXT, 371 | method TEXT, 372 | url TEXT, 373 | referer TEXT, 374 | redirects INTEGER, 375 | data TEXT NOT NULL DEFAULT '', 376 | cookies TEXT NOT NULL DEFAULT '[]', 377 | http_auth TEXT, 378 | out_of_scope INTEGER NOT NULL DEFAULT 0, 379 | trigger TEXT, 380 | crawled INTEGER NOT NULL DEFAULT 0, 381 | crawler_errors TEXT, 382 | user_output TEXT 383 | ) 384 | """ 385 | 386 | _CREATE_REQUEST_INDEX_QUERY = """ 387 | CREATE INDEX request_index ON request (type, method, url, http_auth, data, trigger) 388 | """ 389 | 390 | _CREATE_REQUEST_CHILD_TABLE_QUERY = """ 391 | CREATE TABLE request_child ( 392 | id INTEGER PRIMARY KEY, 393 | id_request INTEGER NOT NULL, 394 | id_child INTEGER NOT NULL 395 | ) 396 | """ 397 | 398 | _CREATE_REQUEST_CHILD_INDEX_QUERY = """ 399 | CREATE INDEX request_child_index ON request_child (id_request, id_child) 400 | """ 401 | 402 | _CREATE_ASSESSMENT_TABLE_QUERY = """ 403 | CREATE TABLE assessment( 404 | id INTEGER PRIMARY KEY, 405 | scanner TEXT, 406 | start_date INTEGER, 407 | end_date INTEGER 408 | ) 409 | """ 410 | 411 | _CREATE_VULNERABILITY_TABLE_QUERY = """ 412 | CREATE TABLE vulnerability( 413 | id INTEGER PRIMARY KEY, 414 | id_assessment INTEGER, 415 | id_request INTEGER, 416 | type TEXT, 417 | description TEXT, 418 | error TEXT 419 | ) 420 | """ 421 | 422 | _BEGIN_TRANSACTION_QUERY = """BEGIN TRANSACTION""" 423 | -------------------------------------------------------------------------------- /tests/lib_tests/database_tests.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import unittest 3 | 4 | from mock import MagicMock, call, patch 5 | 6 | from core.lib.database import Database 7 | 8 | 9 | class DatabaseTestCase(unittest.TestCase): 10 | def setUp(self): 11 | self.connection_mock = MagicMock() 12 | self.cursor_mock = MagicMock() 13 | self.cursor_mock.execute = MagicMock() 14 | self.cursor_mock.fetchone = MagicMock() 15 | self.cursor_mock.fetchall = MagicMock(return_value=[]) 16 | self.connection_mock.cursor = MagicMock(return_value=self.cursor_mock) 17 | self.connect_method_mock = MagicMock() 18 | self.commit_method_mock = MagicMock() 19 | self.close_method_mock = MagicMock() 20 | 21 | self.db = Database('my_db') 22 | 23 | self.db.conn = self.connection_mock 24 | self.db.connect = self.connect_method_mock 25 | self.db.commit = self.commit_method_mock 26 | self.db.close = self.close_method_mock 27 | 28 | 29 | class DatabaseTest(DatabaseTestCase): 30 | def test_constructor(self): 31 | db = Database('my_db') 32 | 33 | self.assertEqual(db.dbname, 'my_db') 34 | self.assertEqual(db.conn, None) 35 | 36 | def test___str__(self): 37 | db = Database('my_db') 38 | 39 | self.assertEqual(str(db), 'my_db') 40 | 41 | def 
test_connect(self): 42 | # patch sqlite3.connect so that no real database file is opened 43 | with patch('sqlite3.connect') as connect_mock: 44 | db = Database('my_db') 45 | 46 | db.connect() 47 | 48 | # the real sqlite3.connect is restored once the patch context exits 49 | connect_mock.assert_called_with('my_db') 50 | self.assertEqual(db.conn, connect_mock.return_value) 51 | self.assertEqual(db.conn.row_factory, sqlite3.Row) 52 | 53 | 54 | def test_close(self): 55 | close_mock = MagicMock() 56 | self.connection_mock.close = close_mock 57 | db = Database('my_db') 58 | db.conn = self.connection_mock 59 | 60 | db.close() 61 | 62 | close_mock.assert_called_once() 63 | 64 | def test_begin(self): 65 | self.db.begin() 66 | 67 | self.assertEqual(self.connection_mock.isolation_level, None) 68 | self.connection_mock.execute.assert_called_once_with("BEGIN TRANSACTION") 69 | 70 | def test_commit(self): 71 | self.connection_mock.commit = MagicMock() 72 | db = Database('my_db') 73 | db.conn = self.connection_mock 74 | 75 | db.commit() 76 | 77 | self.connection_mock.commit.assert_called_once() 78 | 79 | def test_initialize_success(self): 80 | self.db.initialize() 81 | 82 | self.connect_method_mock.assert_called_once() 83 | self.assertEqual(self.cursor_mock.execute.call_count, 7) 84 | self.commit_method_mock.assert_called_once() 85 | self.close_method_mock.assert_called_once() 86 | 87 | def test_save_crawl_info(self): 88 | self.cursor_mock.fetchone.return_value = {"id": 42} 89 | 90 | cookie_mock = MagicMock() 91 | cookie_mock.get_dict = MagicMock(return_value="some cookie") 92 | 93 | result = self.db.save_crawl_info( 94 | htcap_version="42.0", target="my target", start_date="my start date", 95 | commandline="my commandline", user_agent="some user agent", start_cookies=[cookie_mock, cookie_mock] 96 | ) 97 | 98 | self.connect_method_mock.assert_called_once() 99 | self.assertEqual(cookie_mock.get_dict.call_count, 2) 100 | self.assertEqual( 101 | self.cursor_mock.execute.call_args_list[0], 102 | call( 103 | "INSERT INTO crawl_info (htcap_version,target,start_date,commandline,user_agent,start_cookies) VALUES (?,?,?,?,?,?)", 104 | ["42.0", "my target", "my start date", 105 | "my commandline", "some user agent", '["some cookie", "some cookie"]'])) 106 | self.assertEqual( 107 | self.cursor_mock.execute.call_args_list[1], 108 | call( 109 | "SELECT last_insert_rowid() AS id" 110 | ) 111 | ) 112 | self.commit_method_mock.assert_called_once() 113 | self.close_method_mock.assert_called_once() 114 | self.assertEqual(result, 42) 115 | 116 | def test_update_crawl_info(self): 117 | cookie_mock = MagicMock() 118 | cookie_mock.get_dict = MagicMock(return_value="some cookie") 119 | 120 | self.db.update_crawl_info(53, "some end date", "my random seed", [cookie_mock, cookie_mock]) 121 | 122 | self.connect_method_mock.assert_called_once() 123 | self.assertEqual(cookie_mock.get_dict.call_count, 2) 124 | self.cursor_mock.execute.assert_called_once_with( 125 | "UPDATE crawl_info SET end_date = ?, random_seed = ?, end_cookies = ? 
WHERE rowid = ?", 126 | ["some end date", "my random seed", '["some cookie", "some cookie"]', 53]) 127 | self.commit_method_mock.assert_called_once() 128 | self.close_method_mock.assert_called_once() 129 | 130 | def test_save_request_new_request_no_parent(self): 131 | """ 132 | case where the request is new and has no parent 133 | """ 134 | fetchone_returns = [None, {'id': 42}] 135 | 136 | def fetchone_side_effect(): 137 | result = fetchone_returns.pop(0) 138 | return result 139 | 140 | request = MagicMock() 141 | request.parent_db_id = None 142 | request.type = "request type" 143 | request.method = "METHOD" 144 | request.url = "my url" 145 | request.referer = "some referrer" 146 | request.redirects = "some redirection" 147 | request.data = "some data" 148 | request.cookies = {} 149 | request.http_auth = None 150 | request.out_of_scope = False 151 | request.trigger = None 152 | request.user_output = [] 153 | 154 | self.cursor_mock.fetchone.side_effect = fetchone_side_effect 155 | 156 | self.db.save_request(request) 157 | 158 | self.assertEqual(self.cursor_mock.execute.call_count, 3) 159 | self.assertEqual( 160 | self.cursor_mock.execute.call_args_list[0], 161 | call( 162 | 'SELECT * FROM request WHERE type=? AND method=? AND url=? AND http_auth=? AND data=? AND trigger=?', 163 | ("request type", "METHOD", "my url", "", "some data", "") 164 | ) 165 | ) 166 | self.assertEqual( 167 | self.cursor_mock.execute.call_args_list[1], 168 | call( 169 | 'INSERT INTO request (id_parent, type, method, url, referer, redirects, data, cookies, http_auth, out_of_scope, trigger, user_output) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)', 170 | ( 171 | None, "request type", "METHOD", "my url", "some referrer", "some redirection", "some data", "[]", 172 | "", 0, 173 | "", "") 174 | ) 175 | ) 176 | self.assertEqual( 177 | self.cursor_mock.execute.call_args_list[2], 178 | call( 179 | "SELECT last_insert_rowid() AS id" 180 | ) 181 | ) 182 | 183 | def test_save_request_old_request_with_parent(self): 184 | """ 185 | case where the request already exists and has a parent 186 | """ 187 | 188 | cookie_mock = MagicMock() 189 | cookie_mock.get_dict = MagicMock(return_value={"cookie_value_1": "value1"}) 190 | 191 | request = MagicMock() 192 | request.parent_db_id = 42 193 | request.type = "request type" 194 | request.method = "METHOD" 195 | request.url = "my url" 196 | request.referer = "some referrer" 197 | request.redirects = "some redirection" 198 | request.data = "some data" 199 | request.cookies = [cookie_mock] 200 | request.http_auth = "auth" 201 | request.out_of_scope = True 202 | request.trigger = ["trigger1", "trigger2"] 203 | request.html = "" 204 | request.user_output = ['some', 'output'] 205 | 206 | self.cursor_mock.fetchone.return_value = {"id": 53} 207 | 208 | self.db.save_request(request) 209 | 210 | self.assertEqual(request.db_id, 53) 211 | self.assertEqual(self.cursor_mock.execute.call_count, 2) 212 | self.assertEqual( 213 | self.cursor_mock.execute.call_args_list[0], 214 | call( 215 | 'SELECT * FROM request WHERE type=? AND method=? AND url=? AND http_auth=? AND data=? 
AND trigger=?', 216 | ("request type", "METHOD", "my url", "auth", "some data", '["trigger1", "trigger2"]') 217 | ) 218 | ) 219 | self.assertEqual( 220 | self.cursor_mock.execute.call_args_list[1], 221 | call( 222 | "INSERT INTO request_child (id_request, id_child) VALUES (?,?)", 223 | (42, 53) 224 | ) 225 | ) 226 | 227 | def test_save_crawl_result_not_crawled(self): 228 | result = MagicMock() 229 | result.errors = [] 230 | result.request = MagicMock() 231 | result.request.user_output = [] 232 | result.request.db_id = 42 233 | 234 | self.db.save_crawl_result(result=result, crawled=None) 235 | 236 | self.cursor_mock.execute.assert_called_once_with( 237 | "UPDATE request SET crawled=?, crawler_errors=?, user_output=? WHERE id=?", 238 | (0, "[]", "", 42) 239 | ) 240 | 241 | def test_save_crawl_result_crawled(self): 242 | result = MagicMock() 243 | result.errors = ["some", "errors"] 244 | result.request = MagicMock() 245 | result.request.user_output = ["some", "outputs"] 246 | result.request.db_id = 42 247 | 248 | self.db.save_crawl_result(result=result, crawled=True) 249 | 250 | self.cursor_mock.execute.assert_called_once_with( 251 | "UPDATE request SET crawled=?, crawler_errors=?, user_output=? WHERE id=?", 252 | (1, '["some", "errors"]', '["some", "outputs"]', 42) 253 | ) 254 | 255 | def test_make_request_crawlable(self): 256 | request = MagicMock() 257 | request.db_id = 42 258 | 259 | self.db.make_request_crawlable(request) 260 | 261 | self.cursor_mock.execute.assert_called_once_with( 262 | "UPDATE request SET crawled=0, out_of_scope=0 WHERE id=:id", 263 | {"id": 42} 264 | ) 265 | 266 | def test_get_requests_without_result(self): 267 | results = self.db.get_requests() 268 | 269 | self.connect_method_mock.assert_called_once() 270 | self.cursor_mock.execute.assert_called_once_with( 271 | "SELECT * FROM request WHERE out_of_scope=0 AND type IN (?)", 272 | ["xhr"] 273 | ) 274 | self.close_method_mock.assert_called_once() 275 | self.assertEqual(results, []) 276 | 277 | @patch('core.lib.database.Request') 278 | def test_get_requests_with_result(self, request_mock): 279 | self.cursor_mock.fetchall.return_value = [ 280 | { 281 | "id": 42, "id_parent": 53, 282 | "type": "my type", "method": "METHOD", "url": "some url", 283 | "referer": "from here", "data": "some data", "cookies": "some cookies" 284 | } 285 | ] 286 | 287 | self.db.get_requests("xhr,an_other_type") 288 | 289 | request_mock.assert_called_once_with( 290 | "my type", "METHOD", "some url", data="some data", db_id=42, 291 | json_cookies="some cookies", parent_db_id=53, 292 | referer="from here" 293 | ) 294 | 295 | def test_create_assessment(self): 296 | self.cursor_mock.fetchone.return_value = {"id": 42} 297 | 298 | result = self.db.create_assessment('my scanner', 'start date') 299 | 300 | self.connect_method_mock.assert_called_once() 301 | 302 | self.assertEqual(self.cursor_mock.execute.call_count, 2) 303 | self.assertEqual( 304 | self.cursor_mock.execute.call_args_list[0], 305 | call( 306 | "INSERT INTO assessment (scanner, start_date) VALUES (?,?)", 307 | ("my scanner", "start date") 308 | ) 309 | ) 310 | self.assertEqual( 311 | self.cursor_mock.execute.call_args_list[1], 312 | call( 313 | "SELECT last_insert_rowid() as id" 314 | ) 315 | ) 316 | self.assertEqual(result, 42) 317 | self.commit_method_mock.assert_called_once() 318 | self.close_method_mock.assert_called_once() 319 | 320 | def test_save_assessment(self): 321 | self.db.save_assessment(42, "end date") 322 | 323 | self.connect_method_mock.assert_called_once() 324 | 
self.cursor_mock.execute.assert_called_once_with( 325 | "UPDATE assessment SET end_date=? WHERE id=?", 326 | ("end date", 42) 327 | ) 328 | self.commit_method_mock.assert_called_once() 329 | self.close_method_mock.assert_called_once() 330 | 331 | def test_insert_vulnerability(self): 332 | self.db.insert_vulnerability(42, 53, "my type", "my description") 333 | 334 | self.connect_method_mock.assert_called_once() 335 | self.cursor_mock.execute.assert_called_once_with( 336 | "INSERT INTO vulnerability (id_assessment, id_request, type, description, error) VALUES (?,?,?,?,?)", 337 | (42, 53, "my type", "my description", "") 338 | ) 339 | self.commit_method_mock.assert_called_once() 340 | self.close_method_mock.assert_called_once() 341 | 342 | @patch('core.lib.database.Request') 343 | def test_get_crawled_request(self, request_mock): 344 | self.cursor_mock.fetchall.return_value = [ 345 | { 346 | "id": 42, "id_parent": 53, 347 | "type": "my type", "method": "METHOD", "url": "some url", 348 | "referer": "from here", "data": "some data", "cookies": "some cookies" 349 | } 350 | ] 351 | results = self.db.get_crawled_request() 352 | 353 | self.connect_method_mock.assert_called_once() 354 | self.cursor_mock.execute.assert_called_once_with( 355 | "SELECT * FROM request WHERE crawled=1" 356 | ) 357 | request_mock.assert_called_once_with( 358 | "my type", "METHOD", "some url", data="some data", db_id=42, 359 | json_cookies="some cookies", parent_db_id=53, 360 | referer="from here" 361 | ) 362 | self.close_method_mock.assert_called_once() 363 | self.assertEqual(len(results), 1) 364 | 365 | @patch('core.lib.database.Request') 366 | def test_get_not_crawled_request(self, request_mock): 367 | self.cursor_mock.fetchall.return_value = [ 368 | { 369 | "id": 42, "id_parent": 53, 370 | "type": "my type", "method": "METHOD", "url": "some url", 371 | "referer": "from here", "data": "some data", "cookies": "some cookies" 372 | } 373 | ] 374 | results = self.db.get_not_crawled_request() 375 | 376 | self.connect_method_mock.assert_called_once() 377 | self.cursor_mock.execute.assert_called_once_with( 378 | "SELECT * FROM request WHERE crawled=0 AND out_of_scope=0" 379 | ) 380 | request_mock.assert_called_once_with( 381 | "my type", "METHOD", "some url", data="some data", db_id=42, 382 | json_cookies="some cookies", parent_db_id=53, 383 | referer="from here" 384 | ) 385 | self.close_method_mock.assert_called_once() 386 | self.assertEqual(len(results), 1) 387 | 388 | def test_retrieve_crawl_info(self): 389 | self.cursor_mock.fetchone.return_value = {"random_seed": "my seed", "end_cookies": "my end cookies"} 390 | 391 | results = self.db.retrieve_crawl_info(42) 392 | 393 | self.connect_method_mock.assert_called_once() 394 | self.cursor_mock.execute.assert_called_once_with( 395 | "SELECT random_seed, end_cookies FROM crawl_info WHERE rowid=?", [42] 396 | ) 397 | self.close_method_mock.assert_called_once() 398 | self.assertEqual(results, ("my seed", "my end cookies")) 399 | --------------------------------------------------------------------------------
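A minimal usage sketch of the Database class from core/lib/database.py above. The database filename, URL and request values are hypothetical, and the positional Request("type", "METHOD", "url") form is inferred from how get_requests() constructs Request objects; the real constructor in core/lib/request.py may accept additional arguments and is assumed to initialise cookies and user_output to empty containers, as save_request() expects.

# -*- coding: utf-8 -*-
from core.lib.database import Database
from core.lib.request import Request

db = Database("crawl.db")  # hypothetical database file
db.initialize()  # connects, creates all tables, commits and closes

# save_crawl_info() manages its own connection and returns the crawl id
crawl_id = db.save_crawl_info(htcap_version="1.0", target="http://example.com/",
                              start_date=0, commandline="htcap crawl example.com",
                              user_agent="htcap")

# save_request() does NOT manage the connection, so wrap it explicitly
db.connect()
db.begin()
db.save_request(Request("link", "GET", "http://example.com/page"))  # hypothetical request
db.commit()
db.close()

# get_requests() reconnects internally and returns the in-scope
# requests whose type is in the given comma-separated list
for req in db.get_requests("link,xhr"):
    print(req.url)

Note the split in connection handling: the crawl-info, assessment and retrieval helpers each open and close their own connection, while save_request() and save_crawl_result() deliberately leave it to the caller so a crawler thread can batch many writes inside one BEGIN TRANSACTION / commit cycle.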