├── README.md
├── core
├── __init__.py
├── constants.py
├── crawl
│ ├── __init__.py
│ ├── crawler.py
│ ├── crawler_thread.py
│ ├── lib
│ │ ├── __init__.py
│ │ ├── crawl_result.py
│ │ ├── probe.py
│ │ ├── shared.py
│ │ ├── urlfinder.py
│ │ └── utils.py
│ └── probe
│ │ ├── analyze.js
│ │ ├── functions.js
│ │ ├── options.js
│ │ └── probe.js
├── lib
│ ├── DB_config.py
│ ├── __init__.py
│ ├── cookie.py
│ ├── database.py
│ ├── exception.py
│ ├── http_get.py
│ ├── request.py
│ ├── request_pattern.py
│ ├── shell.py
│ ├── thirdparty
│ │ ├── __init__.py
│ │ ├── pysocks
│ │ │ ├── __init__.py
│ │ │ ├── socks.py
│ │ │ └── sockshandler.py
│ │ └── simhash
│ │ │ └── __init__.py
│ └── utils.py
└── util
│ ├── __init__.py
│ ├── base_util.py
│ ├── util.py
│ └── utilities
│ ├── __init__.py
│ ├── htmlreport
│ ├── report.html
│ ├── report.js
│ └── style.css
│ ├── login.py
│ ├── login
│ └── login.js
│ ├── lsajax.py
│ ├── lsvuln.py
│ ├── report.py
│ ├── tocurl.py
│ ├── updcookie.py
│ └── usgen.py
├── htcap.py
├── new.sql
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
1 | ## HTCAP
2 | 
3 | Htcap is a web application scanner able to crawl single page applications (SPAs) recursively by intercepting ajax calls and DOM changes.
4 | Htcap is not just another vulnerability scanner: it focuses mainly on the crawling process and uses external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration testing of modern web applications.
5 | 
6 | More info at [htcap.org](http://htcap.org).
7 | 
8 | ## SETUP
9 | 
10 | ### Requirements
11 | 
12 | 1. Python 2.7
13 | 2. PhantomJS v2 [PS: the PhantomJS author no longer maintains PhantomJS, so this is unlikely to be updated any further]
14 | 
15 | ### Download and Run
16 | 
17 | ```console
18 | $ git clone https://github.com/0xa-saline/htcap_mysql htcap
19 | $ cd htcap
20 | $ vi core/lib/DB_config.py
21 | # database connection settings
22 | 'host' : 'localhost',
23 | 'user' : 'root',
24 | 'port' : '3306',
25 | 'password' : 'mysqlroot',
26 | 'db' : 'w3a_scan',
27 | $ sudo pip install -r requirements.txt
28 | $ python htcap.py crawl http://0day5.com
29 | 
30 | ```
31 | 
32 | Usage is the same as the original htcap:
33 | 
34 | ```bash
35 | $ python htcap.py crawl http://testphp.vulnweb.com
36 | *****************************************************
37 | * / _ \| _ \ / \ \ / / ___/ ___| ___ __ _| \ | | *
38 | *| | | | | | |/ _ \ V /|___ \___ \ / __/ _` | \| | *
39 | *| |_| | |_| / ___ \| | ___) |__) | (_| (_| | |\ |*
40 | * \___/|____/_/ \_\_| |____/____/ \___\__,_|_| \_|*
41 | *****************************************************
42 | . No handlers could be found for logger "tldextract"
43 | [*][debug] http://testphp.vulnweb.com/pictures/
44 | [*][debug] http://testphp.vulnweb.com/images/
45 | [*][debug] http://testphp.vulnweb.com/bxss/
46 | [*][debug] http://testphp.vulnweb.com/Connections/
47 | [*][debug] http://testphp.vulnweb.com/admin/
48 | [*][debug] http://testphp.vulnweb.com/CVS/
49 | [*][debug] http://testphp.vulnweb.com/secured/
50 | [*][debug] http://testphp.vulnweb.com/userinfo.php
51 | [*][debug] http://testphp.vulnweb.com/cart.php
52 | [*][debug] http://testphp.vulnweb.com/logout.php
53 | [*][debug] http://testphp.vulnweb.com/search.php
54 | [*][debug] http://testphp.vulnweb.com/comment.php
55 | [*][debug] http://testphp.vulnweb.com/login.php
56 | [*][debug] http://testphp.vulnweb.com/index.php
57 | [*][debug] http://testphp.vulnweb.com/product.php
58 | [*][debug] http://testphp.vulnweb.com/guestbook.php
59 | . initialized, crawl started with 10 threads
60 | [=================================] 108 of 108 pages processed in 43 minutes
61 | Crawl finished, 108 pages analyzed in 43 minutes
62 | ```
63 | 
64 | PhantomJs can be downloaded [here](http://phantomjs.org//download.html). It comes as a self-contained executable with all libraries linked statically, so there is no need to install or compile anything else.
65 | 
66 | 
67 | ## DOCUMENTATION
68 | 
69 | Documentation, examples and demos can be found at the official website [http://htcap.org](http://htcap.org).
70 | 
71 | 
72 | ## TO DO
73 | 
74 | 0. Disable DNS cache refreshing (done)
75 | 
76 | 
77 | 1. Switch htcap's database to MySQL (done)
78 | 
79 | 
80 | 2. Add filtering of common analytics and social-sharing URLs (done)
81 | 
82 | 
83 | 3. Add recognition of common static file extensions (done)
84 | 
85 | 
86 | 4. Gather URLs through directory brute forcing and search-engine collection in addition to the existing robots.txt parsing, and recognize directories that cannot be accessed (done)
87 | 
88 | 
89 | 5. Remove the sqlmap and Arachni scanning features (done)
90 | 
91 | 
92 | 6. Add page information recognition.
93 | 
94 | 
95 | 7. Add exact-duplicate and similarity-based deduplication.
96 | 
97 | 
98 | ## demo
99 | 
100 | http://htcap.org/scanme/
101 | 
102 | 
103 | 
104 | ## LICENSE
105 | 
106 | This program is free software; you can redistribute it and/or modify it under the terms of the [GNU General Public License](https://www.gnu.org/licenses/gpl-2.0.html) as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
107 | 
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/__init__.py
--------------------------------------------------------------------------------
/core/constants.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version.
11 | """ 12 | 13 | 14 | THSTAT_WAITING = 0 15 | THSTAT_RUNNING = 1 16 | 17 | CRAWLSCOPE_DOMAIN = "domain" 18 | CRAWLSCOPE_DIRECTORY = "directory" 19 | CRAWLSCOPE_URL = "url" 20 | 21 | 22 | CRAWLMODE_PASSIVE = "passive" 23 | CRAWLMODE_ACTIVE = "active" 24 | CRAWLMODE_AGGRESSIVE = "aggressive" 25 | 26 | REQTYPE_LINK = "link" 27 | REQTYPE_XHR = "xhr" 28 | REQTYPE_WS = "websocket" 29 | REQTYPE_JSONP = "jsonp" 30 | REQTYPE_FORM = "form" 31 | REQTYPE_REDIRECT = "redirect" 32 | REQTYPE_UNKNOWN = "unknown" 33 | 34 | 35 | ERROR_CONTENTTYPE = "contentType" 36 | ERROR_TIMEOUT = "timeout" 37 | ERROR_PROBE_TO = "probe_timeout" 38 | ERROR_LOAD = "loaderror" 39 | ERROR_PROBEKILLED = "probe_killed" 40 | ERROR_PROBEFAILURE = "probe_failure" 41 | ERROR_MAXREDIRECTS = "too_many_redirects" 42 | ERROR_CRAWLDEPTH = "crawler_depth_limit_reached" 43 | VULNTYPE_SQLI = "sqli" 44 | VULNTYPE_XSS = "xss" 45 | -------------------------------------------------------------------------------- /core/crawl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/__init__.py -------------------------------------------------------------------------------- /core/crawl/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | from __future__ import unicode_literals 14 | import sys 15 | import os 16 | import datetime 17 | import time 18 | import getopt 19 | import json 20 | import re 21 | from urlparse import urlsplit, urljoin 22 | from urllib import unquote 23 | import urllib2 24 | import threading 25 | import subprocess 26 | from random import choice 27 | import string 28 | import ssl 29 | 30 | 31 | from core.lib.exception import * 32 | from core.lib.cookie import Cookie 33 | from core.lib.database import Database 34 | 35 | 36 | from lib.shared import * 37 | from lib.crawl_result import * 38 | from core.lib.request import Request 39 | from core.lib.http_get import HttpGet 40 | from core.lib.shell import CommandExecutor 41 | #from core.dirburp.dirscan import Dirbuster 42 | from crawler_thread import CrawlerThread 43 | 44 | from core.lib.utils import * 45 | from core.constants import * 46 | from tld import get_tld 47 | from lib.utils import * 48 | 49 | class Crawler: 50 | 51 | def __init__(self, argv): 52 | 53 | self.base_dir = getrealdir(__file__) + os.sep 54 | 55 | self.crawl_start_time = int(time.time()) 56 | self.crawl_end_time = None 57 | self.taskid = '' 58 | 59 | self.defaults = { 60 | "useragent": random_useragent(), 61 | "num_threads": 10, 62 | "max_redirects": 10*10, 63 | "out_file_overwrite": False, 64 | "proxy": None, 65 | "http_auth": None, 66 | "use_urllib_onerror": True, 67 | "group_qs": False, 68 | "process_timeout": 450, # when lots of element(~25000) are added dynamically it can take some time.. 
69 | "set_referer": True, 70 | "scope": CRAWLSCOPE_DOMAIN, 71 | "mode": CRAWLMODE_AGGRESSIVE, 72 | "max_depth": 10000, 73 | "max_post_depth": 1000, 74 | "override_timeout_functions": True, 75 | 'crawl_forms': True # only if mode == CRAWLMODE_AGGRESSIVE 76 | } 77 | 78 | 79 | self.main(argv) 80 | 81 | 82 | 83 | def usage(self): 84 | print ( 85 | "usage: htcap [options] url outfile\n" 86 | "Options: \n" 87 | " -h this help\n" 88 | " -q do not display progress information\n" 89 | " -m MODE set crawl mode:\n" 90 | " - "+CRAWLMODE_PASSIVE+": do not interact with the page\n" 91 | " - "+CRAWLMODE_ACTIVE+": trigger events\n" 92 | " - "+CRAWLMODE_AGGRESSIVE+": also fill input values and crawl forms (default)\n" 93 | " -s SCOPE set crawl scope\n" 94 | " - "+CRAWLSCOPE_DOMAIN+": limit crawling to current domain (default)\n" 95 | " - "+CRAWLSCOPE_DIRECTORY+": limit crawling to current directory (and subdirectories) \n" 96 | " - "+CRAWLSCOPE_URL+": do not crawl, just analyze a single page\n" 97 | " -D maximum crawl depth (default: " + str(Shared.options['max_depth']) + ")\n" 98 | " -P maximum crawl depth for consecutive forms (default: " + str(Shared.options['max_post_depth']) + ")\n" 99 | " -F even if in aggressive mode, do not crawl forms\n" 100 | " -H save HTML generated by the page\n" 101 | " -d DOMAINS comma separated list of allowed domains (ex *.target.com)\n" 102 | " -c COOKIES cookies as json or name=value pairs separated by semicolon\n" 103 | " -C COOKIE_FILE path to file containing COOKIES \n" 104 | " -r REFERER set initial referer\n" 105 | " -x EXCLUDED comma separated list of urls to exclude (regex) - ie logout urls\n" 106 | " -p PROXY proxy string protocol:host:port - protocol can be 'http' or 'socks5'\n" 107 | " -n THREADS number of parallel threads (default: " + str(self.defaults['num_threads']) + ")\n" 108 | " -A CREDENTIALS username and password used for HTTP authentication separated by a colon\n" 109 | " -U USERAGENT set user agent\n" 110 | " -t TIMEOUT maximum seconds spent to analyze a page (default " + str(self.defaults['process_timeout']) + ")\n" 111 | " -u USER_SCRIPT inject USER_SCRIPT into any loaded page\n" 112 | " -S skip initial checks\n" 113 | " -G group query_string parameters with the same name ('[]' ending excluded)\n" 114 | " -N don't normalize URL path (keep ../../)\n" 115 | " -R maximum number of redirects to follow (default " + str(self.defaults['max_redirects']) + ")\n" 116 | " -I ignore robots.txt\n" 117 | " -O don't override timeout functions (setTimeout, setInterval)\n" 118 | " -K keep elements in the DOM (prevent removal)\n" 119 | ) 120 | 121 | 122 | def generate_filename(self, name, out_file_overwrite): 123 | fname = generate_filename(name, None, out_file_overwrite) 124 | if out_file_overwrite: 125 | if os.path.exists(fname): 126 | os.remove(fname) 127 | 128 | return fname 129 | 130 | 131 | 132 | def kill_threads(self, threads): 133 | for th in threads: 134 | if th.isAlive(): th.exit = True 135 | # start notify() chain 136 | Shared.th_condition.acquire() 137 | Shared.th_condition.notifyAll() 138 | Shared.th_condition.release() 139 | 140 | 141 | 142 | def parse_cookie_string(self, string): 143 | 144 | cookies = [] 145 | try: 146 | cookies = json.loads(string) 147 | except ValueError: 148 | tok = re.split("; *", string) 149 | for t in tok: 150 | k, v = t.split("=", 1) 151 | cookies.append({"name":k.strip(), "value":unquote(v.strip())}) 152 | except Exception as e: 153 | raise 154 | 155 | return cookies 156 | 157 | 158 | 159 | def init_db(self, dbname,
report_name): 160 | infos = { 161 | "target": Shared.starturl, 162 | "scan_date": -1, 163 | "urls_scanned": -1, 164 | "scan_time": -1, 165 | 'command_line': " ".join(sys.argv) 166 | } 167 | 168 | database = Database(dbname, report_name, infos) 169 | return database 170 | 171 | def check_startrequest(self, request): 172 | 173 | h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy']) 174 | try: 175 | h.get_requests() 176 | except NotHtmlException: 177 | print "\nError: Document is not html" 178 | sys.exit(1) 179 | except Exception as e: 180 | print "\nError: unable to open url: %s" % e 181 | sys.exit(1) 182 | 183 | def get_requests_from_robots(self, request): 184 | purl = urlsplit(request.url) 185 | url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc) 186 | 187 | getreq = Request(REQTYPE_LINK, "GET", url) 188 | try: 189 | # request, timeout, retries=None, useragent=None, proxy=None): 190 | httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy']) 191 | lines = httpget.get_file().split("\n") 192 | except urllib2.HTTPError: 193 | return [] 194 | except: 195 | raise 196 | 197 | requests = [] 198 | for line in lines: 199 | directive = "" 200 | url = None 201 | try: 202 | directive, url = re.sub("\#.*","",line).split(":",1) 203 | except: 204 | continue # ignore errors 205 | 206 | if re.match("(dis)?allow", directive.strip(), re.I): 207 | req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request) 208 | requests.append(req) 209 | 210 | 211 | return adjust_requests(requests) if requests else [] 212 | 213 | 214 | def randstr(self, length): 215 | all_chars = string.digits + string.letters + string.punctuation 216 | random_string = ''.join(choice(all_chars) for _ in range(length)) 217 | return random_string 218 | 219 | 220 | 221 | def main_loop(self, threads, start_requests, database, display_progress = True, verbose = False,taskid=''): 222 | pending = len(start_requests) 223 | crawled = 0 224 | 225 | req_to_crawl = start_requests 226 | try: 227 | while True: 228 | 229 | if display_progress and not verbose: 230 | tot = (crawled + pending) 231 | print_progressbar(tot, crawled, self.crawl_start_time, "pages processed") 232 | 233 | if pending == 0: 234 | # is the check of running threads really needed? 
235 | running_threads = [t for t in threads if t.status == THSTAT_RUNNING] 236 | if len(running_threads) == 0: 237 | if display_progress or verbose: 238 | print "" 239 | break 240 | 241 | if len(req_to_crawl) > 0: 242 | Shared.th_condition.acquire() 243 | Shared.requests.extend(req_to_crawl) 244 | Shared.th_condition.notifyAll() 245 | Shared.th_condition.release() 246 | 247 | req_to_crawl = [] 248 | Shared.main_condition.acquire() 249 | Shared.main_condition.wait(1) 250 | if len(Shared.crawl_results) > 0: 251 | #database.connect() 252 | #database.begin() 253 | for result in Shared.crawl_results: 254 | crawled += 1 255 | pending -= 1 256 | if verbose: 257 | print "crawl result for: %s " % result.request 258 | if len(result.request.user_output) > 0: 259 | print " user: %s" % json.dumps(result.request.user_output) 260 | if result.errors: 261 | print "* crawler errors: %s" % ", ".join(result.errors) 262 | 263 | #database.save_crawl_result(result, True) 264 | for req in result.found_requests: 265 | ######tips 266 | #print req.url,req.data,req.method,Shared.allowed_domains 267 | 268 | if verbose: 269 | print " new request found %s" % req 270 | 271 | urlfilt = HostFilter(req.url) 272 | if urlfilt.urlfilter(): 273 | database.save_request(req,taskid) 274 | 275 | if request_is_crawlable(req) and req not in Shared.requests and req not in req_to_crawl: 276 | if request_depth(req) > Shared.options['max_depth'] or request_post_depth(req) > Shared.options['max_post_depth']: 277 | if verbose: 278 | print " * cannot crawl: %s : crawl depth limit reached" % req 279 | result = CrawlResult(req, errors=[ERROR_CRAWLDEPTH]) 280 | #database.save_crawl_result(result, False) 281 | continue 282 | 283 | if req.redirects > Shared.options['max_redirects']: 284 | if verbose: 285 | print " * cannot crawl: %s : too many redirects" % req 286 | result = CrawlResult(req, errors=[ERROR_MAXREDIRECTS]) 287 | #database.save_crawl_result(result, False) 288 | continue 289 | 290 | pending += 1 291 | req_to_crawl.append(req) 292 | 293 | Shared.crawl_results = [] 294 | Shared.main_condition.release() 295 | 296 | except KeyboardInterrupt: 297 | print "\nTerminated by user" 298 | try: 299 | Shared.main_condition.release() 300 | Shared.th_condition.release() 301 | except: 302 | pass 303 | 304 | 305 | def check_user_script_syntax(self, probe_cmd, user_script): 306 | try: 307 | exe = CommandExecutor(probe_cmd + ["-u", user_script, "-v"] , False) 308 | out = exe.execute(5) 309 | if out: 310 | print "\n* USER_SCRIPT error: %s" % out 311 | sys.exit(1) 312 | stdoutw(". ") 313 | except KeyboardInterrupt: 314 | print "\nAborted" 315 | sys.exit(0) 316 | 317 | 318 | def init_crawl(self, start_req, check_starturl, get_robots_txt): 319 | start_requests = [start_req] 320 | try: 321 | if check_starturl: 322 | self.check_startrequest(start_req) 323 | stdoutw(". ") 324 | 325 | if get_robots_txt: 326 | rrequests = self.get_requests_from_robots(start_req) 327 | stdoutw(". 
") 328 | for req in rrequests: 329 | if request_is_crawlable(req) and not req in start_requests: 330 | start_requests.append(req) 331 | except KeyboardInterrupt: 332 | print "\nAborted" 333 | sys.exit(0) 334 | 335 | return start_requests 336 | 337 | 338 | def main(self, argv): 339 | Shared.options = self.defaults 340 | Shared.th_condition = threading.Condition() 341 | Shared.main_condition = threading.Condition() 342 | 343 | 344 | probe_cmd = get_phantomjs_cmd() 345 | if not probe_cmd: 346 | print "Error: unable to find phantomjs executable" 347 | sys.exit(1) 348 | 349 | start_cookies = [] 350 | start_referer = None 351 | 352 | probe_options = ["-R", self.randstr(20)] 353 | threads = [] 354 | num_threads = self.defaults['num_threads'] 355 | 356 | out_file = "" 357 | out_file_overwrite = self.defaults['out_file_overwrite'] 358 | cookie_string = None 359 | display_progress = True 360 | verbose = False 361 | initial_checks = True 362 | http_auth = None 363 | get_robots_txt = True 364 | save_html = False 365 | user_script = None 366 | 367 | try: 368 | opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:') 369 | except getopt.GetoptError as err: 370 | print str(err) 371 | sys.exit(1) 372 | 373 | 374 | if len(args) < 1: 375 | self.usage() 376 | sys.exit(1) 377 | 378 | 379 | 380 | for o, v in opts: 381 | if o == '-h': 382 | self.usage() 383 | sys.exit(0) 384 | elif o == '-c': 385 | cookie_string = v 386 | elif o == '-C': 387 | try: 388 | with open(v) as cf: 389 | cookie_string = cf.read() 390 | except Exception as e: 391 | print "error reading cookie file" 392 | sys.exit(1) 393 | elif o == '-r': 394 | start_referer = v 395 | elif o == '-n': 396 | num_threads = int(v) 397 | elif o == '-t': 398 | Shared.options['process_timeout'] = int(v) 399 | elif o == '-q': 400 | display_progress = False 401 | elif o == '-A': 402 | http_auth = v 403 | elif o == '-p': 404 | if v == "tor": v = "socks5:127.0.0.1:9150" 405 | proxy = v.split(":") 406 | if proxy[0] not in ("http", "socks5"): 407 | print "only http and socks5 proxies are supported" 408 | sys.exit(1) 409 | Shared.options['proxy'] = {"proto":proxy[0], "host":proxy[1], "port":proxy[2]} 410 | elif o == '-d': 411 | for ad in v.split(","): 412 | # convert *.domain.com to *.\.domain\.com 413 | pattern = re.escape(ad).replace("\\*\\.","((.*\\.)|)") 414 | Shared.allowed_domains.add(pattern) 415 | elif o == '-x': 416 | for eu in v.split(","): 417 | Shared.excluded_urls.add(eu) 418 | elif o == "-G": 419 | Shared.options['group_qs'] = True 420 | #elif o == "-w": 421 | # out_file_overwrite = True 422 | elif o == "-R": 423 | Shared.options['max_redirects'] = int(v) 424 | elif o == "-U": 425 | Shared.options['useragent'] = v 426 | elif o == "-s": 427 | if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL): 428 | self.usage() 429 | print "* ERROR: wrong scope set '%s'" % v 430 | sys.exit(1) 431 | Shared.options['scope'] = v 432 | elif o == "-m": 433 | if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE): 434 | self.usage() 435 | print "* ERROR: wrong mode set '%s'" % v 436 | sys.exit(1) 437 | Shared.options['mode'] = v 438 | elif o == "-S": 439 | initial_checks = False 440 | elif o == "-I": 441 | get_robots_txt = False 442 | elif o == "-H": 443 | save_html = True 444 | elif o == "-D": 445 | Shared.options['max_depth'] = int(v) 446 | elif o == "-P": 447 | Shared.options['max_post_depth'] = int(v) 448 | elif o == "-O": 449 | Shared.options['override_timeout_functions'] = False 450 | elif o == "-F": 451 | 
Shared.options['crawl_forms'] = False 452 | elif o == "-u": 453 | if os.path.isfile(v): 454 | user_script = os.path.abspath(v) 455 | else: 456 | print "error: unable to open USER_SCRIPT" 457 | sys.exit(1) 458 | 459 | 460 | if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0: 461 | print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN 462 | 463 | if cookie_string: 464 | try: 465 | start_cookies = self.parse_cookie_string(cookie_string) 466 | except Exception as e: 467 | print "error decoding cookie string" 468 | sys.exit(1) 469 | 470 | if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE: 471 | probe_options.append("-f") # dont fill values 472 | if Shared.options['mode'] == CRAWLMODE_PASSIVE: 473 | probe_options.append("-t") # dont trigger events 474 | 475 | if Shared.options['proxy']: 476 | probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto']) 477 | probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port'])) 478 | 479 | probe_cmd.append(self.base_dir + 'probe/analyze.js') 480 | 481 | 482 | if len(Shared.excluded_urls) > 0: 483 | probe_options.extend(("-X", ",".join(Shared.excluded_urls))) 484 | 485 | if save_html: 486 | probe_options.append("-H") 487 | 488 | if user_script: 489 | probe_options.extend(("-u", user_script)) 490 | 491 | probe_options.extend(("-x", str(Shared.options['process_timeout']))) 492 | probe_options.extend(("-A", Shared.options['useragent'])) 493 | 494 | if not Shared.options['override_timeout_functions']: 495 | probe_options.append("-O") 496 | 497 | Shared.probe_cmd = probe_cmd + probe_options 498 | 499 | 500 | Shared.starturl = normalize_url(args[0]) 501 | #out_file = args[1] 502 | 503 | purl = urlsplit(Shared.starturl) 504 | try: 505 | pdomain = get_tld(Shared.starturl) 506 | except: 507 | pdomain = purl.hostname 508 | if purl.hostname == pdomain: 509 | Shared.allowed_domains.add(purl.hostname) 510 | else: 511 | Shared.allowed_domains.add(pdomain) 512 | Shared.allowed_domains.add(purl.hostname) 513 | 514 | 515 | for sc in start_cookies: 516 | Shared.start_cookies.append(Cookie(sc, Shared.starturl)) 517 | 518 | 519 | start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer) 520 | 521 | if not hasattr(ssl, "SSLContext"): 522 | print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors" 523 | 524 | if user_script and initial_checks: 525 | self.check_user_script_syntax(probe_cmd, user_script) 526 | 527 | start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt) 528 | 529 | database = None 530 | fname = None 531 | try: 532 | database = self.init_db(fname, out_file) 533 | except Exception as e: 534 | print str(e) 535 | 536 | taskid = database.save_crawl_info( 537 | target = Shared.starturl, 538 | start_date = self.crawl_start_time, 539 | commandline = cmd_to_str(argv), 540 | user_agent = Shared.options['useragent'] 541 | ) 542 | self.taskid = taskid 543 | 544 | for req in start_requests: 545 | urlfilt = HostFilter(req.url) 546 | if urlfilt.urlfilter(): 547 | database.save_request(req,self.taskid) 548 | 549 | print "initialized, crawl started with %d threads" % (num_threads) 550 | 551 | for n in range(0, num_threads): 552 | thread = CrawlerThread() 553 | threads.append(thread) 554 | thread.start() 555 | 556 | 557 | self.main_loop(threads, start_requests, database, display_progress, 
verbose,self.taskid) 558 | 559 | self.kill_threads(threads) 560 | 561 | self.crawl_end_time = int(time.time()) 562 | 563 | print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60) 564 | database.update_crawl_info(self.taskid,self.crawl_end_time) 565 | -------------------------------------------------------------------------------- /core/crawl/crawler_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | from __future__ import unicode_literals 14 | import time 15 | import re 16 | import json 17 | import urllib 18 | import cookielib 19 | import threading 20 | import base64 21 | 22 | import tempfile 23 | import os 24 | import uuid 25 | 26 | from urlparse import urlparse, urlsplit, urljoin, parse_qsl 27 | 28 | from core.lib.exception import * 29 | from core.crawl.lib.shared import * 30 | 31 | 32 | from core.crawl.lib.probe import Probe 33 | 34 | from core.lib.http_get import HttpGet 35 | from core.lib.cookie import Cookie 36 | from core.lib.shell import CommandExecutor 37 | from core.lib.request import Request 38 | 39 | from core.lib.utils import * 40 | from core.constants import * 41 | 42 | from lib.utils import * 43 | from lib.crawl_result import * 44 | 45 | 46 | class CrawlerThread(threading.Thread): 47 | 48 | def __init__(self): 49 | threading.Thread.__init__(self) 50 | self.thread_uuid = uuid.uuid4() 51 | self.process_retries = 2 52 | self.process_retries_interval = 0.5 53 | 54 | self.status = THSTAT_RUNNING 55 | self.exit = False 56 | 57 | self.cookie_file = "%s%shtcap_cookiefile-%s.json" % (tempfile.gettempdir(), os.sep, self.thread_uuid) 58 | 59 | 60 | def run(self): 61 | self.crawl() 62 | 63 | 64 | 65 | def wait_request(self): 66 | request = None 67 | Shared.th_condition.acquire() 68 | while True: 69 | if self.exit == True: 70 | Shared.th_condition.notifyAll() 71 | Shared.th_condition.release() 72 | raise ThreadExitRequestException("exit request received") 73 | 74 | if Shared.requests_index >= len(Shared.requests): 75 | self.status = THSTAT_WAITING 76 | Shared.th_condition.wait() # The wait method releases the lock, blocks the current thread until another thread calls notify 77 | continue 78 | 79 | request = Shared.requests[Shared.requests_index] 80 | Shared.requests_index += 1 81 | 82 | break 83 | 84 | Shared.th_condition.release() 85 | 86 | self.status = THSTAT_RUNNING 87 | 88 | return request 89 | 90 | 91 | 92 | def load_probe_json(self, jsn): 93 | jsn = jsn.strip() 94 | if not jsn: jsn = "[" 95 | if jsn[-1] != "]": 96 | jsn += '{"status":"ok", "partialcontent":true}]' 97 | try: 98 | return json.loads(jsn) 99 | except Exception: 100 | #print "-- JSON DECODE ERROR %s" % jsn 101 | raise 102 | 103 | 104 | def send_probe(self, request, errors): 105 | 106 | url = request.url 107 | jsn = None 108 | probe = None 109 | retries = self.process_retries 110 | params = [] 111 | cookies = [] 112 | 113 | 114 | if request.method == "POST": 115 | params.append("-P") 116 | if request.data: 117 | params.extend(("-D", request.data)) 118 | 119 | 120 | if len(request.cookies) > 0: 121 | for cookie in 
request.cookies: 122 | cookies.append(cookie.get_dict()) 123 | 124 | with open(self.cookie_file,'w') as fil: 125 | fil.write(json.dumps(cookies)) 126 | 127 | params.extend(("-c", self.cookie_file)) 128 | 129 | 130 | 131 | if request.http_auth: 132 | params.extend(("-p" ,request.http_auth)) 133 | 134 | if Shared.options['set_referer'] and request.referer: 135 | params.extend(("-r", request.referer)) 136 | 137 | 138 | params.extend(("-i", str(request.db_id))) 139 | 140 | params.append(url) 141 | 142 | 143 | while retries: 144 | #while False: 145 | 146 | # print cmd_to_str(Shared.probe_cmd + params) 147 | # print "" 148 | 149 | cmd = CommandExecutor(Shared.probe_cmd + params) 150 | jsn = cmd.execute(Shared.options['process_timeout'] + 2) 151 | 152 | if jsn == None: 153 | errors.append(ERROR_PROBEKILLED) 154 | time.sleep(self.process_retries_interval) # ... ??? 155 | retries -= 1 156 | continue 157 | 158 | 159 | # try to decode json also after an exception .. sometimes phantom crashes BUT returns a valid json .. 160 | try: 161 | if jsn and type(jsn) is not str: 162 | jsn = jsn[0] 163 | probeArray = self.load_probe_json(jsn) 164 | except Exception as e: 165 | raise 166 | 167 | 168 | if probeArray: 169 | probe = Probe(probeArray, request) 170 | 171 | if probe.status == "ok": 172 | break 173 | 174 | errors.append(probe.errcode) 175 | 176 | if probe.errcode in (ERROR_CONTENTTYPE, ERROR_PROBE_TO): 177 | break 178 | 179 | time.sleep(self.process_retries_interval) 180 | retries -= 1 181 | 182 | return probe 183 | 184 | 185 | 186 | def crawl(self): 187 | 188 | while True: 189 | url = None 190 | cookies = [] 191 | requests = [] 192 | 193 | requests_to_crawl = [] 194 | redirects = 0 195 | errors = [] 196 | 197 | try: 198 | request = self.wait_request() 199 | except ThreadExitRequestException: 200 | if os.path.exists(self.cookie_file): 201 | os.remove(self.cookie_file) 202 | return 203 | except Exception as e: 204 | print "-->"+str(e) 205 | continue 206 | 207 | url = request.url 208 | 209 | purl = urlsplit(url) 210 | 211 | 212 | probe = None 213 | 214 | probe = self.send_probe(request, errors) 215 | 216 | if probe: 217 | if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO: 218 | 219 | requests = probe.requests 220 | 221 | if probe.html: 222 | request.html = probe.html 223 | 224 | if len(probe.user_output) > 0: 225 | request.user_output = probe.user_output 226 | 227 | else : 228 | errors.append(ERROR_PROBEFAILURE) 229 | # get urls with python to continue crawling 230 | if Shared.options['use_urllib_onerror'] == False: 231 | continue 232 | try: 233 | hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy']) 234 | requests = hr.get_requests() 235 | except Exception as e: 236 | errors.append(str(e)) 237 | 238 | 239 | # set out_of_scope, apply user-supplied filters to urls (ie group_qs) 240 | adjust_requests(requests) 241 | 242 | Shared.main_condition.acquire() 243 | res = CrawlResult(request, requests, errors) 244 | Shared.crawl_results.append(res) 245 | Shared.main_condition.notify() 246 | Shared.main_condition.release() 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /core/crawl/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/lib/__init__.py 
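The PhantomJS probe (core/crawl/probe/analyze.js, later in this tree) emits its results as an incrementally built JSON array: an opening `[`, comma-terminated entries such as `["cookies", ...]`, `["request", {...}]` and `["user", ...]`, and finally a status object followed by `]`. When PhantomJS crashes or is killed before printing that closing object, `CrawlerThread.load_probe_json()` above repairs the stream by appending a synthetic "ok" status marked as partial content. A minimal standalone sketch of that repair step, using hypothetical probe output:

```python
# Sketch of the repair performed by CrawlerThread.load_probe_json() above.
# The truncated output here is hypothetical; a real probe prints similar
# comma-terminated entries followed by a closing status object and "]".
import json

truncated = (
    '[\n'
    '["cookies", [{"name": "PHPSESSID", "value": "abc123"}]],\n'
    '["request", {"type": "link", "method": "GET", "url": "http://example.com/a", "data": null}],'
)

jsn = truncated.strip()
if jsn[-1] != "]":
    # the probe died before printing its status: close the array with a
    # synthetic "ok" status flagged as partial content
    jsn += '{"status":"ok", "partialcontent":true}]'

entries = json.loads(jsn)
status = entries.pop()    # the status object is always the last element
print(status)             # e.g. {'status': 'ok', 'partialcontent': True}
for key, val in entries:  # remaining entries are [key, value] pairs
    print(key)            # "cookies", "request", ...
```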
-------------------------------------------------------------------------------- /core/crawl/lib/crawl_result.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | 14 | 15 | class CrawlResult: 16 | def __init__(self, request, found_requests = None, errors = None): 17 | self.request = request 18 | self.found_requests = found_requests if found_requests else [] 19 | self.errors = errors if errors else [] 20 | 21 | -------------------------------------------------------------------------------- /core/crawl/lib/probe.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | 14 | from core.lib.request import Request 15 | from core.lib.cookie import Cookie 16 | from core.constants import * 17 | 18 | class Probe: 19 | 20 | def __init__(self, data, parent): 21 | self.status = "ok" 22 | self.requests = [] 23 | self.cookies = [] 24 | self.redirect = None; 25 | # if True the probe returned no error BUT the json is not closed properly 26 | self.partialcontent = False 27 | self.html = None 28 | self.user_output = [] 29 | 30 | status = data.pop() 31 | 32 | if status['status'] == "error": 33 | self.status = "error" 34 | self.errcode = status['code'] 35 | 36 | 37 | if "partialcontent" in status: 38 | self.partialcontent = status['partialcontent'] 39 | 40 | # grap cookies before creating rquests 41 | for key,val in data: 42 | if key == "cookies": 43 | for cookie in val: 44 | self.cookies.append(Cookie(cookie, parent.url)) 45 | 46 | if "redirect" in status: 47 | self.redirect = status['redirect'] 48 | r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent, set_cookie=self.cookies, parent_db_id=parent.db_id) 49 | self.requests.append(r) 50 | 51 | for key,val in data: 52 | if key == "request": 53 | trigger = val['trigger'] if 'trigger' in val else None 54 | r = Request(val['type'], val['method'], val['url'], parent=parent, set_cookie=self.cookies, data=val['data'], trigger=trigger, parent_db_id=parent.db_id ) 55 | self.requests.append(r) 56 | elif key == "html": 57 | self.html = val 58 | elif key == "user": 59 | self.user_output.append(val) 60 | 61 | 62 | 63 | # @TODO handle cookies set by ajax (in probe too) 64 | -------------------------------------------------------------------------------- /core/crawl/lib/shared.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | 14 | class Shared: 15 | """ 16 | data shared between threads 17 | """ 18 | 19 | main_condition = None 20 | th_condition = None 21 | 22 | requests = [] 23 | requests_index = 0 24 | crawl_results = [] 25 | 26 | starturl = "" 27 | start_cookies = [] 28 | allowed_domains = set() 29 | excluded_urls = set() 30 | 31 | options = {} 32 | 33 | -------------------------------------------------------------------------------- /core/crawl/lib/urlfinder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 11 | """ 12 | 13 | import re 14 | from HTMLParser import HTMLParser 15 | from urlparse import urljoin, urlparse 16 | 17 | 18 | class UrlFinder: 19 | def __init__(self, html): 20 | self.html = html 21 | 22 | def get_urls(self): 23 | 24 | try: 25 | parser = UrlHTMLParser() 26 | parser.feed(self.html) 27 | except: 28 | raise 29 | 30 | return parser.urls 31 | 32 | 33 | class UrlHTMLParser(HTMLParser): 34 | def __init__(self): 35 | 36 | HTMLParser.__init__(self) 37 | self.base_url = "" 38 | self.urls = [] 39 | 40 | def handle_starttag(self, tag, attrs): 41 | # more info about the tag: https://www.w3.org/wiki/HTML/Elements/base 42 | if tag == "base": 43 | for key, val in attrs: 44 | if key == "href": 45 | self.base_url = urlparse(val.strip()).geturl() 46 | 47 | elif tag == "a": 48 | for key, val in attrs: 49 | if key == "href": 50 | if re.match("^https?://", val, re.I): 51 | self.urls.extend([val]) 52 | elif not re.match("^[a-z]+:", val, re.I) and not val.startswith("#"): 53 | self.urls.extend([urljoin(self.base_url, val)]) 54 | -------------------------------------------------------------------------------- /core/crawl/lib/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - beta 1 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | from urlparse import urljoin 14 | from core.lib.cookie import Cookie 15 | from core.lib.utils import * 16 | from shared import * 17 | import posixpath 18 | import json 19 | import re 20 | 21 | 22 | 23 | def request_in_scope(request): 24 | url = request.url 25 | purl = urlsplit(url) 26 | spurl = urlsplit(Shared.starturl) 27 | scope = Shared.options['scope'] 28 | in_scope = False 29 | 30 | # check for scopes 31 | if scope == CRAWLSCOPE_DOMAIN: 32 | for pattern in Shared.allowed_domains: 33 | if re.match(pattern, purl.hostname): 34 | in_scope = True 35 | break 36 | 37 | elif scope == CRAWLSCOPE_DIRECTORY: 38 | if purl.hostname != spurl.hostname: 39 | in_scope = False 40 | else: 41 | path = [p for p in posixpath.dirname(purl.path).split("/") if p] 42 | spath = [p for p in posixpath.dirname(spurl.path).split("/") if p] 43 | in_scope = path[:len(spath)] == spath 44 | 45 | elif scope == CRAWLSCOPE_URL: 46 | in_scope = url == Shared.starturl 47 | 48 | 49 | # check for excluded urls 50 | for pattern in Shared.excluded_urls: 51 | if re.match(pattern, request.url): 52 | in_scope = False 53 | break 54 | 55 | return in_scope 56 | 57 | 58 | 59 | def adjust_requests(requests): 60 | """ 61 | adjust an array of requsts according to current status/settings 62 | 1. sets the out_of_scope property 63 | 2. normalize url accoding to user settings 64 | """ 65 | 66 | for request in requests: 67 | if request.type == REQTYPE_UNKNOWN or not request_in_scope(request): 68 | request.out_of_scope = True 69 | 70 | if Shared.options['group_qs']: 71 | request.url = group_qs_params(request.url) 72 | 73 | return requests 74 | 75 | 76 | def request_depth(request): 77 | if request.parent == None: 78 | return 1 79 | 80 | return 1 + request_depth(request.parent) 81 | 82 | 83 | 84 | def request_post_depth(request): 85 | if request.method != "POST": 86 | return 0 87 | 88 | if request.parent == None or request.parent.method != "POST": 89 | return 1 90 | 91 | return 1 + request_post_depth(request.parent) 92 | 93 | 94 | 95 | def request_is_crawlable(request): 96 | if request.out_of_scope: 97 | return False 98 | 99 | types = [REQTYPE_LINK, REQTYPE_REDIRECT] 100 | if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']: 101 | types.append(REQTYPE_FORM) 102 | 103 | return request.type in types and re.match("^https?://", request.url, re.I) 104 | 105 | 106 | -------------------------------------------------------------------------------- /core/crawl/probe/analyze.js: -------------------------------------------------------------------------------- 1 | /* 2 | HTCAP - beta 1 3 | Author: filippo.cavallarin@wearesegment.com 4 | 5 | This program is free software; you can redistribute it and/or modify it under 6 | the terms of the GNU General Public License as published by the Free Software 7 | Foundation; either version 2 of the License, or (at your option) any later 8 | version. 
9 | */ 10 | 11 | var system = require('system'); 12 | var fs = require('fs'); 13 | 14 | 15 | 16 | phantom.injectJs("functions.js"); 17 | phantom.injectJs("options.js"); 18 | phantom.injectJs("probe.js"); 19 | 20 | 21 | var startTime = Date.now(); 22 | 23 | 24 | var site = ""; 25 | var response = null; 26 | //var showHelp = false; 27 | 28 | var headers = {}; 29 | 30 | var args = getopt(system.args,"hVaftUJdICc:MSEp:Tsx:A:r:mHX:PD:R:Oi:u:v"); 31 | 32 | var page = require('webpage').create(); 33 | var page_settings = {encoding: "utf8"}; 34 | var random = "IsHOulDb34RaNd0MsTR1ngbUt1mN0t"; 35 | //var injectScript = "{}"; 36 | var US = null; 37 | 38 | var userInterface = { 39 | id: null, 40 | vars: {}, 41 | pageEval: function(fnc){ 42 | var sfnc = 'return (' + fnc.toString() + ').apply(null, arguments)'; 43 | return page.evaluate(function(fnc){ 44 | return (new Function('', fnc)).apply(null, window.__PROBE__.currentUserScriptParameters) 45 | }, sfnc); 46 | }, 47 | render: function(file){ 48 | try { 49 | page.render(file); 50 | return true; 51 | } catch(e){ 52 | return false; 53 | } 54 | }, 55 | print: function(str){ 56 | console.log('["user",' + JSON.stringify(str) + '],'); 57 | }, 58 | fread: function(file){ 59 | try{ 60 | return "" + fs.read(file); 61 | } catch(e){ 62 | return false; 63 | } 64 | }, 65 | fwrite: function(file, content, mode){ 66 | try { 67 | fs.write(file, content, mode || 'w'); 68 | return true; 69 | } catch(e) { 70 | console.log(e) 71 | return false; 72 | } 73 | } 74 | } 75 | 76 | if(typeof args == 'string'){ 77 | console.log("Error: " + args); 78 | phantom.exit(-1); 79 | } 80 | 81 | for(var a = 0; a < args.opts.length; a++){ 82 | switch(args.opts[a][0]){ 83 | case "h": 84 | usage(); 85 | phantom.exit(1); 86 | break; 87 | case "P": 88 | page_settings.operation = "POST"; 89 | break; 90 | case "D": 91 | page_settings.data = args.opts[a][1]; 92 | break; 93 | case "R": 94 | random = args.opts[a][1]; 95 | break; 96 | case "u": 97 | if(!phantom.injectJs(args.opts[a][1])){ 98 | console.log("File not found: " + args.opts[a][1]); 99 | phantom.exit(0); 100 | } 101 | if(!window.US){ 102 | phantom.exit(0); 103 | } 104 | break; 105 | case "v": 106 | phantom.exit(0); 107 | } 108 | } 109 | 110 | 111 | parseArgsToOptions(args); 112 | userInterface.id = options.id; 113 | 114 | site = args.args[1]; 115 | 116 | if(!site){ 117 | usage(); 118 | phantom.exit(-1); 119 | } 120 | 121 | site = site.trim(); 122 | if(site.length < 4 || site.substring(0,4).toLowerCase() != "http"){ 123 | site = "http://" + site; 124 | } 125 | 126 | console.log("["); 127 | 128 | /* maximum execution time */ 129 | setTimeout(execTimedOut,options.maxExecTime); 130 | 131 | 132 | 133 | phantom.onError = function(msg, trace) { 134 | var msgStack = ['PHANTOM ERROR: ' + msg]; 135 | if (trace && trace.length) { 136 | msgStack.push('TRACE:'); 137 | trace.forEach(function(t) { 138 | msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + (t.function ? 
' (in function ' + t.function +')' : '')); 139 | }); 140 | } 141 | console.error(msgStack.join('\n')); 142 | phantom.exit(1); 143 | }; 144 | 145 | 146 | 147 | page.onConsoleMessage = function(msg, lineNum, sourceId) { 148 | if(options.verbose) 149 | console.log("console: " + msg); 150 | } 151 | page.onError = function(msg, lineNum, sourceId) { 152 | if(options.verbose) 153 | console.log("console error: on " + JSON.stringify(lineNum) + " " + msg); 154 | } 155 | 156 | page.onAlert = function(msg) { 157 | if(options.verbose) 158 | console.log('ALERT: ' + msg); 159 | }; 160 | 161 | page.settings.userAgent = options.userAgent; 162 | page.settings.loadImages = options.loadImages; 163 | 164 | 165 | 166 | page.onResourceReceived = function(resource) { 167 | if(window.response == null){ 168 | window.response = resource; 169 | // @TODO sanytize response.contentType 170 | 171 | } 172 | }; 173 | 174 | 175 | page.onResourceRequested = function(requestData, networkRequest) { 176 | //console.log(JSON.stringify(requestData)) 177 | }; 178 | 179 | // to detect window.location= / document.location.href= 180 | page.onNavigationRequested = onNavigationRequested; 181 | 182 | page.onConfirm = function(msg) {return true;} // recently changed 183 | 184 | /* phantomjs issue #11684 workaround */ 185 | var isPageInitialized = false; 186 | page.onInitialized = function(){ 187 | if(isPageInitialized) return; 188 | isPageInitialized = true; 189 | 190 | // try to hide phantomjs 191 | page.evaluate(function(){ 192 | window.__callPhantom = window.callPhantom; 193 | delete window.callPhantom; 194 | }); 195 | 196 | startProbe(random/*, injectScript*/); 197 | 198 | }; 199 | 200 | 201 | page.onCallback = function(data) { 202 | switch(data.cmd){ 203 | case "triggerUserEvent": 204 | var ret = window.US[data.argument.name](window.userInterface) 205 | return ret; 206 | case "print": 207 | console.log(data.argument); 208 | break; 209 | 210 | case "end": 211 | if(options.returnHtml){ 212 | page.evaluate(function(options){ 213 | window.__PROBE__.printPageHTML(); 214 | }, options); 215 | } 216 | 217 | page.evaluate(function(options){ 218 | window.__PROBE__.triggerUserEvent("onEnd"); 219 | }); 220 | 221 | printStatus("ok", window.response.contentType); 222 | phantom.exit(0); 223 | break; 224 | 225 | } 226 | 227 | } 228 | 229 | 230 | 231 | if(options.httpAuth){ 232 | headers['Authorization'] = 'Basic ' + btoa(options.httpAuth[0] + ":" + options.httpAuth[1]); 233 | } 234 | 235 | if(options.referer){ 236 | headers['Referer'] = options.referer; 237 | } 238 | 239 | page.customHeaders = headers; 240 | 241 | 242 | for(var a = 0; a < options.setCookies.length; a++){ 243 | // maybe this is wrogn acconding to rfc .. but phantomjs cannot set cookie witout a domain... 
244 | if(!options.setCookies[a].domain){ 245 | var purl = document.createElement("a"); 246 | purl.href=site; 247 | options.setCookies[a].domain = purl.hostname 248 | } 249 | if(options.setCookies[a].expires) 250 | options.setCookies[a].expires *= 1000; 251 | 252 | phantom.addCookie(options.setCookies[a]); 253 | 254 | } 255 | 256 | page.viewportSize = { 257 | width: 1920, 258 | height: 1080 259 | }; 260 | 261 | 262 | 263 | 264 | page.open(site, page_settings, function(status) { 265 | var response = window.response; // just to be clear 266 | if (status !== 'success'){ 267 | var mess = ""; 268 | var out = {response: response}; 269 | if(!response || response.headers.length == 0){ 270 | printStatus("error", "load"); 271 | phantom.exit(1); 272 | } 273 | 274 | // check for redirect first 275 | for(var a = 0; a < response.headers.length; a++){ 276 | if(response.headers[a].name.toLowerCase() == 'location'){ 277 | 278 | if(options.getCookies){ 279 | printCookies(response.headers, site); 280 | } 281 | printStatus("ok", null, null, response.headers[a].value); 282 | phantom.exit(0); 283 | } 284 | } 285 | 286 | assertContentTypeHtml(response); 287 | 288 | phantom.exit(1); 289 | } 290 | 291 | 292 | if(options.getCookies){ 293 | printCookies(response.headers, site); 294 | } 295 | 296 | assertContentTypeHtml(response); 297 | 298 | page.evaluate(function(){ 299 | 300 | window.__PROBE__.waitAjax(function(ajaxTriggered){ 301 | window.__PROBE__.triggerUserEvent("onStart"); 302 | if(ajaxTriggered){ 303 | window.__PROBE__.triggerUserEvent("onAllXhrsCompleted"); 304 | } 305 | console.log("startAnalysis") 306 | window.__PROBE__.startAnalysis(); 307 | }); 308 | }) 309 | 310 | 311 | }); 312 | 313 | 314 | 315 | -------------------------------------------------------------------------------- /core/crawl/probe/functions.js: -------------------------------------------------------------------------------- 1 | /* 2 | HTCAP - beta 1 3 | Author: filippo.cavallarin@wearesegment.com 4 | 5 | This program is free software; you can redistribute it and/or modify it under 6 | the terms of the GNU General Public License as published by the Free Software 7 | Foundation; either version 2 of the License, or (at your option) any later 8 | version. 
9 | */ 10 | 11 | // @todo error on Unknown option ds 12 | function getopt(arguments, optstring){ 13 | var args = arguments.slice(); 14 | var ret = { 15 | opts: [], 16 | args: args 17 | }; 18 | 19 | var m = optstring.match(/[a-zA-Z]\:*/g); 20 | for(var a = 0; a < m.length; a++){ 21 | var ai = args.indexOf("-" + m[a][0]); 22 | if(ai > -1){ 23 | if(m[a][1] == ":"){ 24 | if(args[ai+1]){ 25 | ret.opts.push([m[a][0], args[ai+1]]); 26 | args.splice(ai,2); 27 | } else { 28 | return "missing argumnet for option " + m[a][0]; 29 | } 30 | } else { 31 | ret.opts.push([m[a][0]]); 32 | args.splice(ai,1); 33 | } 34 | } 35 | } 36 | 37 | return ret; 38 | } 39 | 40 | 41 | function removeHash(url){ 42 | var anchor = document.createElement("a"); 43 | anchor.href = url; 44 | 45 | return anchor.protocol + "//" + anchor.host + anchor.pathname + anchor.search; 46 | } 47 | 48 | 49 | 50 | function compareUrls(url1, url2, includeHash){ 51 | var a1 = document.createElement("a"); 52 | var a2 = document.createElement("a"); 53 | a1.href = url1; 54 | a2.href = url2; 55 | 56 | var eq = (a1.protocol == a2.protocol && a1.host == a2.host && a1.pathname == a2.pathname && a1.search == a2.search); 57 | 58 | if(includeHash) eq = eq && a1.hash == a2.hash; 59 | 60 | return eq; 61 | 62 | } 63 | 64 | 65 | function printCookies(headers, site){ 66 | var cookies = getCookies(headers, site); 67 | console.log('["cookies",' + JSON.stringify(cookies) + "],"); 68 | } 69 | 70 | 71 | function printStatus(status, errcode, message, redirect){ 72 | var o = {status:status}; 73 | if(status == "error"){ 74 | o.code = errcode; 75 | switch(errcode){ 76 | case "load": 77 | break; 78 | case "contentType": 79 | o.message = message; 80 | break; 81 | case "requestTimeout": 82 | break; 83 | case "probe_timeout": 84 | break; 85 | } 86 | } 87 | if(redirect) o.redirect = redirect; 88 | o.time = Math.floor((Date.now() - window.startTime)/1000); 89 | console.log(JSON.stringify(o)); 90 | console.log("]") 91 | } 92 | 93 | 94 | 95 | function execTimedOut(){ 96 | if(!response || response.headers.length == 0){ 97 | printStatus("error", "requestTimeout"); 98 | phantom.exit(0); 99 | } 100 | printStatus("error", "probe_timeout"); 101 | phantom.exit(0); 102 | 103 | } 104 | 105 | 106 | 107 | function usage(){ 108 | var usage = "Usage: analyze.js [options] \n" + 109 | " -V verbose\n" + 110 | " -a don't check ajax\n" + 111 | " -f don't fill values\n" + 112 | " -t don't trigger events (onload only)\n" + 113 | " -s don't check websockets\n" + 114 | " -M dont' map events\n" + 115 | " -T don't trigger mapped events\n" + 116 | " -S don't check for \n" 216 | "\n" 217 | "%s\n" 218 | "\n" 219 | ) 220 | 221 | 222 | jsn = "var report = %s;\n" % self.get_json(cur) 223 | 224 | with open("%sreport.html" % base_dir) as html, open("%sreport.js" % base_dir) as js, open("%sstyle.css" % base_dir) as css: 225 | html = base_html % (css.read(), jsn, js.read(), html.read()) 226 | 227 | with open(outfile,'w') as out: 228 | out.write(html) 229 | 230 | print "Report saved to %s" % outfile 231 | 232 | -------------------------------------------------------------------------------- /core/util/utilities/tocurl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import sqlite3 4 | import json 5 | 6 | from core.lib.utils import * 7 | from core.util.base_util import BaseUtil 8 | 9 | reload(sys) 10 | sys.setdefaultencoding('utf8') 11 | 12 | 13 | class Tocurl(BaseUtil): 14 | 15 | @staticmethod 16 | def get_settings(): 17 | 
return dict( 18 | descr = "Export saved requests to curl arguments", 19 | optargs = '', 20 | minargs = 1 21 | ) 22 | 23 | def usage(self): 24 | return ( 25 | "%s\n" 26 | "usage: %s []\n" 27 | % (self.get_settings()['descr'], self.utilname) 28 | ) 29 | 30 | def main(self, args, opts): 31 | qry = "SELECT method, url, data, referer, cookies FROM request WHERE %s" 32 | 33 | dbfile = args[0] 34 | where = args[1] if len(args) > 1 else "1=1" 35 | 36 | conn = sqlite3.connect(dbfile) 37 | conn.row_factory = sqlite3.Row 38 | 39 | cur = conn.cursor() 40 | cur.execute(qry % where) 41 | for req in cur.fetchall(): 42 | cookies = ["%s=%s" % (c['name'],c['value']) for c in json.loads(req['cookies'])] 43 | cookies_str = "Cookie: %s" % ";".join(cookies) if len(cookies) > 0 else "" 44 | method = "POST" if req['method'] == "POST" else "GET" 45 | referer = "Referer: %s" % req['referer'] if req['referer'] else "" 46 | cmd = [ "-k", '-H',cookies_str, '-X', method, '-H', referer, req['url']] 47 | if req['data']: 48 | cmd.extend(['--data', req['data']]) 49 | 50 | print cmd_to_str(cmd) 51 | -------------------------------------------------------------------------------- /core/util/utilities/updcookie.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import sqlite3 4 | import json 5 | import getopt 6 | import os 7 | 8 | from core.util.base_util import BaseUtil 9 | 10 | reload(sys) 11 | sys.setdefaultencoding('utf8') 12 | 13 | class Updcookie(BaseUtil): 14 | 15 | @staticmethod 16 | def get_settings(): 17 | return dict( 18 | descr = "Update the value of a cookie of saved requests", 19 | optargs = '', 20 | minargs = 3 21 | ) 22 | 23 | def usage(self): 24 | return ( 25 | "%s\n" 26 | "usage: %s []\n" 27 | % (self.get_settings()['descr'], self.utilname) 28 | ) 29 | 30 | 31 | def main(self, argv): 32 | qry = """ 33 | SELECT id, cookies 34 | FROM request 35 | WHERE %s 36 | """ 37 | 38 | dbfile = args[0] 39 | cname = args[1] 40 | cvalue = args[2] 41 | 42 | if not os.path.exists(dbfile): 43 | print "No such file %s" % dbfile 44 | sys.exit(1) 45 | 46 | where = args[3] if len(args) > 3 else "1=1" 47 | 48 | conn = sqlite3.connect(dbfile) 49 | conn.row_factory = sqlite3.Row 50 | 51 | cur = conn.cursor() 52 | wcur = conn.cursor() 53 | cur.execute(qry % where) 54 | pages = {} 55 | for res in cur.fetchall(): 56 | cookies = res['cookies'] 57 | if cookies: 58 | #print cookies 59 | cookies = json.loads(cookies) 60 | for cookie in cookies: 61 | if cookie['name'] == cname: 62 | cookie['value'] = cvalue 63 | wcur.execute("update request set cookies=? where id=?",(json.dumps(cookies), res['id'])) 64 | 65 | conn.commit() 66 | cur.close() 67 | wcur.close() 68 | conn.close() 69 | -------------------------------------------------------------------------------- /core/util/utilities/usgen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | HTCAP - htcap.org 5 | Author: filippo.cavallarin@wearesegment.com 6 | 7 | This program is free software; you can redistribute it and/or modify it under 8 | the terms of the GNU General Public License as published by the Free Software 9 | Foundation; either version 2 of the License, or (at your option) any later 10 | version. 
11 | """ 12 | 13 | import sys 14 | import json 15 | import os 16 | 17 | from core.lib.utils import * 18 | from core.util.base_util import BaseUtil 19 | 20 | 21 | class Usgen(BaseUtil): 22 | 23 | @staticmethod 24 | def get_settings(): 25 | return dict( 26 | descr = "Generate a sample user script", 27 | optargs = '', 28 | minargs = 1 29 | ) 30 | 31 | def usage(self): 32 | return ( 33 | "%s\n" 34 | "usage: %s \n" 35 | % (self.get_settings()['descr'], self.utilname) 36 | ) 37 | 38 | 39 | def main(self, args, opts): 40 | usfile = generate_filename(args[0], 'js', False, True) 41 | try: 42 | with open(usfile,'w') as f: 43 | f.write(CONTENT) 44 | print "User Script saved to %s" % usfile 45 | except Exception as e: 46 | print "Unable to write file %s" % usfile 47 | sys.exit(1) 48 | 49 | 50 | CONTENT = """/* 51 | ui Methods: 52 | ui.pageEval(function) - evaluate function in the context of the webpage (no scope chain available) 53 | ui.print(message) - save a per-request user message into the request table 54 | ui.fread(path_to_file) - read from file 55 | ui.fwrite(path_to_file, content, mode) - write to file 56 | ui.render(path_to_file) - save a screenshot of the page current state 57 | */ 58 | 59 | US = { 60 | onInit: function(ui){ 61 | // init local variables 62 | ui.vars.cnt = 0; 63 | 64 | // override native methods 65 | ui.pageEval(function(){ 66 | window.prompt = function(){ return "AAA" }; 67 | }); 68 | }, 69 | 70 | onStart: function(ui){ 71 | ui.pageEval(function(){}); 72 | }, 73 | 74 | onTriggerEvent: function(ui){ 75 | var ok = ui.pageEval(function(element, event){ 76 | if(event == "click" && element.className == 'kill'){ 77 | return false; 78 | } 79 | return true; 80 | }); 81 | // cancel triggering of event 82 | if(!ok) return false; 83 | }, 84 | 85 | onEventTriggered: function(ui){ 86 | ui.pageEval(function(element, event){}); 87 | }, 88 | 89 | onXhr: function(ui){ 90 | var url = ui.pageEval(function(request){ 91 | return request.url; 92 | }); 93 | // cancel XHR request if url matches XXX 94 | if(url.match(/step=4/)){ 95 | ui.print("Skipped XHR to " + url) 96 | return false; 97 | } 98 | }, 99 | onFillInput: function(ui){ 100 | // here it's possible to force a value or prevent it to be filled 101 | // WARNING: do NOT set dynamic values! 
for instance something like 102 | // element.value = Math.random() 103 | // will lead to INFINITE CRAWLING if you crawl forms 104 | return ui.pageEval(function(element){ 105 | if(element.id == "car_vendor"){ 106 | element.value = "Ferrari"; 107 | // prevent element value to be set 108 | return false; 109 | } 110 | }); 111 | }, 112 | onAllXhrsCompleted: function(ui){ 113 | ui.pageEval(function(){}); 114 | }, 115 | 116 | onDomModified: function(ui){ 117 | ui.pageEval(function(rootElements, allElements){}); 118 | // save a screenshot on every DOM change 119 | ui.render(ui.id + "-screen-" + ui.vars.cnt + ".png"); 120 | ui.vars.cnt++; 121 | }, 122 | 123 | onEnd: function(ui){ 124 | ui.pageEval(function(){}); 125 | } 126 | } 127 | """ 128 | 129 | -------------------------------------------------------------------------------- /htcap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | HTCAP - beta 1 6 | Author: filippo.cavallarin@wearesegment.com 7 | 8 | This program is free software; you can redistribute it and/or modify it under 9 | the terms of the GNU General Public License as published by the Free Software 10 | Foundation; either version 2 of the License, or (at your option) any later 11 | version. 12 | """ 13 | 14 | from __future__ import unicode_literals 15 | import sys 16 | import os 17 | import datetime 18 | import time 19 | import getopt 20 | 21 | from core.lib.utils import * 22 | from core.crawl.crawler import Crawler 23 | 24 | from core.util.util import Util 25 | 26 | reload(sys) 27 | sys.setdefaultencoding('utf8') 28 | log = "*"*53 29 | log += """ 30 | * / _ \| _ \ / \\ \ / / ___/ ___| ___ __ _| \ | | * 31 | *| | | | | | |/ _ \\ V /|___ \___ \ / __/ _` | \| | * 32 | *| |_| | |_| / ___ \| | ___) |__) | (_| (_| | |\ |* 33 | * \___/|____/_/ \_\_| |____/____/ \___\__,_|_| \_|* 34 | """ 35 | log += "*"*53 36 | print log 37 | 38 | def usage(): 39 | print ( 40 | "usage: htcap \n" 41 | "Commands: \n" 42 | " crawl run crawler\n" 43 | " util run utility\n" 44 | ) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | if len(sys.argv) < 2: 50 | usage() 51 | sys.exit(1) 52 | 53 | elif sys.argv[1] == "crawl": 54 | Crawler(sys.argv[2:]) 55 | elif sys.argv[1] == "util": 56 | Util(sys.argv[2:]) 57 | else: 58 | usage(); 59 | sys.exit(1) 60 | 61 | sys.exit(0) 62 | -------------------------------------------------------------------------------- /new.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat Premium Data Transfer 3 | 4 | Source Server : local test 5 | Source Server Type : MySQL 6 | Source Server Version : 50542 7 | Source Host : localhost 8 | Source Database : w3a_scan 9 | 10 | Target Server Type : MySQL 11 | Target Server Version : 50542 12 | File Encoding : utf-8 13 | 14 | Date: 06/26/2017 13:51:33 PM 15 | */ 16 | 17 | SET NAMES utf8; 18 | SET FOREIGN_KEY_CHECKS = 0; 19 | 20 | -- ---------------------------- 21 | -- Table structure for `crawl_info` 22 | -- ---------------------------- 23 | DROP TABLE IF EXISTS `crawl_info`; 24 | CREATE TABLE `crawl_info` ( 25 | `id` int(11) NOT NULL AUTO_INCREMENT, 26 | `target` varchar(255) NOT NULL COMMENT 'scan target', 27 | `start_date` varchar(255) NOT NULL COMMENT 'scan start time', 28 | `end_date` varchar(255) NOT NULL COMMENT 'scan end time', 29 | `commandline` varchar(255) NOT NULL COMMENT 'command line used for the scan', 30 | `user_agent` varchar(255) NOT NULL COMMENT 'user agent used for the scan', 31 | PRIMARY KEY (`id`) 32 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT
CHARSET=utf8; 33 | 34 | -- ---------------------------- 35 | -- Table structure for `request` 36 | -- ---------------------------- 37 | DROP TABLE IF EXISTS `request`; 38 | CREATE TABLE `request` ( 39 | `id` int(11) NOT NULL AUTO_INCREMENT, 40 | `taskid` int(11) DEFAULT NULL, 41 | `method` varchar(255) NOT NULL COMMENT 'HTTP method', 42 | `host` varchar(255) DEFAULT NULL COMMENT 'request host', 43 | `url` varchar(255) NOT NULL COMMENT 'URL', 44 | `data` text COMMENT 'request parameters', 45 | `referer` varchar(255) DEFAULT NULL COMMENT 'referer', 46 | `type` varchar(255) NOT NULL COMMENT 'request type', 47 | `redirects` varchar(255) DEFAULT '' COMMENT 'redirect count', 48 | `cookies` varchar(255) DEFAULT NULL COMMENT 'cookie', 49 | `http_auth` varchar(255) DEFAULT NULL COMMENT 'HTTP authentication credentials', 50 | `out_of_scope` varchar(255) DEFAULT NULL, 51 | `trigger` varchar(255) DEFAULT NULL COMMENT 'triggering event', 52 | `html` text COMMENT 'page HTML', 53 | `user_output` text, 54 | PRIMARY KEY (`id`) 55 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8; 56 | 57 | SET FOREIGN_KEY_CHECKS = 1; 58 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | DBUtils 2 | pymysql 3 | HTMLParser 4 | requests 5 | requests_cache 6 | tld --------------------------------------------------------------------------------
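With this fork the crawl results are stored in the MySQL database configured in core/lib/DB_config.py instead of an SQLite file. A minimal sketch of reading them back with pymysql (listed in requirements.txt), assuming the schema from new.sql has been imported and the example credentials shown in the README are in use:

```python
# Minimal sketch: list the requests saved by the most recent crawl.
# Assumes the schema from new.sql is loaded and the connection values below
# match core/lib/DB_config.py (these are the README's example values).
import pymysql

conn = pymysql.connect(host="localhost", port=3306, user="root",
                       password="mysqlroot", db="w3a_scan", charset="utf8")
try:
    with conn.cursor(pymysql.cursors.DictCursor) as cur:
        # each crawl adds one row to crawl_info; its id is stored as request.taskid
        cur.execute("SELECT id, target, start_date, end_date FROM crawl_info ORDER BY id DESC LIMIT 1")
        task = cur.fetchone()
        if task:
            cur.execute("SELECT method, url, type FROM request WHERE taskid = %s", (task["id"],))
            for row in cur.fetchall():
                print("%s %s [%s]" % (row["method"], row["url"], row["type"]))
finally:
    conn.close()
```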