├── README.md
├── core
│   ├── __init__.py
│   ├── constants.py
│   ├── crawl
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── crawler_thread.py
│   │   ├── lib
│   │   │   ├── __init__.py
│   │   │   ├── crawl_result.py
│   │   │   ├── probe.py
│   │   │   ├── shared.py
│   │   │   ├── urlfinder.py
│   │   │   └── utils.py
│   │   └── probe
│   │       ├── analyze.js
│   │       ├── functions.js
│   │       ├── options.js
│   │       └── probe.js
│   ├── lib
│   │   ├── DB_config.py
│   │   ├── __init__.py
│   │   ├── cookie.py
│   │   ├── database.py
│   │   ├── exception.py
│   │   ├── http_get.py
│   │   ├── request.py
│   │   ├── request_pattern.py
│   │   ├── shell.py
│   │   ├── thirdparty
│   │   │   ├── __init__.py
│   │   │   ├── pysocks
│   │   │   │   ├── __init__.py
│   │   │   │   ├── socks.py
│   │   │   │   └── sockshandler.py
│   │   │   └── simhash
│   │   │       └── __init__.py
│   │   └── utils.py
│   └── util
│       ├── __init__.py
│       ├── base_util.py
│       ├── util.py
│       └── utilities
│           ├── __init__.py
│           ├── htmlreport
│           │   ├── report.html
│           │   ├── report.js
│           │   └── style.css
│           ├── login.py
│           ├── login
│           │   └── login.js
│           ├── lsajax.py
│           ├── lsvuln.py
│           ├── report.py
│           ├── tocurl.py
│           ├── updcookie.py
│           └── usgen.py
├── htcap.py
├── new.sql
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
1 | ## HTCAP
2 |
3 | Htcap is a web application scanner able to crawl single page applications (SPAs) recursively by intercepting ajax calls and DOM changes.
4 | Htcap is not just another vulnerability scanner: it focuses mainly on the crawling process and relies on external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration testing of modern web applications.
5 |
6 | More info at [htcap.org](http://htcap.org).
7 |
8 | ## SETUP
9 |
10 | ### Requirements
11 |
12 | 1. Python 2.7
13 | 2. PhantomJS v2 (note: the PhantomJS author no longer maintains the project, so this dependency is unlikely to see further updates)
14 |
15 | ### Download and Run
16 |
17 | ```console
18 | $ git clone https://github.com/0xa-saline/htcap_mysql htcap
19 | $ cd htcap
20 | $ vi core/lib/DB_config.py
21 | # database connection settings
22 | 'host' : 'localhost',
23 | 'user' : 'root',
24 | 'port' : '3306',
25 | 'password' : 'mysqlroot',
26 | 'db' : 'w3a_scan',
27 | $ sudo pip install -r requirements.txt
28 | $ python htcap.py crawl http://0day5.com
29 |
30 | ```
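
For reference, here is a minimal sketch of what `core/lib/DB_config.py` is expected to hold, based only on the keys shown above; the variable name `db_config` and the exact layout are assumptions, not the file's actual contents:

```python
# Hypothetical sketch of core/lib/DB_config.py -- only the keys come from the
# README snippet above; the real file in the repository may be laid out differently.
db_config = {
    'host': 'localhost',      # MySQL host
    'user': 'root',           # MySQL user
    'port': '3306',           # MySQL port
    'password': 'mysqlroot',  # MySQL password
    'db': 'w3a_scan',         # database/schema used to store crawl results
}
```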
31 |
32 | Usage is the same as the original htcap:
33 |
34 | ```bash
35 | $ python htcap.py crawl http://testphp.vulnweb.com
36 | *****************************************************
37 | * / _ \| _ \ / \ \ / / ___/ ___| ___ __ _| \ | | *
38 | *| | | | | | |/ _ \ V /|___ \___ \ / __/ _` | \| | *
39 | *| |_| | |_| / ___ \| | ___) |__) | (_| (_| | |\ |*
40 | * \___/|____/_/ \_\_| |____/____/ \___\__,_|_| \_|*
41 | *****************************************************
42 | . No handlers could be found for logger "tldextract"
43 | [*][debug] http://testphp.vulnweb.com/pictures/
44 | [*][debug] http://testphp.vulnweb.com/images/
45 | [*][debug] http://testphp.vulnweb.com/bxss/
46 | [*][debug] http://testphp.vulnweb.com/Connections/
47 | [*][debug] http://testphp.vulnweb.com/admin/
48 | [*][debug] http://testphp.vulnweb.com/CVS/
49 | [*][debug] http://testphp.vulnweb.com/secured/
50 | [*][debug] http://testphp.vulnweb.com/userinfo.php
51 | [*][debug] http://testphp.vulnweb.com/cart.php
52 | [*][debug] http://testphp.vulnweb.com/logout.php
53 | [*][debug] http://testphp.vulnweb.com/search.php
54 | [*][debug] http://testphp.vulnweb.com/comment.php
55 | [*][debug] http://testphp.vulnweb.com/login.php
56 | [*][debug] http://testphp.vulnweb.com/index.php
57 | [*][debug] http://testphp.vulnweb.com/product.php
58 | [*][debug] http://testphp.vulnweb.com/guestbook.php
59 | . initialized, crawl started with 10 threads
60 | [=================================] 108 of 108 pages processed in 43 minutes
61 | Crawl finished, 108 pages analyzed in 43 minutes
62 | ```
63 |
64 | PhantomJS can be downloaded [here](http://phantomjs.org//download.html). It comes as a self-contained executable with all libraries statically linked, so there is no need to install or compile anything else.
65 |
66 |
67 | ## DOCUMENTATION
68 |
69 | Documentation, examples and demos can be found at the official website [http://htcap.org](http://htcap.org).
70 |
71 |
72 | ## TO DO
73 |
74 | 0. Disable DNS cache refreshing (done)
75 |
76 |
77 | 1. Switch the htcap database backend to MySQL (done)
78 |
79 |
80 | 2. Filter out common analytics and social-sharing URLs (see the filter sketch after this list) (done)
81 |
82 |
83 | 3. Recognize common static file extensions (see the filter sketch after this list) (done)
84 |
85 |
86 | 4. Besides robots.txt, also collect URLs via directory brute forcing and search engines, and recognize directories that cannot be accessed (done)
87 |
88 |
89 | 5. Remove the sqlmap and Arachni scanning features (done)
90 |
91 |
92 | 6. Add page information fingerprinting
93 |
94 |
95 | 7. Add exact-duplicate and similarity-based deduplication (see the simhash sketch after this list)
96 |
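Items 2 and 3 above correspond to the URL filtering step that `crawler.py` performs through `HostFilter(...).urlfilter()` before saving a request. A rough sketch of that kind of filter is shown below; the blacklists and the helper name `url_is_worth_saving` are illustrative assumptions, not the fork's actual implementation:

```python
# Hypothetical URL filter in the spirit of the HostFilter.urlfilter() call used by
# crawler.py; the real class and its blacklists live in the fork and may differ.
from urlparse import urlsplit  # Python 2.7, matching the rest of the codebase

STATIC_EXTENSIONS = (".css", ".png", ".jpg", ".jpeg", ".gif", ".ico",
                     ".svg", ".woff", ".ttf", ".pdf", ".zip")
ANALYTICS_SHARE_HOSTS = ("google-analytics.com", "googletagmanager.com",
                         "hm.baidu.com", "cnzz.com", "addthis.com", "sharethis.com")

def url_is_worth_saving(url):
    """Return False for static resources and common analytics/share endpoints."""
    purl = urlsplit(url)
    # skip obvious static resources by extension
    if purl.path.lower().endswith(STATIC_EXTENSIONS):
        return False
    # skip well-known analytics / social-sharing hosts
    host = (purl.hostname or "").lower()
    if any(host == h or host.endswith("." + h) for h in ANALYTICS_SHARE_HOSTS):
        return False
    return True
```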
97 |
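For item 7, the fork bundles a simhash package under `core/lib/thirdparty/simhash`. The following self-contained sketch shows the general idea of similarity-based deduplication with a 64-bit simhash and a Hamming-distance threshold; it is an illustration under those assumptions, not the code that ships with the fork:

```python
# Self-contained simhash sketch for similarity-based page deduplication.
# Illustration only; the bundled core/lib/thirdparty/simhash package may
# compute fingerprints differently.
import hashlib
import re

def simhash64(text):
    """64-bit simhash over word tokens."""
    weights = [0] * 64
    for token in re.findall(r"\w+", text.lower()):
        h = int(hashlib.md5(token.encode("utf-8")).hexdigest()[:16], 16)
        for bit in range(64):
            weights[bit] += 1 if (h >> bit) & 1 else -1
    fingerprint = 0
    for bit in range(64):
        if weights[bit] > 0:
            fingerprint |= 1 << bit
    return fingerprint

def hamming(a, b):
    """Number of differing bits between two fingerprints."""
    return bin(a ^ b).count("1")

def is_near_duplicate(html_a, html_b, threshold=3):
    """Pages whose fingerprints differ in <= threshold bits count as duplicates."""
    return hamming(simhash64(html_a), simhash64(html_b)) <= threshold
```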
98 | ## demo
99 |
100 | http://htcap.org/scanme/
101 |
102 |
103 |
104 | ## LICENSE
105 |
106 | This program is free software; you can redistribute it and/or modify it under the terms of the [GNU General Public License](https://www.gnu.org/licenses/gpl-2.0.html) as published by the Free Software Foundation; either version 2 of the License, or(at your option) any later version.
107 |
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/__init__.py
--------------------------------------------------------------------------------
/core/constants.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | THSTAT_WAITING = 0
15 | THSTAT_RUNNING = 1
16 |
17 | CRAWLSCOPE_DOMAIN = "domain"
18 | CRAWLSCOPE_DIRECTORY = "directory"
19 | CRAWLSCOPE_URL = "url"
20 |
21 |
22 | CRAWLMODE_PASSIVE = "passive"
23 | CRAWLMODE_ACTIVE = "active"
24 | CRAWLMODE_AGGRESSIVE = "aggressive"
25 |
26 | REQTYPE_LINK = "link"
27 | REQTYPE_XHR = "xhr"
28 | REQTYPE_WS = "websocket"
29 | REQTYPE_JSONP = "jsonp"
30 | REQTYPE_FORM = "form"
31 | REQTYPE_REDIRECT = "redirect"
32 | REQTYPE_UNKNOWN = "unknown"
33 |
34 |
35 | ERROR_CONTENTTYPE = "contentType"
36 | ERROR_TIMEOUT = "timeout"
37 | ERROR_PROBE_TO = "probe_timeout"
38 | ERROR_LOAD = "loaderror"
39 | ERROR_PROBEKILLED = "probe_killed"
40 | ERROR_PROBEFAILURE = "probe_failure"
41 | ERROR_MAXREDIRECTS = "too_many_redirects"
42 | ERROR_CRAWLDEPTH = "crawler_depth_limit_reached"
43 | VULNTYPE_SQLI = "sqli"
44 | VULNTYPE_XSS = "xss"
45 |
--------------------------------------------------------------------------------
/core/crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/__init__.py
--------------------------------------------------------------------------------
/core/crawl/crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from __future__ import unicode_literals
14 | import sys
15 | import os
16 | import datetime
17 | import time
18 | import getopt
19 | import json
20 | import re
21 | from urlparse import urlsplit, urljoin
22 | from urllib import unquote
23 | import urllib2
24 | import threading
25 | import subprocess
26 | from random import choice
27 | import string
28 | import ssl
29 |
30 |
31 | from core.lib.exception import *
32 | from core.lib.cookie import Cookie
33 | from core.lib.database import Database
34 |
35 |
36 | from lib.shared import *
37 | from lib.crawl_result import *
38 | from core.lib.request import Request
39 | from core.lib.http_get import HttpGet
40 | from core.lib.shell import CommandExecutor
41 | #from core.dirburp.dirscan import Dirbuster
42 | from crawler_thread import CrawlerThread
43 |
44 | from core.lib.utils import *
45 | from core.constants import *
46 | from tld import get_tld
47 | from lib.utils import *
48 |
49 | class Crawler:
50 |
51 | def __init__(self, argv):
52 |
53 | self.base_dir = getrealdir(__file__) + os.sep
54 |
55 | self.crawl_start_time = int(time.time())
56 | self.crawl_end_time = None
57 | self.taskid = ''
58 |
59 | self.defaults = {
60 | "useragent": random_useragent(),
61 | "num_threads": 10,
62 | "max_redirects": 10*10,
63 | "out_file_overwrite": False,
64 | "proxy": None,
65 | "http_auth": None,
66 | "use_urllib_onerror": True,
67 | "group_qs": False,
68 | "process_timeout": 450, # when lots of element(~25000) are added dynamically it can take some time..
69 | "set_referer": True,
70 | "scope": CRAWLSCOPE_DOMAIN,
71 | "mode": CRAWLMODE_AGGRESSIVE,
72 | "max_depth": 10000,
73 | "max_post_depth": 1000,
74 | "override_timeout_functions": True,
75 | 'crawl_forms': True# only if mode == CRAWLMODE_AGGRESSIVE
76 | }
77 |
78 |
79 | self.main(argv)
80 |
81 |
82 |
83 | def usage(self):
84 | print (
85 | "usage: htcap [options] url outfile\n"
86 | "Options: \n"
87 | " -h this help\n"
88 | " -q do not display progress informations\n"
89 | " -m MODE set crawl mode:\n"
90 | " - "+CRAWLMODE_PASSIVE+": do not intract with the page\n"
91 | " - "+CRAWLMODE_ACTIVE+": trigger events\n"
92 | " - "+CRAWLMODE_AGGRESSIVE+": also fill input values and crawl forms (default)\n"
93 | " -s SCOPE set crawl scope\n"
94 | " - "+CRAWLSCOPE_DOMAIN+": limit crawling to current domain (default)\n"
95 | " - "+CRAWLSCOPE_DIRECTORY+": limit crawling to current directory (and subdirecotries) \n"
96 | " - "+CRAWLSCOPE_URL+": do not crawl, just analyze a single page\n"
97 | " -D maximum crawl depth (default: " + str(Shared.options['max_depth']) + ")\n"
98 | " -P maximum crawl depth for consecutive forms (default: " + str(Shared.options['max_post_depth']) + ")\n"
99 | " -F even if in aggressive mode, do not crawl forms\n"
100 | " -H save HTML generated by the page\n"
101 | " -d DOMAINS comma separated list of allowed domains (ex *.target.com)\n"
102 | " -c COOKIES cookies as json or name=value pairs separaded by semicolon\n"
103 | " -C COOKIE_FILE path to file containing COOKIES \n"
104 | " -r REFERER set initial referer\n"
105 | " -x EXCLUDED comma separated list of urls to exclude (regex) - ie logout urls\n"
106 | " -p PROXY proxy string protocol:host:port - protocol can be 'http' or 'socks5'\n"
107 | " -n THREADS number of parallel threads (default: " + str(self.defaults['num_threads']) + ")\n"
108 | " -A CREDENTIALS username and password used for HTTP authentication separated by a colon\n"
109 | " -U USERAGENT set user agent\n"
110 | " -t TIMEOUT maximum seconds spent to analyze a page (default " + str(self.defaults['process_timeout']) + ")\n"
111 | " -u USER_SCRIPT inject USER_SCRIPT into any loaded page\n"
112 | " -S skip initial checks\n"
113 | " -G group query_string parameters with the same name ('[]' ending excluded)\n"
114 | " -N don't normalize URL path (keep ../../)\n"
115 | " -R maximum number of redirects to follow (default " + str(self.defaults['max_redirects']) + ")\n"
116 | " -I ignore robots.txt\n"
117 | " -O dont't override timeout functions (setTimeout, setInterval)\n"
118 | " -K keep elements in the DOM (prevent removal)\n"
119 | )
120 |
121 |
122 | def generate_filename(self, name, out_file_overwrite):
123 | fname = generate_filename(name, None, out_file_overwrite)
124 | if out_file_overwrite:
125 | if os.path.exists(fname):
126 | os.remove(fname)
127 |
128 | return fname
129 |
130 |
131 |
132 | def kill_threads(self, threads):
133 | for th in threads:
134 | if th.isAlive(): th.exit = True
135 | # start notify() chain
136 | Shared.th_condition.acquire()
137 | Shared.th_condition.notifyAll()
138 | Shared.th_condition.release()
139 |
140 |
141 |
142 | def parse_cookie_string(self, string):
143 |
144 | cookies = []
145 | try:
146 | cookies = json.loads(string)
147 | except ValueError:
148 | tok = re.split("; *", string)
149 | for t in tok:
150 | k, v = t.split("=", 1)
151 | cookies.append({"name":k.strip(), "value":unquote(v.strip())})
152 | except Exception as e:
153 | raise
154 |
155 | return cookies
156 |
157 |
158 |
159 | def init_db(self, dbname, report_name):
160 | infos = {
161 | "target": Shared.starturl,
162 | "scan_date": -1,
163 | "urls_scanned": -1,
164 | "scan_time": -1,
165 | 'command_line': " ".join(sys.argv)
166 | }
167 |
168 | database = Database(dbname, report_name, infos)
169 | return database
170 |
171 | def check_startrequest(self, request):
172 |
173 | h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy'])
174 | try:
175 | h.get_requests()
176 | except NotHtmlException:
177 | print "\nError: Document is not html"
178 | sys.exit(1)
179 | except Exception as e:
180 | print "\nError: unable to open url: %s" % e
181 | sys.exit(1)
182 |
183 | def get_requests_from_robots(self, request):
184 | purl = urlsplit(request.url)
185 | url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)
186 |
187 | getreq = Request(REQTYPE_LINK, "GET", url)
188 | try:
189 | # request, timeout, retries=None, useragent=None, proxy=None):
190 | httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
191 | lines = httpget.get_file().split("\n")
192 | except urllib2.HTTPError:
193 | return []
194 | except:
195 | raise
196 |
197 | requests = []
198 | for line in lines:
199 | directive = ""
200 | url = None
201 | try:
202 | directive, url = re.sub("\#.*","",line).split(":",1)
203 | except:
204 | continue # ignore errors
205 |
206 | if re.match("(dis)?allow", directive.strip(), re.I):
207 | req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
208 | requests.append(req)
209 |
210 |
211 | return adjust_requests(requests) if requests else []
212 |
213 |
214 | def randstr(self, length):
215 | all_chars = string.digits + string.letters + string.punctuation
216 | random_string = ''.join(choice(all_chars) for _ in range(length))
217 | return random_string
218 |
219 |
220 |
221 | def main_loop(self, threads, start_requests, database, display_progress = True, verbose = False,taskid=''):
222 | pending = len(start_requests)
223 | crawled = 0
224 |
225 | req_to_crawl = start_requests
226 | try:
227 | while True:
228 |
229 | if display_progress and not verbose:
230 | tot = (crawled + pending)
231 | print_progressbar(tot, crawled, self.crawl_start_time, "pages processed")
232 |
233 | if pending == 0:
234 | # is the check of running threads really needed?
235 | running_threads = [t for t in threads if t.status == THSTAT_RUNNING]
236 | if len(running_threads) == 0:
237 | if display_progress or verbose:
238 | print ""
239 | break
240 |
241 | if len(req_to_crawl) > 0:
242 | Shared.th_condition.acquire()
243 | Shared.requests.extend(req_to_crawl)
244 | Shared.th_condition.notifyAll()
245 | Shared.th_condition.release()
246 |
247 | req_to_crawl = []
248 | Shared.main_condition.acquire()
249 | Shared.main_condition.wait(1)
250 | if len(Shared.crawl_results) > 0:
251 | #database.connect()
252 | #database.begin()
253 | for result in Shared.crawl_results:
254 | crawled += 1
255 | pending -= 1
256 | if verbose:
257 | print "crawl result for: %s " % result.request
258 | if len(result.request.user_output) > 0:
259 | print " user: %s" % json.dumps(result.request.user_output)
260 | if result.errors:
261 | print "* crawler errors: %s" % ", ".join(result.errors)
262 |
263 | #database.save_crawl_result(result, True)
264 | for req in result.found_requests:
265 | ######tips
266 | #print req.url,req.data,req.method,Shared.allowed_domains
267 |
268 | if verbose:
269 | print " new request found %s" % req
270 |
271 | urlfilt = HostFilter(req.url)
272 | if urlfilt.urlfilter():
273 | database.save_request(req,taskid)
274 |
275 | if request_is_crawlable(req) and req not in Shared.requests and req not in req_to_crawl:
276 | if request_depth(req) > Shared.options['max_depth'] or request_post_depth(req) > Shared.options['max_post_depth']:
277 | if verbose:
278 | print " * cannot crawl: %s : crawl depth limit reached" % req
279 | result = CrawlResult(req, errors=[ERROR_CRAWLDEPTH])
280 | #database.save_crawl_result(result, False)
281 | continue
282 |
283 | if req.redirects > Shared.options['max_redirects']:
284 | if verbose:
285 | print " * cannot crawl: %s : too many redirects" % req
286 | result = CrawlResult(req, errors=[ERROR_MAXREDIRECTS])
287 | #database.save_crawl_result(result, False)
288 | continue
289 |
290 | pending += 1
291 | req_to_crawl.append(req)
292 |
293 | Shared.crawl_results = []
294 | Shared.main_condition.release()
295 |
296 | except KeyboardInterrupt:
297 | print "\nTerminated by user"
298 | try:
299 | Shared.main_condition.release()
300 | Shared.th_condition.release()
301 | except:
302 | pass
303 |
304 |
305 | def check_user_script_syntax(self, probe_cmd, user_script):
306 | try:
307 | exe = CommandExecutor(probe_cmd + ["-u", user_script, "-v"] , False)
308 | out = exe.execute(5)
309 | if out:
310 | print "\n* USER_SCRIPT error: %s" % out
311 | sys.exit(1)
312 | stdoutw(". ")
313 | except KeyboardInterrupt:
314 | print "\nAborted"
315 | sys.exit(0)
316 |
317 |
318 | def init_crawl(self, start_req, check_starturl, get_robots_txt):
319 | start_requests = [start_req]
320 | try:
321 | if check_starturl:
322 | self.check_startrequest(start_req)
323 | stdoutw(". ")
324 |
325 | if get_robots_txt:
326 | rrequests = self.get_requests_from_robots(start_req)
327 | stdoutw(". ")
328 | for req in rrequests:
329 | if request_is_crawlable(req) and not req in start_requests:
330 | start_requests.append(req)
331 | except KeyboardInterrupt:
332 | print "\nAborted"
333 | sys.exit(0)
334 |
335 | return start_requests
336 |
337 |
338 | def main(self, argv):
339 | Shared.options = self.defaults
340 | Shared.th_condition = threading.Condition()
341 | Shared.main_condition = threading.Condition()
342 |
343 |
344 | probe_cmd = get_phantomjs_cmd()
345 | if not probe_cmd:
346 | print "Error: unable to find phantomjs executable"
347 | sys.exit(1)
348 |
349 | start_cookies = []
350 | start_referer = None
351 |
352 | probe_options = ["-R", self.randstr(20)]
353 | threads = []
354 | num_threads = self.defaults['num_threads']
355 |
356 | out_file = ""
357 | out_file_overwrite = self.defaults['out_file_overwrite']
358 | cookie_string = None
359 | display_progress = True
360 | verbose = False
361 | initial_checks = True
362 | http_auth = None
363 | get_robots_txt = True
364 | save_html = False
365 | user_script = None
366 |
367 | try:
368 | opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
369 | except getopt.GetoptError as err:
370 | print str(err)
371 | sys.exit(1)
372 |
373 |
374 | if len(args) < 1:
375 | self.usage()
376 | sys.exit(1)
377 |
378 |
379 |
380 | for o, v in opts:
381 | if o == '-h':
382 | self.usage()
383 | sys.exit(0)
384 | elif o == '-c':
385 | cookie_string = v
386 | elif o == '-C':
387 | try:
388 | with open(v) as cf:
389 | cookie_string = cf.read()
390 | except Exception as e:
391 | print "error reading cookie file"
392 | sys.exit(1)
393 | elif o == '-r':
394 | start_referer = v
395 | elif o == '-n':
396 | num_threads = int(v)
397 | elif o == '-t':
398 | Shared.options['process_timeout'] = int(v)
399 | elif o == '-q':
400 | display_progress = False
401 | elif o == '-A':
402 | http_auth = v
403 | elif o == '-p':
404 | if v == "tor": v = "socks5:127.0.0.1:9150"
405 | proxy = v.split(":")
406 | if proxy[0] not in ("http", "socks5"):
407 | print "only http and socks5 proxies are supported"
408 | sys.exit(1)
409 | Shared.options['proxy'] = {"proto":proxy[0], "host":proxy[1], "port":proxy[2]}
410 | elif o == '-d':
411 | for ad in v.split(","):
412 | # convert *.domain.com to *.\.domain\.com
413 | pattern = re.escape(ad).replace("\\*\\.","((.*\\.)|)")
414 | Shared.allowed_domains.add(pattern)
415 | elif o == '-x':
416 | for eu in v.split(","):
417 | Shared.excluded_urls.add(eu)
418 | elif o == "-G":
419 | Shared.options['group_qs'] = True
420 | #elif o == "-w":
421 | # out_file_overwrite = True
422 | elif o == "-R":
423 | Shared.options['max_redirects'] = int(v)
424 | elif o == "-U":
425 | Shared.options['useragent'] = v
426 | elif o == "-s":
427 | if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
428 | self.usage()
429 | print "* ERROR: wrong scope set '%s'" % v
430 | sys.exit(1)
431 | Shared.options['scope'] = v
432 | elif o == "-m":
433 | if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
434 | self.usage()
435 | print "* ERROR: wrong mode set '%s'" % v
436 | sys.exit(1)
437 | Shared.options['mode'] = v
438 | elif o == "-S":
439 | initial_checks = False
440 | elif o == "-I":
441 | get_robots_txt = False
442 | elif o == "-H":
443 | save_html = True
444 | elif o == "-D":
445 | Shared.options['max_depth'] = int(v)
446 | elif o == "-P":
447 | Shared.options['max_post_depth'] = int(v)
448 | elif o == "-O":
449 | Shared.options['override_timeout_functions'] = False
450 | elif o == "-F":
451 | Shared.options['crawl_forms'] = False
452 | elif o == "-u":
453 | if os.path.isfile(v):
454 | user_script = os.path.abspath(v)
455 | else:
456 | print "error: unable to open USER_SCRIPT"
457 | sys.exit(1)
458 |
459 |
460 | if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
461 | print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN
462 |
463 | if cookie_string:
464 | try:
465 | start_cookies = self.parse_cookie_string(cookie_string)
466 | except Exception as e:
467 | print "error decoding cookie string"
468 | sys.exit(1)
469 |
470 | if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
471 | probe_options.append("-f") # dont fill values
472 | if Shared.options['mode'] == CRAWLMODE_PASSIVE:
473 | probe_options.append("-t") # dont trigger events
474 |
475 | if Shared.options['proxy']:
476 | probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
477 | probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
478 |
479 | probe_cmd.append(self.base_dir + 'probe/analyze.js')
480 |
481 |
482 | if len(Shared.excluded_urls) > 0:
483 | probe_options.extend(("-X", ",".join(Shared.excluded_urls)))
484 |
485 | if save_html:
486 | probe_options.append("-H")
487 |
488 | if user_script:
489 | probe_options.extend(("-u", user_script))
490 |
491 | probe_options.extend(("-x", str(Shared.options['process_timeout'])))
492 | probe_options.extend(("-A", Shared.options['useragent']))
493 |
494 | if not Shared.options['override_timeout_functions']:
495 | probe_options.append("-O")
496 |
497 | Shared.probe_cmd = probe_cmd + probe_options
498 |
499 |
500 | Shared.starturl = normalize_url(args[0])
501 | #out_file = args[1]
502 |
503 | purl = urlsplit(Shared.starturl)
504 | try:
505 | pdomain = get_tld(Shared.starturl)
506 | except:
507 | pdomain = purl.hostname
508 | if purl.hostname == pdomain:
509 | Shared.allowed_domains.add(purl.hostname)
510 | else:
511 | Shared.allowed_domains.add(pdomain)
512 | Shared.allowed_domains.add(purl.hostname)
513 |
514 |
515 | for sc in start_cookies:
516 | Shared.start_cookies.append(Cookie(sc, Shared.starturl))
517 |
518 |
519 | start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
520 |
521 | if not hasattr(ssl, "SSLContext"):
522 | print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
523 |
524 | if user_script and initial_checks:
525 | self.check_user_script_syntax(probe_cmd, user_script)
526 |
527 | start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)
528 |
529 | database = None
530 | fname = None
531 | try:
532 | database = self.init_db(fname, out_file)
533 | except Exception as e:
534 | print str(e)
535 |
536 | taskid = database.save_crawl_info(
537 | target = Shared.starturl,
538 | start_date = self.crawl_start_time,
539 | commandline = cmd_to_str(argv),
540 | user_agent = Shared.options['useragent']
541 | )
542 | self.taskid = taskid
543 |
544 | for req in start_requests:
545 | urlfilt = HostFilter(req.url)
546 | if urlfilt.urlfilter():
547 | database.save_request(req,self.taskid)
548 |
549 | print "initialized, crawl started with %d threads" % (num_threads)
550 |
551 | for n in range(0, num_threads):
552 | thread = CrawlerThread()
553 | threads.append(thread)
554 | thread.start()
555 |
556 |
557 | self.main_loop(threads, start_requests, database, display_progress, verbose,self.taskid)
558 |
559 | self.kill_threads(threads)
560 |
561 | self.crawl_end_time = int(time.time())
562 |
563 | print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)
564 | database.update_crawl_info(self.taskid,self.crawl_end_time)
565 |
--------------------------------------------------------------------------------
/core/crawl/crawler_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from __future__ import unicode_literals
14 | import time
15 | import re
16 | import json
17 | import urllib
18 | import cookielib
19 | import threading
20 | import base64
21 |
22 | import tempfile
23 | import os
24 | import uuid
25 |
26 | from urlparse import urlparse, urlsplit, urljoin, parse_qsl
27 |
28 | from core.lib.exception import *
29 | from core.crawl.lib.shared import *
30 |
31 |
32 | from core.crawl.lib.probe import Probe
33 |
34 | from core.lib.http_get import HttpGet
35 | from core.lib.cookie import Cookie
36 | from core.lib.shell import CommandExecutor
37 | from core.lib.request import Request
38 |
39 | from core.lib.utils import *
40 | from core.constants import *
41 |
42 | from lib.utils import *
43 | from lib.crawl_result import *
44 |
45 |
46 | class CrawlerThread(threading.Thread):
47 |
48 | def __init__(self):
49 | threading.Thread.__init__(self)
50 | self.thread_uuid = uuid.uuid4()
51 | self.process_retries = 2
52 | self.process_retries_interval = 0.5
53 |
54 | self.status = THSTAT_RUNNING
55 | self.exit = False
56 |
57 | self.cookie_file = "%s%shtcap_cookiefile-%s.json" % (tempfile.gettempdir(), os.sep, self.thread_uuid)
58 |
59 |
60 | def run(self):
61 | self.crawl()
62 |
63 |
64 |
65 | def wait_request(self):
66 | request = None
67 | Shared.th_condition.acquire()
68 | while True:
69 | if self.exit == True:
70 | Shared.th_condition.notifyAll()
71 | Shared.th_condition.release()
72 | raise ThreadExitRequestException("exit request received")
73 |
74 | if Shared.requests_index >= len(Shared.requests):
75 | self.status = THSTAT_WAITING
76 | Shared.th_condition.wait() # The wait method releases the lock, blocks the current thread until another thread calls notify
77 | continue
78 |
79 | request = Shared.requests[Shared.requests_index]
80 | Shared.requests_index += 1
81 |
82 | break
83 |
84 | Shared.th_condition.release()
85 |
86 | self.status = THSTAT_RUNNING
87 |
88 | return request
89 |
90 |
91 |
92 | def load_probe_json(self, jsn):
93 | jsn = jsn.strip()
94 | if not jsn: jsn = "["
95 | if jsn[-1] != "]":
96 | jsn += '{"status":"ok", "partialcontent":true}]'
97 | try:
98 | return json.loads(jsn)
99 | except Exception:
100 | #print "-- JSON DECODE ERROR %s" % jsn
101 | raise
102 |
103 |
104 | def send_probe(self, request, errors):
105 |
106 | url = request.url
107 | jsn = None
108 | probe = None
109 | retries = self.process_retries
110 | params = []
111 | cookies = []
112 |
113 |
114 | if request.method == "POST":
115 | params.append("-P")
116 | if request.data:
117 | params.extend(("-D", request.data))
118 |
119 |
120 | if len(request.cookies) > 0:
121 | for cookie in request.cookies:
122 | cookies.append(cookie.get_dict())
123 |
124 | with open(self.cookie_file,'w') as fil:
125 | fil.write(json.dumps(cookies))
126 |
127 | params.extend(("-c", self.cookie_file))
128 |
129 |
130 |
131 | if request.http_auth:
132 | params.extend(("-p" ,request.http_auth))
133 |
134 | if Shared.options['set_referer'] and request.referer:
135 | params.extend(("-r", request.referer))
136 |
137 |
138 | params.extend(("-i", str(request.db_id)))
139 |
140 | params.append(url)
141 |
142 |
143 | while retries:
144 | #while False:
145 |
146 | # print cmd_to_str(Shared.probe_cmd + params)
147 | # print ""
148 |
149 | cmd = CommandExecutor(Shared.probe_cmd + params)
150 | jsn = cmd.execute(Shared.options['process_timeout'] + 2)
151 |
152 | if jsn == None:
153 | errors.append(ERROR_PROBEKILLED)
154 | time.sleep(self.process_retries_interval) # ... ???
155 | retries -= 1
156 | continue
157 |
158 |
159 | # try to decode json also after an exception .. sometimes phantom crashes BUT returns a valid json ..
160 | try:
161 | if jsn and type(jsn) is not str:
162 | jsn = jsn[0]
163 | probeArray = self.load_probe_json(jsn)
164 | except Exception as e:
165 | raise
166 |
167 |
168 | if probeArray:
169 | probe = Probe(probeArray, request)
170 |
171 | if probe.status == "ok":
172 | break
173 |
174 | errors.append(probe.errcode)
175 |
176 | if probe.errcode in (ERROR_CONTENTTYPE, ERROR_PROBE_TO):
177 | break
178 |
179 | time.sleep(self.process_retries_interval)
180 | retries -= 1
181 |
182 | return probe
183 |
184 |
185 |
186 | def crawl(self):
187 |
188 | while True:
189 | url = None
190 | cookies = []
191 | requests = []
192 |
193 | requests_to_crawl = []
194 | redirects = 0
195 | errors = []
196 |
197 | try:
198 | request = self.wait_request()
199 | except ThreadExitRequestException:
200 | if os.path.exists(self.cookie_file):
201 | os.remove(self.cookie_file)
202 | return
203 | except Exception as e:
204 | print "-->"+str(e)
205 | continue
206 |
207 | url = request.url
208 |
209 | purl = urlsplit(url)
210 |
211 |
212 | probe = None
213 |
214 | probe = self.send_probe(request, errors)
215 |
216 | if probe:
217 | if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
218 |
219 | requests = probe.requests
220 |
221 | if probe.html:
222 | request.html = probe.html
223 |
224 | if len(probe.user_output) > 0:
225 | request.user_output = probe.user_output
226 |
227 | else :
228 | errors.append(ERROR_PROBEFAILURE)
229 | # get urls with python to continue crawling
230 | if Shared.options['use_urllib_onerror'] == False:
231 | continue
232 | try:
233 | hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
234 | requests = hr.get_requests()
235 | except Exception as e:
236 | errors.append(str(e))
237 |
238 |
239 | # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
240 | adjust_requests(requests)
241 |
242 | Shared.main_condition.acquire()
243 | res = CrawlResult(request, requests, errors)
244 | Shared.crawl_results.append(res)
245 | Shared.main_condition.notify()
246 | Shared.main_condition.release()
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/core/crawl/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/lib/__init__.py
--------------------------------------------------------------------------------
/core/crawl/lib/crawl_result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 |
15 | class CrawlResult:
16 | def __init__(self, request, found_requests = None, errors = None):
17 | self.request = request
18 | self.found_requests = found_requests if found_requests else []
19 | self.errors = errors if errors else []
20 |
21 |
--------------------------------------------------------------------------------
/core/crawl/lib/probe.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | from core.lib.request import Request
15 | from core.lib.cookie import Cookie
16 | from core.constants import *
17 |
18 | class Probe:
19 |
20 | def __init__(self, data, parent):
21 | self.status = "ok"
22 | self.requests = []
23 | self.cookies = []
24 | self.redirect = None;
25 | # if True the probe returned no error BUT the json is not closed properly
26 | self.partialcontent = False
27 | self.html = None
28 | self.user_output = []
29 |
30 | status = data.pop()
31 |
32 | if status['status'] == "error":
33 | self.status = "error"
34 | self.errcode = status['code']
35 |
36 |
37 | if "partialcontent" in status:
38 | self.partialcontent = status['partialcontent']
39 |
40 | # grab cookies before creating requests
41 | for key,val in data:
42 | if key == "cookies":
43 | for cookie in val:
44 | self.cookies.append(Cookie(cookie, parent.url))
45 |
46 | if "redirect" in status:
47 | self.redirect = status['redirect']
48 | r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent, set_cookie=self.cookies, parent_db_id=parent.db_id)
49 | self.requests.append(r)
50 |
51 | for key,val in data:
52 | if key == "request":
53 | trigger = val['trigger'] if 'trigger' in val else None
54 | r = Request(val['type'], val['method'], val['url'], parent=parent, set_cookie=self.cookies, data=val['data'], trigger=trigger, parent_db_id=parent.db_id )
55 | self.requests.append(r)
56 | elif key == "html":
57 | self.html = val
58 | elif key == "user":
59 | self.user_output.append(val)
60 |
61 |
62 |
63 | # @TODO handle cookies set by ajax (in probe too)
64 |
--------------------------------------------------------------------------------
/core/crawl/lib/shared.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | class Shared:
15 | """
16 | data shared between threads
17 | """
18 |
19 | main_condition = None
20 | th_condition = None
21 |
22 | requests = []
23 | requests_index = 0
24 | crawl_results = []
25 |
26 | starturl = ""
27 | start_cookies = []
28 | allowed_domains = set()
29 | excluded_urls = set()
30 |
31 | options = {}
32 |
33 |
--------------------------------------------------------------------------------
/core/crawl/lib/urlfinder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | import re
14 | from HTMLParser import HTMLParser
15 | from urlparse import urljoin, urlparse
16 |
17 |
18 | class UrlFinder:
19 | def __init__(self, html):
20 | self.html = html
21 |
22 | def get_urls(self):
23 |
24 | try:
25 | parser = UrlHTMLParser()
26 | parser.feed(self.html)
27 | except:
28 | raise
29 |
30 | return parser.urls
31 |
32 |
33 | class UrlHTMLParser(HTMLParser):
34 | def __init__(self):
35 |
36 | HTMLParser.__init__(self)
37 | self.base_url = ""
38 | self.urls = []
39 |
40 | def handle_starttag(self, tag, attrs):
41 | # more info about the tag: https://www.w3.org/wiki/HTML/Elements/base
42 | if tag == "base":
43 | for key, val in attrs:
44 | if key == "href":
45 | self.base_url = urlparse(val.strip()).geturl()
46 |
47 | elif tag == "a":
48 | for key, val in attrs:
49 | if key == "href":
50 | if re.match("^https?://", val, re.I):
51 | self.urls.extend([val])
52 | elif not re.match("^[a-z]+:", val, re.I) and not val.startswith("#"):
53 | self.urls.extend([urljoin(self.base_url, val)])
54 |
--------------------------------------------------------------------------------
/core/crawl/lib/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from urlparse import urljoin
14 | from core.lib.cookie import Cookie
15 | from core.lib.utils import *
16 | from shared import *
17 | import posixpath
18 | import json
19 | import re
20 |
21 |
22 |
23 | def request_in_scope(request):
24 | url = request.url
25 | purl = urlsplit(url)
26 | spurl = urlsplit(Shared.starturl)
27 | scope = Shared.options['scope']
28 | in_scope = False
29 |
30 | # check for scopes
31 | if scope == CRAWLSCOPE_DOMAIN:
32 | for pattern in Shared.allowed_domains:
33 | if re.match(pattern, purl.hostname):
34 | in_scope = True
35 | break
36 |
37 | elif scope == CRAWLSCOPE_DIRECTORY:
38 | if purl.hostname != spurl.hostname:
39 | in_scope = False
40 | else:
41 | path = [p for p in posixpath.dirname(purl.path).split("/") if p]
42 | spath = [p for p in posixpath.dirname(spurl.path).split("/") if p]
43 | in_scope = path[:len(spath)] == spath
44 |
45 | elif scope == CRAWLSCOPE_URL:
46 | in_scope = url == Shared.starturl
47 |
48 |
49 | # check for excluded urls
50 | for pattern in Shared.excluded_urls:
51 | if re.match(pattern, request.url):
52 | in_scope = False
53 | break
54 |
55 | return in_scope
56 |
57 |
58 |
59 | def adjust_requests(requests):
60 | """
61 | adjust an array of requests according to current status/settings
62 | 1. sets the out_of_scope property
63 | 2. normalizes urls according to user settings
64 | """
65 |
66 | for request in requests:
67 | if request.type == REQTYPE_UNKNOWN or not request_in_scope(request):
68 | request.out_of_scope = True
69 |
70 | if Shared.options['group_qs']:
71 | request.url = group_qs_params(request.url)
72 |
73 | return requests
74 |
75 |
76 | def request_depth(request):
77 | if request.parent == None:
78 | return 1
79 |
80 | return 1 + request_depth(request.parent)
81 |
82 |
83 |
84 | def request_post_depth(request):
85 | if request.method != "POST":
86 | return 0
87 |
88 | if request.parent == None or request.parent.method != "POST":
89 | return 1
90 |
91 | return 1 + request_post_depth(request.parent)
92 |
93 |
94 |
95 | def request_is_crawlable(request):
96 | if request.out_of_scope:
97 | return False
98 |
99 | types = [REQTYPE_LINK, REQTYPE_REDIRECT]
100 | if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
101 | types.append(REQTYPE_FORM)
102 |
103 | return request.type in types and re.match("^https?://", request.url, re.I)
104 |
105 |
106 |
--------------------------------------------------------------------------------
/core/crawl/probe/analyze.js:
--------------------------------------------------------------------------------
1 | /*
2 | HTCAP - beta 1
3 | Author: filippo.cavallarin@wearesegment.com
4 |
5 | This program is free software; you can redistribute it and/or modify it under
6 | the terms of the GNU General Public License as published by the Free Software
7 | Foundation; either version 2 of the License, or (at your option) any later
8 | version.
9 | */
10 |
11 | var system = require('system');
12 | var fs = require('fs');
13 |
14 |
15 |
16 | phantom.injectJs("functions.js");
17 | phantom.injectJs("options.js");
18 | phantom.injectJs("probe.js");
19 |
20 |
21 | var startTime = Date.now();
22 |
23 |
24 | var site = "";
25 | var response = null;
26 | //var showHelp = false;
27 |
28 | var headers = {};
29 |
30 | var args = getopt(system.args,"hVaftUJdICc:MSEp:Tsx:A:r:mHX:PD:R:Oi:u:v");
31 |
32 | var page = require('webpage').create();
33 | var page_settings = {encoding: "utf8"};
34 | var random = "IsHOulDb34RaNd0MsTR1ngbUt1mN0t";
35 | //var injectScript = "{}";
36 | var US = null;
37 |
38 | var userInterface = {
39 | id: null,
40 | vars: {},
41 | pageEval: function(fnc){
42 | var sfnc = 'return (' + fnc.toString() + ').apply(null, arguments)';
43 | return page.evaluate(function(fnc){
44 | return (new Function('', fnc)).apply(null, window.__PROBE__.currentUserScriptParameters)
45 | }, sfnc);
46 | },
47 | render: function(file){
48 | try {
49 | page.render(file);
50 | return true;
51 | } catch(e){
52 | return false;
53 | }
54 | },
55 | print: function(str){
56 | console.log('["user",' + JSON.stringify(str) + '],');
57 | },
58 | fread: function(file){
59 | try{
60 | return "" + fs.read(file);
61 | } catch(e){
62 | return false;
63 | }
64 | },
65 | fwrite: function(file, content, mode){
66 | try {
67 | fs.write(file, content, mode || 'w');
68 | return true;
69 | } catch(e) {
70 | console.log(e)
71 | return false;
72 | }
73 | }
74 | }
75 |
76 | if(typeof args == 'string'){
77 | console.log("Error: " + args);
78 | phantom.exit(-1);
79 | }
80 |
81 | for(var a = 0; a < args.opts.length; a++){
82 | switch(args.opts[a][0]){
83 | case "h":
84 | usage();
85 | phantom.exit(1);
86 | break;
87 | case "P":
88 | page_settings.operation = "POST";
89 | break;
90 | case "D":
91 | page_settings.data = args.opts[a][1];
92 | break;
93 | case "R":
94 | random = args.opts[a][1];
95 | break;
96 | case "u":
97 | if(!phantom.injectJs(args.opts[a][1])){
98 | console.log("File not found: " + args.opts[a][1]);
99 | phantom.exit(0);
100 | }
101 | if(!window.US){
102 | phantom.exit(0);
103 | }
104 | break;
105 | case "v":
106 | phantom.exit(0);
107 | }
108 | }
109 |
110 |
111 | parseArgsToOptions(args);
112 | userInterface.id = options.id;
113 |
114 | site = args.args[1];
115 |
116 | if(!site){
117 | usage();
118 | phantom.exit(-1);
119 | }
120 |
121 | site = site.trim();
122 | if(site.length < 4 || site.substring(0,4).toLowerCase() != "http"){
123 | site = "http://" + site;
124 | }
125 |
126 | console.log("[");
127 |
128 | /* maximum execution time */
129 | setTimeout(execTimedOut,options.maxExecTime);
130 |
131 |
132 |
133 | phantom.onError = function(msg, trace) {
134 | var msgStack = ['PHANTOM ERROR: ' + msg];
135 | if (trace && trace.length) {
136 | msgStack.push('TRACE:');
137 | trace.forEach(function(t) {
138 | msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + (t.function ? ' (in function ' + t.function +')' : ''));
139 | });
140 | }
141 | console.error(msgStack.join('\n'));
142 | phantom.exit(1);
143 | };
144 |
145 |
146 |
147 | page.onConsoleMessage = function(msg, lineNum, sourceId) {
148 | if(options.verbose)
149 | console.log("console: " + msg);
150 | }
151 | page.onError = function(msg, lineNum, sourceId) {
152 | if(options.verbose)
153 | console.log("console error: on " + JSON.stringify(lineNum) + " " + msg);
154 | }
155 |
156 | page.onAlert = function(msg) {
157 | if(options.verbose)
158 | console.log('ALERT: ' + msg);
159 | };
160 |
161 | page.settings.userAgent = options.userAgent;
162 | page.settings.loadImages = options.loadImages;
163 |
164 |
165 |
166 | page.onResourceReceived = function(resource) {
167 | if(window.response == null){
168 | window.response = resource;
169 | // @TODO sanitize response.contentType
170 |
171 | }
172 | };
173 |
174 |
175 | page.onResourceRequested = function(requestData, networkRequest) {
176 | //console.log(JSON.stringify(requestData))
177 | };
178 |
179 | // to detect window.location= / document.location.href=
180 | page.onNavigationRequested = onNavigationRequested;
181 |
182 | page.onConfirm = function(msg) {return true;} // recently changed
183 |
184 | /* phantomjs issue #11684 workaround */
185 | var isPageInitialized = false;
186 | page.onInitialized = function(){
187 | if(isPageInitialized) return;
188 | isPageInitialized = true;
189 |
190 | // try to hide phantomjs
191 | page.evaluate(function(){
192 | window.__callPhantom = window.callPhantom;
193 | delete window.callPhantom;
194 | });
195 |
196 | startProbe(random/*, injectScript*/);
197 |
198 | };
199 |
200 |
201 | page.onCallback = function(data) {
202 | switch(data.cmd){
203 | case "triggerUserEvent":
204 | var ret = window.US[data.argument.name](window.userInterface)
205 | return ret;
206 | case "print":
207 | console.log(data.argument);
208 | break;
209 |
210 | case "end":
211 | if(options.returnHtml){
212 | page.evaluate(function(options){
213 | window.__PROBE__.printPageHTML();
214 | }, options);
215 | }
216 |
217 | page.evaluate(function(options){
218 | window.__PROBE__.triggerUserEvent("onEnd");
219 | });
220 |
221 | printStatus("ok", window.response.contentType);
222 | phantom.exit(0);
223 | break;
224 |
225 | }
226 |
227 | }
228 |
229 |
230 |
231 | if(options.httpAuth){
232 | headers['Authorization'] = 'Basic ' + btoa(options.httpAuth[0] + ":" + options.httpAuth[1]);
233 | }
234 |
235 | if(options.referer){
236 | headers['Referer'] = options.referer;
237 | }
238 |
239 | page.customHeaders = headers;
240 |
241 |
242 | for(var a = 0; a < options.setCookies.length; a++){
243 | // maybe this is wrong according to rfc .. but phantomjs cannot set a cookie without a domain...
244 | if(!options.setCookies[a].domain){
245 | var purl = document.createElement("a");
246 | purl.href=site;
247 | options.setCookies[a].domain = purl.hostname
248 | }
249 | if(options.setCookies[a].expires)
250 | options.setCookies[a].expires *= 1000;
251 |
252 | phantom.addCookie(options.setCookies[a]);
253 |
254 | }
255 |
256 | page.viewportSize = {
257 | width: 1920,
258 | height: 1080
259 | };
260 |
261 |
262 |
263 |
264 | page.open(site, page_settings, function(status) {
265 | var response = window.response; // just to be clear
266 | if (status !== 'success'){
267 | var mess = "";
268 | var out = {response: response};
269 | if(!response || response.headers.length == 0){
270 | printStatus("error", "load");
271 | phantom.exit(1);
272 | }
273 |
274 | // check for redirect first
275 | for(var a = 0; a < response.headers.length; a++){
276 | if(response.headers[a].name.toLowerCase() == 'location'){
277 |
278 | if(options.getCookies){
279 | printCookies(response.headers, site);
280 | }
281 | printStatus("ok", null, null, response.headers[a].value);
282 | phantom.exit(0);
283 | }
284 | }
285 |
286 | assertContentTypeHtml(response);
287 |
288 | phantom.exit(1);
289 | }
290 |
291 |
292 | if(options.getCookies){
293 | printCookies(response.headers, site);
294 | }
295 |
296 | assertContentTypeHtml(response);
297 |
298 | page.evaluate(function(){
299 |
300 | window.__PROBE__.waitAjax(function(ajaxTriggered){
301 | window.__PROBE__.triggerUserEvent("onStart");
302 | if(ajaxTriggered){
303 | window.__PROBE__.triggerUserEvent("onAllXhrsCompleted");
304 | }
305 | console.log("startAnalysis")
306 | window.__PROBE__.startAnalysis();
307 | });
308 | })
309 |
310 |
311 | });
312 |
313 |
314 |
315 |
--------------------------------------------------------------------------------
/core/crawl/probe/functions.js:
--------------------------------------------------------------------------------
1 | /*
2 | HTCAP - beta 1
3 | Author: filippo.cavallarin@wearesegment.com
4 |
5 | This program is free software; you can redistribute it and/or modify it under
6 | the terms of the GNU General Public License as published by the Free Software
7 | Foundation; either version 2 of the License, or (at your option) any later
8 | version.
9 | */
10 |
11 | // @todo report an error on unknown options
12 | function getopt(arguments, optstring){
13 | var args = arguments.slice();
14 | var ret = {
15 | opts: [],
16 | args: args
17 | };
18 |
19 | var m = optstring.match(/[a-zA-Z]\:*/g);
20 | for(var a = 0; a < m.length; a++){
21 | var ai = args.indexOf("-" + m[a][0]);
22 | if(ai > -1){
23 | if(m[a][1] == ":"){
24 | if(args[ai+1]){
25 | ret.opts.push([m[a][0], args[ai+1]]);
26 | args.splice(ai,2);
27 | } else {
28 | return "missing argumnet for option " + m[a][0];
29 | }
30 | } else {
31 | ret.opts.push([m[a][0]]);
32 | args.splice(ai,1);
33 | }
34 | }
35 | }
36 |
37 | return ret;
38 | }
39 |
40 |
41 | function removeHash(url){
42 | var anchor = document.createElement("a");
43 | anchor.href = url;
44 |
45 | return anchor.protocol + "//" + anchor.host + anchor.pathname + anchor.search;
46 | }
47 |
48 |
49 |
50 | function compareUrls(url1, url2, includeHash){
51 | var a1 = document.createElement("a");
52 | var a2 = document.createElement("a");
53 | a1.href = url1;
54 | a2.href = url2;
55 |
56 | var eq = (a1.protocol == a2.protocol && a1.host == a2.host && a1.pathname == a2.pathname && a1.search == a2.search);
57 |
58 | if(includeHash) eq = eq && a1.hash == a2.hash;
59 |
60 | return eq;
61 |
62 | }
63 |
64 |
65 | function printCookies(headers, site){
66 | var cookies = getCookies(headers, site);
67 | console.log('["cookies",' + JSON.stringify(cookies) + "],");
68 | }
69 |
70 |
71 | function printStatus(status, errcode, message, redirect){
72 | var o = {status:status};
73 | if(status == "error"){
74 | o.code = errcode;
75 | switch(errcode){
76 | case "load":
77 | break;
78 | case "contentType":
79 | o.message = message;
80 | break;
81 | case "requestTimeout":
82 | break;
83 | case "probe_timeout":
84 | break;
85 | }
86 | }
87 | if(redirect) o.redirect = redirect;
88 | o.time = Math.floor((Date.now() - window.startTime)/1000);
89 | console.log(JSON.stringify(o));
90 | console.log("]")
91 | }
92 |
93 |
94 |
95 | function execTimedOut(){
96 | if(!response || response.headers.length == 0){
97 | printStatus("error", "requestTimeout");
98 | phantom.exit(0);
99 | }
100 | printStatus("error", "probe_timeout");
101 | phantom.exit(0);
102 |
103 | }
104 |
105 |
106 |
107 | function usage(){
108 | var usage = "Usage: analyze.js [options] \n" +
109 | " -V verbose\n" +
110 | " -a don't check ajax\n" +
111 | " -f don't fill values\n" +
112 | " -t don't trigger events (onload only)\n" +
113 | " -s don't check websockets\n" +
114 | " -M dont' map events\n" +
115 | " -T don't trigger mapped events\n" +
116 | " -S don't check for \n"
216 | "\n"
217 | "%s\n"
218 | "