├── README.md
├── core
│   ├── __init__.py
│   ├── constants.py
│   ├── crawl
│   │   ├── __init__.py
│   │   ├── crawler.py
│   │   ├── crawler_thread.py
│   │   ├── lib
│   │   │   ├── __init__.py
│   │   │   ├── crawl_result.py
│   │   │   ├── probe.py
│   │   │   ├── shared.py
│   │   │   ├── urlfinder.py
│   │   │   └── utils.py
│   │   └── probe
│   │       ├── analyze.js
│   │       ├── functions.js
│   │       ├── options.js
│   │       └── probe.js
│   ├── lib
│   │   ├── DB_config.py
│   │   ├── __init__.py
│   │   ├── cookie.py
│   │   ├── database.py
│   │   ├── exception.py
│   │   ├── http_get.py
│   │   ├── request.py
│   │   ├── request_pattern.py
│   │   ├── shell.py
│   │   ├── thirdparty
│   │   │   ├── __init__.py
│   │   │   ├── pysocks
│   │   │   │   ├── __init__.py
│   │   │   │   ├── socks.py
│   │   │   │   └── sockshandler.py
│   │   │   └── simhash
│   │   │       └── __init__.py
│   │   └── utils.py
│   └── util
│       ├── __init__.py
│       ├── base_util.py
│       ├── util.py
│       └── utilities
│           ├── __init__.py
│           ├── htmlreport
│           │   ├── report.html
│           │   ├── report.js
│           │   └── style.css
│           ├── login.py
│           ├── login
│           │   └── login.js
│           ├── lsajax.py
│           ├── lsvuln.py
│           ├── report.py
│           ├── tocurl.py
│           ├── updcookie.py
│           └── usgen.py
├── htcap.py
├── new.sql
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
1 | ## HTCAP
2 |
3 | Htcap is a web application scanner able to crawl single page applications (SPAs) recursively by intercepting ajax calls and DOM changes.
4 | Htcap is not just another vulnerability scanner: it focuses mainly on the crawling process and relies on external tools to discover vulnerabilities. It's designed to be a tool for both manual and automated penetration testing of modern web applications.
5 |
6 | More info at [htcap.org](http://htcap.org).
7 |
8 | ## SETUP
9 |
10 | ### Requirements
11 |
12 | 1. Python 2.7
13 | 2. PhantomJS v2 (note: the PhantomJS author no longer maintains the project, so this dependency is unlikely to see further updates)
14 |
15 | ### Download and Run
16 |
17 | ```console
18 | $ git clone https://github.com/0xa-saline/htcap_mysql htcap
19 | $ cd htcap
20 | $ vi core/lib/DB_config.py
21 | # database connection settings
22 | 'host' : 'localhost',
23 | 'user' : 'root',
24 | 'port' : '3306',
25 | 'password' : 'mysqlroot',
26 | 'db' : 'w3a_scan',
27 | $ sudo pip install -r requirements.txt
28 | $ python htcap.py crawl http://0day5.com
29 |
30 | ```
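
For reference, here is a minimal sketch of what `core/lib/DB_config.py` is expected to hold, based only on the keys shown above; the variable name `db_config` and the exact layout are assumptions, not the file's actual contents:

```python
# Hypothetical sketch of core/lib/DB_config.py -- only the keys come from the
# README snippet above; the real file in the repository may be laid out differently.
db_config = {
    'host': 'localhost',      # MySQL host
    'user': 'root',           # MySQL user
    'port': '3306',           # MySQL port
    'password': 'mysqlroot',  # MySQL password
    'db': 'w3a_scan',         # database/schema used to store crawl results
}
```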
31 |
32 | Usage is the same as the original htcap:
33 |
34 | ```bash
35 | $ python htcap.py crawl http://testphp.vulnweb.com
36 | *****************************************************
37 | * / _ \| _ \ / \ \ / / ___/ ___| ___ __ _| \ | | *
38 | *| | | | | | |/ _ \ V /|___ \___ \ / __/ _` | \| | *
39 | *| |_| | |_| / ___ \| | ___) |__) | (_| (_| | |\ |*
40 | * \___/|____/_/ \_\_| |____/____/ \___\__,_|_| \_|*
41 | *****************************************************
42 | . No handlers could be found for logger "tldextract"
43 | [*][debug] http://testphp.vulnweb.com/pictures/
44 | [*][debug] http://testphp.vulnweb.com/images/
45 | [*][debug] http://testphp.vulnweb.com/bxss/
46 | [*][debug] http://testphp.vulnweb.com/Connections/
47 | [*][debug] http://testphp.vulnweb.com/admin/
48 | [*][debug] http://testphp.vulnweb.com/CVS/
49 | [*][debug] http://testphp.vulnweb.com/secured/
50 | [*][debug] http://testphp.vulnweb.com/userinfo.php
51 | [*][debug] http://testphp.vulnweb.com/cart.php
52 | [*][debug] http://testphp.vulnweb.com/logout.php
53 | [*][debug] http://testphp.vulnweb.com/search.php
54 | [*][debug] http://testphp.vulnweb.com/comment.php
55 | [*][debug] http://testphp.vulnweb.com/login.php
56 | [*][debug] http://testphp.vulnweb.com/index.php
57 | [*][debug] http://testphp.vulnweb.com/product.php
58 | [*][debug] http://testphp.vulnweb.com/guestbook.php
59 | . initialized, crawl started with 10 threads
60 | [=================================] 108 of 108 pages processed in 43 minutes
61 | Crawl finished, 108 pages analyzed in 43 minutes
62 | ```
63 |
64 | PhantomJS can be downloaded [here](http://phantomjs.org//download.html). It comes as a self-contained executable with all libraries statically linked, so there is no need to install or compile anything else.
65 |
66 |
67 | ## DOCUMENTATION
68 |
69 | Documentation, examples and demos can be found at the official website [http://htcap.org](http://htcap.org).
70 |
71 |
72 | ## TO DO
73 |
74 | 0. Disable DNS cache refreshing (done)
75 |
76 |
77 | 1. Switch the htcap database backend to MySQL (done)
78 |
79 |
80 | 2. Filter out common analytics and social-sharing URLs (see the filter sketch after this list) (done)
81 |
82 |
83 | 3. Recognize common static file extensions (see the filter sketch after this list) (done)
84 |
85 |
86 | 4. Besides robots.txt, also collect URLs via directory brute forcing and search engines, and recognize directories that cannot be accessed (done)
87 |
88 |
89 | 5. Remove the sqlmap and Arachni scanning features (done)
90 |
91 |
92 | 6. Add page information fingerprinting
93 |
94 |
95 | 7. Add exact-duplicate and similarity-based deduplication (see the simhash sketch after this list)
96 |
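Items 2 and 3 above correspond to the URL filtering step that `crawler.py` performs through `HostFilter(...).urlfilter()` before saving a request. A rough sketch of that kind of filter is shown below; the blacklists and the helper name `url_is_worth_saving` are illustrative assumptions, not the fork's actual implementation:

```python
# Hypothetical URL filter in the spirit of the HostFilter.urlfilter() call used by
# crawler.py; the real class and its blacklists live in the fork and may differ.
from urlparse import urlsplit  # Python 2.7, matching the rest of the codebase

STATIC_EXTENSIONS = (".css", ".png", ".jpg", ".jpeg", ".gif", ".ico",
                     ".svg", ".woff", ".ttf", ".pdf", ".zip")
ANALYTICS_SHARE_HOSTS = ("google-analytics.com", "googletagmanager.com",
                         "hm.baidu.com", "cnzz.com", "addthis.com", "sharethis.com")

def url_is_worth_saving(url):
    """Return False for static resources and common analytics/share endpoints."""
    purl = urlsplit(url)
    # skip obvious static resources by extension
    if purl.path.lower().endswith(STATIC_EXTENSIONS):
        return False
    # skip well-known analytics / social-sharing hosts
    host = (purl.hostname or "").lower()
    if any(host == h or host.endswith("." + h) for h in ANALYTICS_SHARE_HOSTS):
        return False
    return True
```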
97 |
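For item 7, the fork bundles a simhash package under `core/lib/thirdparty/simhash`. The following self-contained sketch shows the general idea of similarity-based deduplication with a 64-bit simhash and a Hamming-distance threshold; it is an illustration under those assumptions, not the code that ships with the fork:

```python
# Self-contained simhash sketch for similarity-based page deduplication.
# Illustration only; the bundled core/lib/thirdparty/simhash package may
# compute fingerprints differently.
import hashlib
import re

def simhash64(text):
    """64-bit simhash over word tokens."""
    weights = [0] * 64
    for token in re.findall(r"\w+", text.lower()):
        h = int(hashlib.md5(token.encode("utf-8")).hexdigest()[:16], 16)
        for bit in range(64):
            weights[bit] += 1 if (h >> bit) & 1 else -1
    fingerprint = 0
    for bit in range(64):
        if weights[bit] > 0:
            fingerprint |= 1 << bit
    return fingerprint

def hamming(a, b):
    """Number of differing bits between two fingerprints."""
    return bin(a ^ b).count("1")

def is_near_duplicate(html_a, html_b, threshold=3):
    """Pages whose fingerprints differ in <= threshold bits count as duplicates."""
    return hamming(simhash64(html_a), simhash64(html_b)) <= threshold
```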
98 | ## demo
99 |
100 | http://htcap.org/scanme/
101 |
102 |
103 |
104 | ## LICENSE
105 |
106 | This program is free software; you can redistribute it and/or modify it under the terms of the [GNU General Public License](https://www.gnu.org/licenses/gpl-2.0.html) as published by the Free Software Foundation; either version 2 of the License, or(at your option) any later version.
107 |
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/__init__.py
--------------------------------------------------------------------------------
/core/constants.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | THSTAT_WAITING = 0
15 | THSTAT_RUNNING = 1
16 |
17 | CRAWLSCOPE_DOMAIN = "domain"
18 | CRAWLSCOPE_DIRECTORY = "directory"
19 | CRAWLSCOPE_URL = "url"
20 |
21 |
22 | CRAWLMODE_PASSIVE = "passive"
23 | CRAWLMODE_ACTIVE = "active"
24 | CRAWLMODE_AGGRESSIVE = "aggressive"
25 |
26 | REQTYPE_LINK = "link"
27 | REQTYPE_XHR = "xhr"
28 | REQTYPE_WS = "websocket"
29 | REQTYPE_JSONP = "jsonp"
30 | REQTYPE_FORM = "form"
31 | REQTYPE_REDIRECT = "redirect"
32 | REQTYPE_UNKNOWN = "unknown"
33 |
34 |
35 | ERROR_CONTENTTYPE = "contentType"
36 | ERROR_TIMEOUT = "timeout"
37 | ERROR_PROBE_TO = "probe_timeout"
38 | ERROR_LOAD = "loaderror"
39 | ERROR_PROBEKILLED = "probe_killed"
40 | ERROR_PROBEFAILURE = "probe_failure"
41 | ERROR_MAXREDIRECTS = "too_many_redirects"
42 | ERROR_CRAWLDEPTH = "crawler_depth_limit_reached"
43 | VULNTYPE_SQLI = "sqli"
44 | VULNTYPE_XSS = "xss"
45 |
--------------------------------------------------------------------------------
/core/crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/__init__.py
--------------------------------------------------------------------------------
/core/crawl/crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from __future__ import unicode_literals
14 | import sys
15 | import os
16 | import datetime
17 | import time
18 | import getopt
19 | import json
20 | import re
21 | from urlparse import urlsplit, urljoin
22 | from urllib import unquote
23 | import urllib2
24 | import threading
25 | import subprocess
26 | from random import choice
27 | import string
28 | import ssl
29 |
30 |
31 | from core.lib.exception import *
32 | from core.lib.cookie import Cookie
33 | from core.lib.database import Database
34 |
35 |
36 | from lib.shared import *
37 | from lib.crawl_result import *
38 | from core.lib.request import Request
39 | from core.lib.http_get import HttpGet
40 | from core.lib.shell import CommandExecutor
41 | #from core.dirburp.dirscan import Dirbuster
42 | from crawler_thread import CrawlerThread
43 |
44 | from core.lib.utils import *
45 | from core.constants import *
46 | from tld import get_tld
47 | from lib.utils import *
48 |
49 | class Crawler:
50 |
51 | def __init__(self, argv):
52 |
53 | self.base_dir = getrealdir(__file__) + os.sep
54 |
55 | self.crawl_start_time = int(time.time())
56 | self.crawl_end_time = None
57 | self.taskid = ''
58 |
59 | self.defaults = {
60 | "useragent": random_useragent(),
61 | "num_threads": 10,
62 | "max_redirects": 10*10,
63 | "out_file_overwrite": False,
64 | "proxy": None,
65 | "http_auth": None,
66 | "use_urllib_onerror": True,
67 | "group_qs": False,
68 | "process_timeout": 450, # when lots of element(~25000) are added dynamically it can take some time..
69 | "set_referer": True,
70 | "scope": CRAWLSCOPE_DOMAIN,
71 | "mode": CRAWLMODE_AGGRESSIVE,
72 | "max_depth": 10000,
73 | "max_post_depth": 1000,
74 | "override_timeout_functions": True,
75 | 'crawl_forms': True# only if mode == CRAWLMODE_AGGRESSIVE
76 | }
77 |
78 |
79 | self.main(argv)
80 |
81 |
82 |
83 | def usage(self):
84 | print (
85 | "usage: htcap [options] url outfile\n"
86 | "Options: \n"
87 | " -h this help\n"
88 | " -q do not display progress informations\n"
89 | " -m MODE set crawl mode:\n"
90 | " - "+CRAWLMODE_PASSIVE+": do not intract with the page\n"
91 | " - "+CRAWLMODE_ACTIVE+": trigger events\n"
92 | " - "+CRAWLMODE_AGGRESSIVE+": also fill input values and crawl forms (default)\n"
93 | " -s SCOPE set crawl scope\n"
94 | " - "+CRAWLSCOPE_DOMAIN+": limit crawling to current domain (default)\n"
95 | " - "+CRAWLSCOPE_DIRECTORY+": limit crawling to current directory (and subdirecotries) \n"
96 | " - "+CRAWLSCOPE_URL+": do not crawl, just analyze a single page\n"
97 | " -D maximum crawl depth (default: " + str(Shared.options['max_depth']) + ")\n"
98 | " -P maximum crawl depth for consecutive forms (default: " + str(Shared.options['max_post_depth']) + ")\n"
99 | " -F even if in aggressive mode, do not crawl forms\n"
100 | " -H save HTML generated by the page\n"
101 | " -d DOMAINS comma separated list of allowed domains (ex *.target.com)\n"
102 | " -c COOKIES cookies as json or name=value pairs separaded by semicolon\n"
103 | " -C COOKIE_FILE path to file containing COOKIES \n"
104 | " -r REFERER set initial referer\n"
105 | " -x EXCLUDED comma separated list of urls to exclude (regex) - ie logout urls\n"
106 | " -p PROXY proxy string protocol:host:port - protocol can be 'http' or 'socks5'\n"
107 | " -n THREADS number of parallel threads (default: " + str(self.defaults['num_threads']) + ")\n"
108 | " -A CREDENTIALS username and password used for HTTP authentication separated by a colon\n"
109 | " -U USERAGENT set user agent\n"
110 | " -t TIMEOUT maximum seconds spent to analyze a page (default " + str(self.defaults['process_timeout']) + ")\n"
111 | " -u USER_SCRIPT inject USER_SCRIPT into any loaded page\n"
112 | " -S skip initial checks\n"
113 | " -G group query_string parameters with the same name ('[]' ending excluded)\n"
114 | " -N don't normalize URL path (keep ../../)\n"
115 | " -R maximum number of redirects to follow (default " + str(self.defaults['max_redirects']) + ")\n"
116 | " -I ignore robots.txt\n"
117 | " -O dont't override timeout functions (setTimeout, setInterval)\n"
118 | " -K keep elements in the DOM (prevent removal)\n"
119 | )
120 |
121 |
122 | def generate_filename(self, name, out_file_overwrite):
123 | fname = generate_filename(name, None, out_file_overwrite)
124 | if out_file_overwrite:
125 | if os.path.exists(fname):
126 | os.remove(fname)
127 |
128 | return fname
129 |
130 |
131 |
132 | def kill_threads(self, threads):
133 | for th in threads:
134 | if th.isAlive(): th.exit = True
135 | # start notify() chain
136 | Shared.th_condition.acquire()
137 | Shared.th_condition.notifyAll()
138 | Shared.th_condition.release()
139 |
140 |
141 |
142 | def parse_cookie_string(self, string):
143 |
144 | cookies = []
145 | try:
146 | cookies = json.loads(string)
147 | except ValueError:
148 | tok = re.split("; *", string)
149 | for t in tok:
150 | k, v = t.split("=", 1)
151 | cookies.append({"name":k.strip(), "value":unquote(v.strip())})
152 | except Exception as e:
153 | raise
154 |
155 | return cookies
156 |
157 |
158 |
159 | def init_db(self, dbname, report_name):
160 | infos = {
161 | "target": Shared.starturl,
162 | "scan_date": -1,
163 | "urls_scanned": -1,
164 | "scan_time": -1,
165 | 'command_line': " ".join(sys.argv)
166 | }
167 |
168 | database = Database(dbname, report_name, infos)
169 | return database
170 |
171 | def check_startrequest(self, request):
172 |
173 | h = HttpGet(request, Shared.options['process_timeout'], 2, Shared.options['useragent'], Shared.options['proxy'])
174 | try:
175 | h.get_requests()
176 | except NotHtmlException:
177 | print "\nError: Document is not html"
178 | sys.exit(1)
179 | except Exception as e:
180 | print "\nError: unable to open url: %s" % e
181 | sys.exit(1)
182 |
183 | def get_requests_from_robots(self, request):
184 | purl = urlsplit(request.url)
185 | url = "%s://%s/robots.txt" % (purl.scheme, purl.netloc)
186 |
187 | getreq = Request(REQTYPE_LINK, "GET", url)
188 | try:
189 | # request, timeout, retries=None, useragent=None, proxy=None):
190 | httpget = HttpGet(getreq, 10, 1, "Googlebot", Shared.options['proxy'])
191 | lines = httpget.get_file().split("\n")
192 | except urllib2.HTTPError:
193 | return []
194 | except:
195 | raise
196 |
197 | requests = []
198 | for line in lines:
199 | directive = ""
200 | url = None
201 | try:
202 | directive, url = re.sub("\#.*","",line).split(":",1)
203 | except:
204 | continue # ignore errors
205 |
206 | if re.match("(dis)?allow", directive.strip(), re.I):
207 | req = Request(REQTYPE_LINK, "GET", url.strip(), parent=request)
208 | requests.append(req)
209 |
210 |
211 | return adjust_requests(requests) if requests else []
212 |
213 |
214 | def randstr(self, length):
215 | all_chars = string.digits + string.letters + string.punctuation
216 | random_string = ''.join(choice(all_chars) for _ in range(length))
217 | return random_string
218 |
219 |
220 |
221 | def main_loop(self, threads, start_requests, database, display_progress = True, verbose = False,taskid=''):
222 | pending = len(start_requests)
223 | crawled = 0
224 |
225 | req_to_crawl = start_requests
226 | try:
227 | while True:
228 |
229 | if display_progress and not verbose:
230 | tot = (crawled + pending)
231 | print_progressbar(tot, crawled, self.crawl_start_time, "pages processed")
232 |
233 | if pending == 0:
234 | # is the check of running threads really needed?
235 | running_threads = [t for t in threads if t.status == THSTAT_RUNNING]
236 | if len(running_threads) == 0:
237 | if display_progress or verbose:
238 | print ""
239 | break
240 |
241 | if len(req_to_crawl) > 0:
242 | Shared.th_condition.acquire()
243 | Shared.requests.extend(req_to_crawl)
244 | Shared.th_condition.notifyAll()
245 | Shared.th_condition.release()
246 |
247 | req_to_crawl = []
248 | Shared.main_condition.acquire()
249 | Shared.main_condition.wait(1)
250 | if len(Shared.crawl_results) > 0:
251 | #database.connect()
252 | #database.begin()
253 | for result in Shared.crawl_results:
254 | crawled += 1
255 | pending -= 1
256 | if verbose:
257 | print "crawl result for: %s " % result.request
258 | if len(result.request.user_output) > 0:
259 | print " user: %s" % json.dumps(result.request.user_output)
260 | if result.errors:
261 | print "* crawler errors: %s" % ", ".join(result.errors)
262 |
263 | #database.save_crawl_result(result, True)
264 | for req in result.found_requests:
265 | ######tips
266 | #print req.url,req.data,req.method,Shared.allowed_domains
267 |
268 | if verbose:
269 | print " new request found %s" % req
270 |
271 | urlfilt = HostFilter(req.url)
272 | if urlfilt.urlfilter():
273 | database.save_request(req,taskid)
274 |
275 | if request_is_crawlable(req) and req not in Shared.requests and req not in req_to_crawl:
276 | if request_depth(req) > Shared.options['max_depth'] or request_post_depth(req) > Shared.options['max_post_depth']:
277 | if verbose:
278 | print " * cannot crawl: %s : crawl depth limit reached" % req
279 | result = CrawlResult(req, errors=[ERROR_CRAWLDEPTH])
280 | #database.save_crawl_result(result, False)
281 | continue
282 |
283 | if req.redirects > Shared.options['max_redirects']:
284 | if verbose:
285 | print " * cannot crawl: %s : too many redirects" % req
286 | result = CrawlResult(req, errors=[ERROR_MAXREDIRECTS])
287 | #database.save_crawl_result(result, False)
288 | continue
289 |
290 | pending += 1
291 | req_to_crawl.append(req)
292 |
293 | Shared.crawl_results = []
294 | Shared.main_condition.release()
295 |
296 | except KeyboardInterrupt:
297 | print "\nTerminated by user"
298 | try:
299 | Shared.main_condition.release()
300 | Shared.th_condition.release()
301 | except:
302 | pass
303 |
304 |
305 | def check_user_script_syntax(self, probe_cmd, user_script):
306 | try:
307 | exe = CommandExecutor(probe_cmd + ["-u", user_script, "-v"] , False)
308 | out = exe.execute(5)
309 | if out:
310 | print "\n* USER_SCRIPT error: %s" % out
311 | sys.exit(1)
312 | stdoutw(". ")
313 | except KeyboardInterrupt:
314 | print "\nAborted"
315 | sys.exit(0)
316 |
317 |
318 | def init_crawl(self, start_req, check_starturl, get_robots_txt):
319 | start_requests = [start_req]
320 | try:
321 | if check_starturl:
322 | self.check_startrequest(start_req)
323 | stdoutw(". ")
324 |
325 | if get_robots_txt:
326 | rrequests = self.get_requests_from_robots(start_req)
327 | stdoutw(". ")
328 | for req in rrequests:
329 | if request_is_crawlable(req) and not req in start_requests:
330 | start_requests.append(req)
331 | except KeyboardInterrupt:
332 | print "\nAborted"
333 | sys.exit(0)
334 |
335 | return start_requests
336 |
337 |
338 | def main(self, argv):
339 | Shared.options = self.defaults
340 | Shared.th_condition = threading.Condition()
341 | Shared.main_condition = threading.Condition()
342 |
343 |
344 | probe_cmd = get_phantomjs_cmd()
345 | if not probe_cmd:
346 | print "Error: unable to find phantomjs executable"
347 | sys.exit(1)
348 |
349 | start_cookies = []
350 | start_referer = None
351 |
352 | probe_options = ["-R", self.randstr(20)]
353 | threads = []
354 | num_threads = self.defaults['num_threads']
355 |
356 | out_file = ""
357 | out_file_overwrite = self.defaults['out_file_overwrite']
358 | cookie_string = None
359 | display_progress = True
360 | verbose = False
361 | initial_checks = True
362 | http_auth = None
363 | get_robots_txt = True
364 | save_html = False
365 | user_script = None
366 |
367 | try:
368 | opts, args = getopt.getopt(argv, 'hc:t:jn:x:A:p:d:BGR:U:wD:s:m:C:qr:SIHFP:Ovu:')
369 | except getopt.GetoptError as err:
370 | print str(err)
371 | sys.exit(1)
372 |
373 |
374 | if len(args) < 1:
375 | self.usage()
376 | sys.exit(1)
377 |
378 |
379 |
380 | for o, v in opts:
381 | if o == '-h':
382 | self.usage()
383 | sys.exit(0)
384 | elif o == '-c':
385 | cookie_string = v
386 | elif o == '-C':
387 | try:
388 | with open(v) as cf:
389 | cookie_string = cf.read()
390 | except Exception as e:
391 | print "error reading cookie file"
392 | sys.exit(1)
393 | elif o == '-r':
394 | start_referer = v
395 | elif o == '-n':
396 | num_threads = int(v)
397 | elif o == '-t':
398 | Shared.options['process_timeout'] = int(v)
399 | elif o == '-q':
400 | display_progress = False
401 | elif o == '-A':
402 | http_auth = v
403 | elif o == '-p':
404 | if v == "tor": v = "socks5:127.0.0.1:9150"
405 | proxy = v.split(":")
406 | if proxy[0] not in ("http", "socks5"):
407 | print "only http and socks5 proxies are supported"
408 | sys.exit(1)
409 | Shared.options['proxy'] = {"proto":proxy[0], "host":proxy[1], "port":proxy[2]}
410 | elif o == '-d':
411 | for ad in v.split(","):
412 | # convert *.domain.com to *.\.domain\.com
413 | pattern = re.escape(ad).replace("\\*\\.","((.*\\.)|)")
414 | Shared.allowed_domains.add(pattern)
415 | elif o == '-x':
416 | for eu in v.split(","):
417 | Shared.excluded_urls.add(eu)
418 | elif o == "-G":
419 | Shared.options['group_qs'] = True
420 | #elif o == "-w":
421 | # out_file_overwrite = True
422 | elif o == "-R":
423 | Shared.options['max_redirects'] = int(v)
424 | elif o == "-U":
425 | Shared.options['useragent'] = v
426 | elif o == "-s":
427 | if not v in (CRAWLSCOPE_DOMAIN, CRAWLSCOPE_DIRECTORY, CRAWLSCOPE_URL):
428 | self.usage()
429 | print "* ERROR: wrong scope set '%s'" % v
430 | sys.exit(1)
431 | Shared.options['scope'] = v
432 | elif o == "-m":
433 | if not v in (CRAWLMODE_PASSIVE, CRAWLMODE_ACTIVE, CRAWLMODE_AGGRESSIVE):
434 | self.usage()
435 | print "* ERROR: wrong mode set '%s'" % v
436 | sys.exit(1)
437 | Shared.options['mode'] = v
438 | elif o == "-S":
439 | initial_checks = False
440 | elif o == "-I":
441 | get_robots_txt = False
442 | elif o == "-H":
443 | save_html = True
444 | elif o == "-D":
445 | Shared.options['max_depth'] = int(v)
446 | elif o == "-P":
447 | Shared.options['max_post_depth'] = int(v)
448 | elif o == "-O":
449 | Shared.options['override_timeout_functions'] = False
450 | elif o == "-F":
451 | Shared.options['crawl_forms'] = False
452 | elif o == "-u":
453 | if os.path.isfile(v):
454 | user_script = os.path.abspath(v)
455 | else:
456 | print "error: unable to open USER_SCRIPT"
457 | sys.exit(1)
458 |
459 |
460 | if Shared.options['scope'] != CRAWLSCOPE_DOMAIN and len(Shared.allowed_domains) > 0:
461 | print "* Warinig: option -d is valid only if scope is %s" % CRAWLSCOPE_DOMAIN
462 |
463 | if cookie_string:
464 | try:
465 | start_cookies = self.parse_cookie_string(cookie_string)
466 | except Exception as e:
467 | print "error decoding cookie string"
468 | sys.exit(1)
469 |
470 | if Shared.options['mode'] != CRAWLMODE_AGGRESSIVE:
471 | probe_options.append("-f") # dont fill values
472 | if Shared.options['mode'] == CRAWLMODE_PASSIVE:
473 | probe_options.append("-t") # dont trigger events
474 |
475 | if Shared.options['proxy']:
476 | probe_cmd.append("--proxy-type=%s" % Shared.options['proxy']['proto'])
477 | probe_cmd.append("--proxy=%s:%s" % (Shared.options['proxy']['host'], Shared.options['proxy']['port']))
478 |
479 | probe_cmd.append(self.base_dir + 'probe/analyze.js')
480 |
481 |
482 | if len(Shared.excluded_urls) > 0:
483 | probe_options.extend(("-X", ",".join(Shared.excluded_urls)))
484 |
485 | if save_html:
486 | probe_options.append("-H")
487 |
488 | if user_script:
489 | probe_options.extend(("-u", user_script))
490 |
491 | probe_options.extend(("-x", str(Shared.options['process_timeout'])))
492 | probe_options.extend(("-A", Shared.options['useragent']))
493 |
494 | if not Shared.options['override_timeout_functions']:
495 | probe_options.append("-O")
496 |
497 | Shared.probe_cmd = probe_cmd + probe_options
498 |
499 |
500 | Shared.starturl = normalize_url(args[0])
501 | #out_file = args[1]
502 |
503 | purl = urlsplit(Shared.starturl)
504 | try:
505 | pdomain = get_tld(Shared.starturl)
506 | except:
507 | pdomain = purl.hostname
508 | if purl.hostname == pdomain:
509 | Shared.allowed_domains.add(purl.hostname)
510 | else:
511 | Shared.allowed_domains.add(pdomain)
512 | Shared.allowed_domains.add(purl.hostname)
513 |
514 |
515 | for sc in start_cookies:
516 | Shared.start_cookies.append(Cookie(sc, Shared.starturl))
517 |
518 |
519 | start_req = Request(REQTYPE_LINK, "GET", Shared.starturl, set_cookie=Shared.start_cookies, http_auth=http_auth, referer=start_referer)
520 |
521 | if not hasattr(ssl, "SSLContext"):
522 | print "* WARNING: SSLContext is not supported with this version of python, consider to upgrade to >= 2.7.9 in case of SSL errors"
523 |
524 | if user_script and initial_checks:
525 | self.check_user_script_syntax(probe_cmd, user_script)
526 |
527 | start_requests = self.init_crawl(start_req, initial_checks, get_robots_txt)
528 |
529 | database = None
530 | fname = None
531 | try:
532 | database = self.init_db(fname, out_file)
533 | except Exception as e:
534 | print str(e)
535 |
536 | taskid = database.save_crawl_info(
537 | target = Shared.starturl,
538 | start_date = self.crawl_start_time,
539 | commandline = cmd_to_str(argv),
540 | user_agent = Shared.options['useragent']
541 | )
542 | self.taskid = taskid
543 |
544 | for req in start_requests:
545 | urlfilt = HostFilter(req.url)
546 | if urlfilt.urlfilter():
547 | database.save_request(req,self.taskid)
548 |
549 | print "initialized, crawl started with %d threads" % (num_threads)
550 |
551 | for n in range(0, num_threads):
552 | thread = CrawlerThread()
553 | threads.append(thread)
554 | thread.start()
555 |
556 |
557 | self.main_loop(threads, start_requests, database, display_progress, verbose,self.taskid)
558 |
559 | self.kill_threads(threads)
560 |
561 | self.crawl_end_time = int(time.time())
562 |
563 | print "Crawl finished, %d pages analyzed in %d minutes" % (Shared.requests_index, (self.crawl_end_time - self.crawl_start_time) / 60)
564 | database.update_crawl_info(self.taskid,self.crawl_end_time)
565 |
--------------------------------------------------------------------------------
/core/crawl/crawler_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from __future__ import unicode_literals
14 | import time
15 | import re
16 | import json
17 | import urllib
18 | import cookielib
19 | import threading
20 | import base64
21 |
22 | import tempfile
23 | import os
24 | import uuid
25 |
26 | from urlparse import urlparse, urlsplit, urljoin, parse_qsl
27 |
28 | from core.lib.exception import *
29 | from core.crawl.lib.shared import *
30 |
31 |
32 | from core.crawl.lib.probe import Probe
33 |
34 | from core.lib.http_get import HttpGet
35 | from core.lib.cookie import Cookie
36 | from core.lib.shell import CommandExecutor
37 | from core.lib.request import Request
38 |
39 | from core.lib.utils import *
40 | from core.constants import *
41 |
42 | from lib.utils import *
43 | from lib.crawl_result import *
44 |
45 |
46 | class CrawlerThread(threading.Thread):
47 |
48 | def __init__(self):
49 | threading.Thread.__init__(self)
50 | self.thread_uuid = uuid.uuid4()
51 | self.process_retries = 2
52 | self.process_retries_interval = 0.5
53 |
54 | self.status = THSTAT_RUNNING
55 | self.exit = False
56 |
57 | self.cookie_file = "%s%shtcap_cookiefile-%s.json" % (tempfile.gettempdir(), os.sep, self.thread_uuid)
58 |
59 |
60 | def run(self):
61 | self.crawl()
62 |
63 |
64 |
65 | def wait_request(self):
66 | request = None
67 | Shared.th_condition.acquire()
68 | while True:
69 | if self.exit == True:
70 | Shared.th_condition.notifyAll()
71 | Shared.th_condition.release()
72 | raise ThreadExitRequestException("exit request received")
73 |
74 | if Shared.requests_index >= len(Shared.requests):
75 | self.status = THSTAT_WAITING
76 | Shared.th_condition.wait() # The wait method releases the lock, blocks the current thread until another thread calls notify
77 | continue
78 |
79 | request = Shared.requests[Shared.requests_index]
80 | Shared.requests_index += 1
81 |
82 | break
83 |
84 | Shared.th_condition.release()
85 |
86 | self.status = THSTAT_RUNNING
87 |
88 | return request
89 |
90 |
91 |
92 | def load_probe_json(self, jsn):
93 | jsn = jsn.strip()
94 | if not jsn: jsn = "["
95 | if jsn[-1] != "]":
96 | jsn += '{"status":"ok", "partialcontent":true}]'
97 | try:
98 | return json.loads(jsn)
99 | except Exception:
100 | #print "-- JSON DECODE ERROR %s" % jsn
101 | raise
102 |
103 |
104 | def send_probe(self, request, errors):
105 |
106 | url = request.url
107 | jsn = None
108 | probe = None
109 | retries = self.process_retries
110 | params = []
111 | cookies = []
112 |
113 |
114 | if request.method == "POST":
115 | params.append("-P")
116 | if request.data:
117 | params.extend(("-D", request.data))
118 |
119 |
120 | if len(request.cookies) > 0:
121 | for cookie in request.cookies:
122 | cookies.append(cookie.get_dict())
123 |
124 | with open(self.cookie_file,'w') as fil:
125 | fil.write(json.dumps(cookies))
126 |
127 | params.extend(("-c", self.cookie_file))
128 |
129 |
130 |
131 | if request.http_auth:
132 | params.extend(("-p" ,request.http_auth))
133 |
134 | if Shared.options['set_referer'] and request.referer:
135 | params.extend(("-r", request.referer))
136 |
137 |
138 | params.extend(("-i", str(request.db_id)))
139 |
140 | params.append(url)
141 |
142 |
143 | while retries:
144 | #while False:
145 |
146 | # print cmd_to_str(Shared.probe_cmd + params)
147 | # print ""
148 |
149 | cmd = CommandExecutor(Shared.probe_cmd + params)
150 | jsn = cmd.execute(Shared.options['process_timeout'] + 2)
151 |
152 | if jsn == None:
153 | errors.append(ERROR_PROBEKILLED)
154 | time.sleep(self.process_retries_interval) # ... ???
155 | retries -= 1
156 | continue
157 |
158 |
159 | # try to decode json also after an exception .. sometimes phantom crashes BUT returns a valid json ..
160 | try:
161 | if jsn and type(jsn) is not str:
162 | jsn = jsn[0]
163 | probeArray = self.load_probe_json(jsn)
164 | except Exception as e:
165 | raise
166 |
167 |
168 | if probeArray:
169 | probe = Probe(probeArray, request)
170 |
171 | if probe.status == "ok":
172 | break
173 |
174 | errors.append(probe.errcode)
175 |
176 | if probe.errcode in (ERROR_CONTENTTYPE, ERROR_PROBE_TO):
177 | break
178 |
179 | time.sleep(self.process_retries_interval)
180 | retries -= 1
181 |
182 | return probe
183 |
184 |
185 |
186 | def crawl(self):
187 |
188 | while True:
189 | url = None
190 | cookies = []
191 | requests = []
192 |
193 | requests_to_crawl = []
194 | redirects = 0
195 | errors = []
196 |
197 | try:
198 | request = self.wait_request()
199 | except ThreadExitRequestException:
200 | if os.path.exists(self.cookie_file):
201 | os.remove(self.cookie_file)
202 | return
203 | except Exception as e:
204 | print "-->"+str(e)
205 | continue
206 |
207 | url = request.url
208 |
209 | purl = urlsplit(url)
210 |
211 |
212 | probe = None
213 |
214 | probe = self.send_probe(request, errors)
215 |
216 | if probe:
217 | if probe.status == "ok" or probe.errcode == ERROR_PROBE_TO:
218 |
219 | requests = probe.requests
220 |
221 | if probe.html:
222 | request.html = probe.html
223 |
224 | if len(probe.user_output) > 0:
225 | request.user_output = probe.user_output
226 |
227 | else :
228 | errors.append(ERROR_PROBEFAILURE)
229 | # get urls with python to continue crawling
230 | if Shared.options['use_urllib_onerror'] == False:
231 | continue
232 | try:
233 | hr = HttpGet(request, Shared.options['process_timeout'], self.process_retries, Shared.options['useragent'], Shared.options['proxy'])
234 | requests = hr.get_requests()
235 | except Exception as e:
236 | errors.append(str(e))
237 |
238 |
239 | # set out_of_scope, apply user-supplied filters to urls (ie group_qs)
240 | adjust_requests(requests)
241 |
242 | Shared.main_condition.acquire()
243 | res = CrawlResult(request, requests, errors)
244 | Shared.crawl_results.append(res)
245 | Shared.main_condition.notify()
246 | Shared.main_condition.release()
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/core/crawl/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/0xa-saline/htcap_mysql/8648d19346d03842bf95c9f4d3020540b9848e08/core/crawl/lib/__init__.py
--------------------------------------------------------------------------------
/core/crawl/lib/crawl_result.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 |
15 | class CrawlResult:
16 | def __init__(self, request, found_requests = None, errors = None):
17 | self.request = request
18 | self.found_requests = found_requests if found_requests else []
19 | self.errors = errors if errors else []
20 |
21 |
--------------------------------------------------------------------------------
/core/crawl/lib/probe.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | from core.lib.request import Request
15 | from core.lib.cookie import Cookie
16 | from core.constants import *
17 |
18 | class Probe:
19 |
20 | def __init__(self, data, parent):
21 | self.status = "ok"
22 | self.requests = []
23 | self.cookies = []
24 | self.redirect = None;
25 | # if True the probe returned no error BUT the json is not closed properly
26 | self.partialcontent = False
27 | self.html = None
28 | self.user_output = []
29 |
30 | status = data.pop()
31 |
32 | if status['status'] == "error":
33 | self.status = "error"
34 | self.errcode = status['code']
35 |
36 |
37 | if "partialcontent" in status:
38 | self.partialcontent = status['partialcontent']
39 |
40 | # grab cookies before creating requests
41 | for key,val in data:
42 | if key == "cookies":
43 | for cookie in val:
44 | self.cookies.append(Cookie(cookie, parent.url))
45 |
46 | if "redirect" in status:
47 | self.redirect = status['redirect']
48 | r = Request(REQTYPE_REDIRECT, "GET", self.redirect, parent=parent, set_cookie=self.cookies, parent_db_id=parent.db_id)
49 | self.requests.append(r)
50 |
51 | for key,val in data:
52 | if key == "request":
53 | trigger = val['trigger'] if 'trigger' in val else None
54 | r = Request(val['type'], val['method'], val['url'], parent=parent, set_cookie=self.cookies, data=val['data'], trigger=trigger, parent_db_id=parent.db_id )
55 | self.requests.append(r)
56 | elif key == "html":
57 | self.html = val
58 | elif key == "user":
59 | self.user_output.append(val)
60 |
61 |
62 |
63 | # @TODO handle cookies set by ajax (in probe too)
64 |
--------------------------------------------------------------------------------
/core/crawl/lib/shared.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 |
14 | class Shared:
15 | """
16 | data shared between threads
17 | """
18 |
19 | main_condition = None
20 | th_condition = None
21 |
22 | requests = []
23 | requests_index = 0
24 | crawl_results = []
25 |
26 | starturl = ""
27 | start_cookies = []
28 | allowed_domains = set()
29 | excluded_urls = set()
30 |
31 | options = {}
32 |
33 |
--------------------------------------------------------------------------------
/core/crawl/lib/urlfinder.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | import re
14 | from HTMLParser import HTMLParser
15 | from urlparse import urljoin, urlparse
16 |
17 |
18 | class UrlFinder:
19 | def __init__(self, html):
20 | self.html = html
21 |
22 | def get_urls(self):
23 |
24 | try:
25 | parser = UrlHTMLParser()
26 | parser.feed(self.html)
27 | except:
28 | raise
29 |
30 | return parser.urls
31 |
32 |
33 | class UrlHTMLParser(HTMLParser):
34 | def __init__(self):
35 |
36 | HTMLParser.__init__(self)
37 | self.base_url = ""
38 | self.urls = []
39 |
40 | def handle_starttag(self, tag, attrs):
41 | # more info about the tag: https://www.w3.org/wiki/HTML/Elements/base
42 | if tag == "base":
43 | for key, val in attrs:
44 | if key == "href":
45 | self.base_url = urlparse(val.strip()).geturl()
46 |
47 | elif tag == "a":
48 | for key, val in attrs:
49 | if key == "href":
50 | if re.match("^https?://", val, re.I):
51 | self.urls.extend([val])
52 | elif not re.match("^[a-z]+:", val, re.I) and not val.startswith("#"):
53 | self.urls.extend([urljoin(self.base_url, val)])
54 |
--------------------------------------------------------------------------------
/core/crawl/lib/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | HTCAP - beta 1
5 | Author: filippo.cavallarin@wearesegment.com
6 |
7 | This program is free software; you can redistribute it and/or modify it under
8 | the terms of the GNU General Public License as published by the Free Software
9 | Foundation; either version 2 of the License, or (at your option) any later
10 | version.
11 | """
12 |
13 | from urlparse import urljoin
14 | from core.lib.cookie import Cookie
15 | from core.lib.utils import *
16 | from shared import *
17 | import posixpath
18 | import json
19 | import re
20 |
21 |
22 |
23 | def request_in_scope(request):
24 | url = request.url
25 | purl = urlsplit(url)
26 | spurl = urlsplit(Shared.starturl)
27 | scope = Shared.options['scope']
28 | in_scope = False
29 |
30 | # check for scopes
31 | if scope == CRAWLSCOPE_DOMAIN:
32 | for pattern in Shared.allowed_domains:
33 | if re.match(pattern, purl.hostname):
34 | in_scope = True
35 | break
36 |
37 | elif scope == CRAWLSCOPE_DIRECTORY:
38 | if purl.hostname != spurl.hostname:
39 | in_scope = False
40 | else:
41 | path = [p for p in posixpath.dirname(purl.path).split("/") if p]
42 | spath = [p for p in posixpath.dirname(spurl.path).split("/") if p]
43 | in_scope = path[:len(spath)] == spath
44 |
45 | elif scope == CRAWLSCOPE_URL:
46 | in_scope = url == Shared.starturl
47 |
48 |
49 | # check for excluded urls
50 | for pattern in Shared.excluded_urls:
51 | if re.match(pattern, request.url):
52 | in_scope = False
53 | break
54 |
55 | return in_scope
56 |
57 |
58 |
59 | def adjust_requests(requests):
60 | """
61 | adjust an array of requests according to current status/settings
62 | 1. sets the out_of_scope property
63 | 2. normalizes urls according to user settings
64 | """
65 |
66 | for request in requests:
67 | if request.type == REQTYPE_UNKNOWN or not request_in_scope(request):
68 | request.out_of_scope = True
69 |
70 | if Shared.options['group_qs']:
71 | request.url = group_qs_params(request.url)
72 |
73 | return requests
74 |
75 |
76 | def request_depth(request):
77 | if request.parent == None:
78 | return 1
79 |
80 | return 1 + request_depth(request.parent)
81 |
82 |
83 |
84 | def request_post_depth(request):
85 | if request.method != "POST":
86 | return 0
87 |
88 | if request.parent == None or request.parent.method != "POST":
89 | return 1
90 |
91 | return 1 + request_post_depth(request.parent)
92 |
93 |
94 |
95 | def request_is_crawlable(request):
96 | if request.out_of_scope:
97 | return False
98 |
99 | types = [REQTYPE_LINK, REQTYPE_REDIRECT]
100 | if Shared.options['mode'] == CRAWLMODE_AGGRESSIVE and Shared.options['crawl_forms']:
101 | types.append(REQTYPE_FORM)
102 |
103 | return request.type in types and re.match("^https?://", request.url, re.I)
104 |
105 |
106 |
--------------------------------------------------------------------------------
/core/crawl/probe/analyze.js:
--------------------------------------------------------------------------------
1 | /*
2 | HTCAP - beta 1
3 | Author: filippo.cavallarin@wearesegment.com
4 |
5 | This program is free software; you can redistribute it and/or modify it under
6 | the terms of the GNU General Public License as published by the Free Software
7 | Foundation; either version 2 of the License, or (at your option) any later
8 | version.
9 | */
10 |
11 | var system = require('system');
12 | var fs = require('fs');
13 |
14 |
15 |
16 | phantom.injectJs("functions.js");
17 | phantom.injectJs("options.js");
18 | phantom.injectJs("probe.js");
19 |
20 |
21 | var startTime = Date.now();
22 |
23 |
24 | var site = "";
25 | var response = null;
26 | //var showHelp = false;
27 |
28 | var headers = {};
29 |
30 | var args = getopt(system.args,"hVaftUJdICc:MSEp:Tsx:A:r:mHX:PD:R:Oi:u:v");
31 |
32 | var page = require('webpage').create();
33 | var page_settings = {encoding: "utf8"};
34 | var random = "IsHOulDb34RaNd0MsTR1ngbUt1mN0t";
35 | //var injectScript = "{}";
36 | var US = null;
37 |
38 | var userInterface = {
39 | id: null,
40 | vars: {},
41 | pageEval: function(fnc){
42 | var sfnc = 'return (' + fnc.toString() + ').apply(null, arguments)';
43 | return page.evaluate(function(fnc){
44 | return (new Function('', fnc)).apply(null, window.__PROBE__.currentUserScriptParameters)
45 | }, sfnc);
46 | },
47 | render: function(file){
48 | try {
49 | page.render(file);
50 | return true;
51 | } catch(e){
52 | return false;
53 | }
54 | },
55 | print: function(str){
56 | console.log('["user",' + JSON.stringify(str) + '],');
57 | },
58 | fread: function(file){
59 | try{
60 | return "" + fs.read(file);
61 | } catch(e){
62 | return false;
63 | }
64 | },
65 | fwrite: function(file, content, mode){
66 | try {
67 | fs.write(file, content, mode || 'w');
68 | return true;
69 | } catch(e) {
70 | console.log(e)
71 | return false;
72 | }
73 | }
74 | }
75 |
76 | if(typeof args == 'string'){
77 | console.log("Error: " + args);
78 | phantom.exit(-1);
79 | }
80 |
81 | for(var a = 0; a < args.opts.length; a++){
82 | switch(args.opts[a][0]){
83 | case "h":
84 | usage();
85 | phantom.exit(1);
86 | break;
87 | case "P":
88 | page_settings.operation = "POST";
89 | break;
90 | case "D":
91 | page_settings.data = args.opts[a][1];
92 | break;
93 | case "R":
94 | random = args.opts[a][1];
95 | break;
96 | case "u":
97 | if(!phantom.injectJs(args.opts[a][1])){
98 | console.log("File not found: " + args.opts[a][1]);
99 | phantom.exit(0);
100 | }
101 | if(!window.US){
102 | phantom.exit(0);
103 | }
104 | break;
105 | case "v":
106 | phantom.exit(0);
107 | }
108 | }
109 |
110 |
111 | parseArgsToOptions(args);
112 | userInterface.id = options.id;
113 |
114 | site = args.args[1];
115 |
116 | if(!site){
117 | usage();
118 | phantom.exit(-1);
119 | }
120 |
121 | site = site.trim();
122 | if(site.length < 4 || site.substring(0,4).toLowerCase() != "http"){
123 | site = "http://" + site;
124 | }
125 |
126 | console.log("[");
127 |
128 | /* maximum execution time */
129 | setTimeout(execTimedOut,options.maxExecTime);
130 |
131 |
132 |
133 | phantom.onError = function(msg, trace) {
134 | var msgStack = ['PHANTOM ERROR: ' + msg];
135 | if (trace && trace.length) {
136 | msgStack.push('TRACE:');
137 | trace.forEach(function(t) {
138 | msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + (t.function ? ' (in function ' + t.function +')' : ''));
139 | });
140 | }
141 | console.error(msgStack.join('\n'));
142 | phantom.exit(1);
143 | };
144 |
145 |
146 |
147 | page.onConsoleMessage = function(msg, lineNum, sourceId) {
148 | if(options.verbose)
149 | console.log("console: " + msg);
150 | }
151 | page.onError = function(msg, lineNum, sourceId) {
152 | if(options.verbose)
153 | console.log("console error: on " + JSON.stringify(lineNum) + " " + msg);
154 | }
155 |
156 | page.onAlert = function(msg) {
157 | if(options.verbose)
158 | console.log('ALERT: ' + msg);
159 | };
160 |
161 | page.settings.userAgent = options.userAgent;
162 | page.settings.loadImages = options.loadImages;
163 |
164 |
165 |
166 | page.onResourceReceived = function(resource) {
167 | if(window.response == null){
168 | window.response = resource;
169 | // @TODO sanitize response.contentType
170 |
171 | }
172 | };
173 |
174 |
175 | page.onResourceRequested = function(requestData, networkRequest) {
176 | //console.log(JSON.stringify(requestData))
177 | };
178 |
179 | // to detect window.location= / document.location.href=
180 | page.onNavigationRequested = onNavigationRequested;
181 |
182 | page.onConfirm = function(msg) {return true;} // recently changed
183 |
184 | /* phantomjs issue #11684 workaround */
185 | var isPageInitialized = false;
186 | page.onInitialized = function(){
187 | if(isPageInitialized) return;
188 | isPageInitialized = true;
189 |
190 | // try to hide phantomjs
191 | page.evaluate(function(){
192 | window.__callPhantom = window.callPhantom;
193 | delete window.callPhantom;
194 | });
195 |
196 | startProbe(random/*, injectScript*/);
197 |
198 | };
199 |
200 |
201 | page.onCallback = function(data) {
202 | switch(data.cmd){
203 | case "triggerUserEvent":
204 | var ret = window.US[data.argument.name](window.userInterface)
205 | return ret;
206 | case "print":
207 | console.log(data.argument);
208 | break;
209 |
210 | case "end":
211 | if(options.returnHtml){
212 | page.evaluate(function(options){
213 | window.__PROBE__.printPageHTML();
214 | }, options);
215 | }
216 |
217 | page.evaluate(function(options){
218 | window.__PROBE__.triggerUserEvent("onEnd");
219 | });
220 |
221 | printStatus("ok", window.response.contentType);
222 | phantom.exit(0);
223 | break;
224 |
225 | }
226 |
227 | }
228 |
229 |
230 |
231 | if(options.httpAuth){
232 | headers['Authorization'] = 'Basic ' + btoa(options.httpAuth[0] + ":" + options.httpAuth[1]);
233 | }
234 |
235 | if(options.referer){
236 | headers['Referer'] = options.referer;
237 | }
238 |
239 | page.customHeaders = headers;
240 |
241 |
242 | for(var a = 0; a < options.setCookies.length; a++){
243 | // maybe this is wrong according to rfc .. but phantomjs cannot set a cookie without a domain...
244 | if(!options.setCookies[a].domain){
245 | var purl = document.createElement("a");
246 | purl.href=site;
247 | options.setCookies[a].domain = purl.hostname
248 | }
249 | if(options.setCookies[a].expires)
250 | options.setCookies[a].expires *= 1000;
251 |
252 | phantom.addCookie(options.setCookies[a]);
253 |
254 | }
255 |
256 | page.viewportSize = {
257 | width: 1920,
258 | height: 1080
259 | };
260 |
261 |
262 |
263 |
264 | page.open(site, page_settings, function(status) {
265 | var response = window.response; // just to be clear
266 | if (status !== 'success'){
267 | var mess = "";
268 | var out = {response: response};
269 | if(!response || response.headers.length == 0){
270 | printStatus("error", "load");
271 | phantom.exit(1);
272 | }
273 |
274 | // check for redirect first
275 | for(var a = 0; a < response.headers.length; a++){
276 | if(response.headers[a].name.toLowerCase() == 'location'){
277 |
278 | if(options.getCookies){
279 | printCookies(response.headers, site);
280 | }
281 | printStatus("ok", null, null, response.headers[a].value);
282 | phantom.exit(0);
283 | }
284 | }
285 |
286 | assertContentTypeHtml(response);
287 |
288 | phantom.exit(1);
289 | }
290 |
291 |
292 | if(options.getCookies){
293 | printCookies(response.headers, site);
294 | }
295 |
296 | assertContentTypeHtml(response);
297 |
298 | page.evaluate(function(){
299 |
300 | window.__PROBE__.waitAjax(function(ajaxTriggered){
301 | window.__PROBE__.triggerUserEvent("onStart");
302 | if(ajaxTriggered){
303 | window.__PROBE__.triggerUserEvent("onAllXhrsCompleted");
304 | }
305 | console.log("startAnalysis")
306 | window.__PROBE__.startAnalysis();
307 | });
308 | })
309 |
310 |
311 | });
312 |
313 |
314 |
315 |
--------------------------------------------------------------------------------
/core/crawl/probe/functions.js:
--------------------------------------------------------------------------------
1 | /*
2 | HTCAP - beta 1
3 | Author: filippo.cavallarin@wearesegment.com
4 |
5 | This program is free software; you can redistribute it and/or modify it under
6 | the terms of the GNU General Public License as published by the Free Software
7 | Foundation; either version 2 of the License, or (at your option) any later
8 | version.
9 | */
10 |
11 | // @todo report an error on unknown options
12 | function getopt(arguments, optstring){
13 | var args = arguments.slice();
14 | var ret = {
15 | opts: [],
16 | args: args
17 | };
18 |
19 | var m = optstring.match(/[a-zA-Z]\:*/g);
20 | for(var a = 0; a < m.length; a++){
21 | var ai = args.indexOf("-" + m[a][0]);
22 | if(ai > -1){
23 | if(m[a][1] == ":"){
24 | if(args[ai+1]){
25 | ret.opts.push([m[a][0], args[ai+1]]);
26 | args.splice(ai,2);
27 | } else {
28 | return "missing argumnet for option " + m[a][0];
29 | }
30 | } else {
31 | ret.opts.push([m[a][0]]);
32 | args.splice(ai,1);
33 | }
34 | }
35 | }
36 |
37 | return ret;
38 | }
39 |
40 |
41 | function removeHash(url){
42 | var anchor = document.createElement("a");
43 | anchor.href = url;
44 |
45 | return anchor.protocol + "//" + anchor.host + anchor.pathname + anchor.search;
46 | }
47 |
48 |
49 |
50 | function compareUrls(url1, url2, includeHash){
51 | var a1 = document.createElement("a");
52 | var a2 = document.createElement("a");
53 | a1.href = url1;
54 | a2.href = url2;
55 |
56 | var eq = (a1.protocol == a2.protocol && a1.host == a2.host && a1.pathname == a2.pathname && a1.search == a2.search);
57 |
58 | if(includeHash) eq = eq && a1.hash == a2.hash;
59 |
60 | return eq;
61 |
62 | }
63 |
64 |
65 | function printCookies(headers, site){
66 | var cookies = getCookies(headers, site);
67 | console.log('["cookies",' + JSON.stringify(cookies) + "],");
68 | }
69 |
70 |
71 | function printStatus(status, errcode, message, redirect){
72 | var o = {status:status};
73 | if(status == "error"){
74 | o.code = errcode;
75 | switch(errcode){
76 | case "load":
77 | break;
78 | case "contentType":
79 | o.message = message;
80 | break;
81 | case "requestTimeout":
82 | break;
83 | case "probe_timeout":
84 | break;
85 | }
86 | }
87 | if(redirect) o.redirect = redirect;
88 | o.time = Math.floor((Date.now() - window.startTime)/1000);
89 | console.log(JSON.stringify(o));
90 | console.log("]")
91 | }
92 |
93 |
94 |
95 | function execTimedOut(){
96 | if(!response || response.headers.length == 0){
97 | printStatus("error", "requestTimeout");
98 | phantom.exit(0);
99 | }
100 | printStatus("error", "probe_timeout");
101 | phantom.exit(0);
102 |
103 | }
104 |
105 |
106 |
107 | function usage(){
108 | var usage = "Usage: analyze.js [options] \n" +
109 | " -V verbose\n" +
110 | " -a don't check ajax\n" +
111 | " -f don't fill values\n" +
112 | " -t don't trigger events (onload only)\n" +
113 | " -s don't check websockets\n" +
114 | " -M dont' map events\n" +
115 | " -T don't trigger mapped events\n" +
116 | " -S don't check for \n"
216 | "\n"
217 | "%s\n"
218 | "